In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## **Data Preprocessing**

In [None]:
# Load Data: read the competition's training and test CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/digit-recognizer/train.csv',
        '/kaggle/input/digit-recognizer/test.csv',
    )
)

# Preview the first rows of the training frame.
train_df.head()

In [None]:
# `keras.utils.np_utils` is not available in recent Keras releases;
# `to_categorical` is exported directly from `keras.utils`.
from keras.utils import to_categorical

# Separate the pixel columns (features) from the digit label (target).
X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']

# Reshape every flat row into a 28x28 image with a single channel.
X_train = X_train.values.reshape(-1, 28, 28, 1)
X_test = test_df.values.reshape(-1, 28, 28, 1)

# One-hot encode the labels into 10 classes.
y_train = to_categorical(y_train, num_classes=10)

In [None]:
# view image: show the first 40 training samples on a 4-row by 10-column grid

n_cols, n_rows = 10, 4
plt.figure(figsize=(12, 10))
for position in range(1, 41):
    sample = X_train[position - 1]
    plt.subplot(n_rows, n_cols, position)
    plt.imshow(sample.reshape((28, 28)), interpolation='nearest')
plt.show()

In [None]:
# label distribution: number of training samples per digit, ordered by digit

label_counts = train_df['label'].value_counts().sort_index()
label_counts.plot.bar(figsize=(10,6), rot=0)

In [None]:
# normalizing data: divide pixel values by 255.0 (presumably the 8-bit
# intensity range) so both image arrays share the same scale

X_train = X_train / 255.0
# Scale the test images themselves; dividing y_train here would overwrite
# the test set with scaled labels.
X_test = X_test / 255.0

In [None]:
# Display the label array (one-hot encoded by to_categorical in the earlier cell).
y_train

## **Decision Tree**

In [None]:
# Rebuild flat (un-reshaped) features and labels for the tree-based models.
X_train = train_df.drop(['label'], axis=1)
y_train = train_df['label']

# Scale the pixel values of both the training and the test features.
X_train = X_train / 255.0
# The test features come from test_df; dividing y_train here would store
# scaled labels instead of test pixels.
X_test = test_df / 255.0

# The labels stay as integer digits: scikit-learn classifiers handle
# multi-class targets directly, while a one-hot matrix would turn this into a
# multi-label problem (predictions with zero or several active classes).

In [None]:
# train test split

from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for validation. A fixed random_state makes the
# split (and every score computed from it) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

In [None]:
# train model

from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited tree; the fitted estimator is shown as the cell output.
dt_clf = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)
dt_clf

In [None]:
# evaluate model

from sklearn.metrics import accuracy_score

# Predict on the training data and on the held-out validation split.
train_pred = dt_clf.predict(X_train)
val_pred = dt_clf.predict(X_val)

train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)

# Report both accuracies rounded to two decimals.
for caption, score in (
    ('Training Accuracy for DecisionTree:', train_acc),
    ('Validation Accuracy for DecisionTree:', val_acc),
):
    print(caption, np.round(score, 2))

In [None]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

# Candidate tree depths, compared by 5-fold cross-validated accuracy.
params = dict(max_depth=[6, 8, 10, 12, 16, 20])

grid_cv = GridSearchCV(
    dt_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

best_cv_score = grid_cv.best_score_
best_cv_params = grid_cv.best_params_
print('Highest GridSeachCV average accuracy:', best_cv_score)
print('Optimized parameter:', best_cv_params)

In [None]:
# predict with best parameter

# Use the estimator refitted by the grid search; predicting with dt_clf would
# score the original max_depth=10 tree instead of the tuned one.
dt_clf_best = grid_cv.best_estimator_
pred = dt_clf_best.predict(X_val)


print('Validation Accuracy for best DecisionTree:', np.round(accuracy_score(y_val, pred), 2))

## **Random Forest**

In [None]:
# train model

from sklearn.ensemble import RandomForestClassifier

# Fit a forest with default hyperparameters; the fitted estimator is shown as
# the cell output.
rf_clf = RandomForestClassifier().fit(X_train, y_train)
rf_clf

In [None]:
# evaluate model

# Score the random forest on the training data and on the validation split.
train_pred = rf_clf.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)

val_pred = rf_clf.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)

# These scores belong to the random forest, so the labels name that model.
print('Training Accuracy for RandomForest:', np.round(train_acc, 2))
print('Validation Accuracy for RandomForest:', np.round(val_acc, 2))

In [None]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

# Each hyperparameter needs its own key: a repeated 'min_samples_leaf' key
# silently replaces the earlier list, so the second entry is searched as
# 'min_samples_split'.
params = {
    'n_estimators':[100],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20]
}

# 5-fold cross-validated accuracy over the full parameter grid.
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)

print('Highest GridSeachCV average accuracy:', grid_cv.best_score_)
print('Optimized parameter:', grid_cv.best_params_)

In [None]:
# predict with best parameter

# Use the estimator refitted by the grid search; predicting with rf_clf would
# score the untuned default forest instead.
rf_clf_best = grid_cv.best_estimator_
pred = rf_clf_best.predict(X_val)


print('Validation Accuracy for best RandomForest:', np.round(accuracy_score(y_val, pred), 2))

## **PCA**

In [None]:
# Start again from the raw training frame: the digit column is the target and
# every remaining column is a feature.
y_train = train_df['label']
X_train = train_df.drop(columns=['label'])

In [None]:
# Standardize data for applying PCA

from sklearn.preprocessing import StandardScaler

# Keep the fitted scaler so the exact same transformation can later be applied
# to any other data (e.g. the test set) before it goes through PCA.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(X_train)

In [None]:
from sklearn.decomposition import PCA

# Fit a 150-component PCA on the standardized features.
pca = PCA(n_components=150).fit(standardized_features)

# Total share of variance explained by the retained components.
explained_total = sum(pca.explained_variance_ratio_)
print(explained_total)

In [None]:
# Project the standardized features onto the fitted principal components and
# show the shape of the result.
X_transformed = pca.transform(standardized_features)
np.shape(X_transformed)

In [None]:
# train and evaluate DecisionTree with GridSearchCV

from sklearn.model_selection import train_test_split

# Hold out 25% of the PCA-transformed samples for validation. A fixed
# random_state makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_transformed, y_train, test_size=0.25, random_state=42
)

dt_clf = DecisionTreeClassifier()

# Candidate tree depths, compared by 5-fold cross-validated accuracy.
params = {
    'max_depth' : [6, 8, 10, 12, 16, 20]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)

print('Highest GridSeachCV average accuracy:', grid_cv.best_score_)
print('Optimized parameter:', grid_cv.best_params_)

In [None]:
# train and evaluate RandomForest with GridSearchCV

from sklearn.model_selection import GridSearchCV

rf_clf = RandomForestClassifier()

# Each hyperparameter needs its own key: a repeated 'min_samples_leaf' key
# silently replaces the earlier list, so the second entry is searched as
# 'min_samples_split'.
params = {
    'n_estimators':[100],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20]
}

# 5-fold cross-validated accuracy over the full parameter grid.
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)

print('Highest GridSeachCV average accuracy:', grid_cv.best_score_)
print('Optimized parameter:', grid_cv.best_params_)

In [None]:
# Score the tuned random forest (refitted by the grid search) on the
# validation split.
rf_clf_best = grid_cv.best_estimator_
pred = rf_clf_best.predict(X_val)


# The evaluated model is the random forest, so the label names that model.
print('Validation Accuracy for best RandomForest:', np.round(accuracy_score(y_val, pred), 2))

**RandomForest accuracy increased through PCA!    
I'll keep searching for a better PCA method to increase the classification accuracy!🔥🔥**

## **Submission**

In [None]:
# The PCA was fitted on standardized training features, so the test pixels
# must be standardized with the training-set statistics before projection.
# Refitting the scaler on the full training features reproduces those
# statistics exactly.
train_features = train_df.drop(['label'], axis=1)
test_scaler = StandardScaler().fit(train_features)
test_standardized = test_scaler.transform(test_df)
test_transformed = pca.transform(test_standardized)

# Predict digits with the tuned random forest and fill the submission template.
pred = rf_clf_best.predict(test_transformed)
submit_df = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submit_df['Label'] = pred
# index=False keeps the DataFrame row index out of the file, so only the
# template's own columns are written.
submit_df.to_csv('submission.csv', index=False)