In [2]:
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

In [3]:
column_names = np.zeros(785, dtype=object)
column_names[0] = 'label'

for i in range(1, 785):
    column_names[i] = 'pixel' + str(i)
df_train = pd.read_csv('mnist_train.csv', header=None, names=column_names)
df_test = pd.read_csv('mnist_test.csv', header=None, names=column_names)

df_train.head(10)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X_train = df_train.drop('label', axis=1)
y_train = df_train['label']
X_test = df_test.drop('label', axis=1)
y_test = df_test['label']

In [5]:
# Normalization
X_train = X_train / 255
X_test = X_test / 255

## Standard default parameters 

In [5]:
start = time.perf_counter()
dt = DecisionTreeClassifier(
    max_depth=6,
    criterion='gini'
)
dt.fit(X_train, y_train)
accuracy = dt.score(X_test, y_test)
end = time.perf_counter()
print('Decision Tree time: ', end - start)
print('Decision Tree accuracy: ', accuracy)

Decision Tree time:  3.4780856130000757
Decision Tree accuracy:  0.7415


In [12]:
start = time.perf_counter()
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    criterion='gini')
rf.fit(X_train, y_train)
accuracy = rf.score(X_test, y_test)
end = time.perf_counter()
print('Random Forest time: ', end - start)
print('Random Forest accuracy: ', accuracy)

Random Forest time:  18.0830515580019
Random Forest accuracy:  0.8978


In [9]:
start = time.perf_counter()
et = ExtraTreesClassifier(
    n_estimators=200,
    max_depth=6,
    criterion='gini')
et.fit(X_train, y_train)
accuracy = et.score(X_test, y_test)
end = time.perf_counter()
print('Extra Trees time: ', end - start)
print('Extra Trees accuracy: ', accuracy)

Extra Trees time:  11.875051544000598
Extra Trees accuracy:  0.8776


In [10]:
start = time.perf_counter()
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_test, y_test)
end = time.perf_counter()
print('XGBoost time: ', end - start)
print('XGBoost accuracy: ', accuracy)

XGBoost time:  145.97236217700265
XGBoost accuracy:  0.9804


# Grid Search
## <font color="red">WARNING</font>
This takes very long time to run.

In [9]:
# Decision Tree
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': [i for i in range(6, 20)]
              }
dt = DecisionTreeClassifier()
grid_search = GridSearchCV(dt, param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'entropy', 'max_depth': 13}
0.8738666666666667


In [10]:
# Random Forest
param_grid = {'n_estimators': [10, *[i for i in range(50, 250, 50)]],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': [i for i in range(6, 20)]
              }
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'criterion': 'entropy', 'max_depth': 18, 'n_estimators': 150}
0.9669666666666666


In [11]:
# Extra Trees
param_grid = {'n_estimators': [10, *[i for i in range(50, 250, 50)]],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': [i for i in range(6, 20)]
              }
et = ExtraTreesClassifier()
grid_search = GridSearchCV(et, param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)



{'criterion': 'log_loss', 'max_depth': 19, 'n_estimators': 200}
0.9695666666666668
