In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Data

In [2]:
DATA_PATH = ("../data/")

In [3]:
# Alternative way of downloading the dataset
#from keras.datasets import mnist
#(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [45]:
# Reproducible results
random_state = 42

In [46]:
# Reshape does not currently work
def get_data(one_hot_enc=True, reshape=False):
    file_list = os.listdir(DATA_PATH)
    images_path = file_list[0]
    labels_path = file_list[1]
    images_path_full = os.path.join(DATA_PATH + images_path)
    labels_path_full = os.path.join(DATA_PATH + labels_path)
    X = pd.read_csv(images_path_full)
    y = pd.read_csv(labels_path_full)
    # Normalize data
    X = X / 255
    # Encode labels
    if reshape:
        X = X.reshape(X.shape[0], 28, 28, 1)
        y = y.reshape(X.shape[0], 28, 28, 1)
    if one_hot_enc:
        encoder = OneHotEncoder(sparse=False, categories='auto')
        y = encoder.fit_transform(y)
    # Divide into train and test set 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [47]:
# Divide into train and test set
X_train, X_test, y_train, y_test = get_data()

In [48]:
# Divide further into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=random_state)

In [49]:
print(X_train.shape)
print(y_train.shape)

(39199, 784)
(39199, 10)


In [50]:
print(X_val.shape)
print(y_val.shape)

(16800, 784)
(16800, 10)


In [51]:
print(X_test.shape)
print(y_test.shape)

(14000, 784)
(14000, 10)


# Classifiers

### Feed Forward Neural Net

In [52]:
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras import regularizers

In [53]:
regularizer_l2 = regularizers.l2(0.01)

In [54]:
model = Sequential()
model.add(Dense(256, input_dim=784, activation='relu', kernel_regularizer=regularizer_l2))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu', kernel_regularizer=regularizer_l2))
model.add(Dense(10, activation='softmax'))

In [55]:
adam = Adam(lr=1e-3)

In [56]:
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [57]:
model.fit(X_train, y_train,validation_data=(X_val, y_val), epochs=12, batch_size=64)
scores = model.evaluate(X_test, y_test)

Train on 39199 samples, validate on 16800 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [58]:
print(scores)

[0.2572172294003623, 0.9576428571428571]


> Can see that the model is somewhat overfitting - Will try to implement *Regularizer*
<br>
This resulted in a loss in accuracy, but there is a trade off between accuracy and overfitting

### CNN

In [59]:
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.datasets import mnist
from keras.utils import np_utils

In [62]:
(X_train_cnn, y_train_cnn), (X_test_cnn, y_test_cnn) = mnist.load_data()

In [73]:
# Currently not working
#X_train_cnn, y_train_cnn, X_test_cnn, y_test_cnn = get_data(one_hot_enc=False, reshape=True)

In [63]:
# Reshape for CNN 
X_train_cnn = X_train_cnn.reshape(X_train_cnn.shape[0], 28, 28, 1).astype('float32')
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], 28, 28, 1).astype('float32')

In [64]:
# Reproducible results
np.random.seed(random_state)

In [65]:
# Normalize inputs from 0-255 to 0-1
X_train_cnn = X_train_cnn / 255
X_test_cnn = X_test_cnn / 255
# one hot encode outputs
y_train_cnn = np_utils.to_categorical(y_train_cnn)
y_test_cnn = np_utils.to_categorical(y_test_cnn)

In [66]:
X_train_cnn.shape

(60000, 28, 28, 1)

In [67]:
X_test_cnn.shape

(10000, 28, 28, 1)

In [68]:
X_train_cnn, X_val_cnn, y_train_cnn, y_val_cnn = train_test_split(X_train_cnn, y_train_cnn, test_size=0.3, random_state=random_state)

In [69]:
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.4))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [70]:
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [71]:
model.fit(X_train_cnn, y_train_cnn, validation_data=(X_val_cnn, y_val_cnn), epochs=4, batch_size=200)
scores = model.evaluate(X_test_cnn, y_test_cnn)

Train on 42000 samples, validate on 18000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [72]:
print(scores)

[0.037081126303761265, 0.9874]


### Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [31]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = get_data()

In [32]:
print(X_train_rf.shape)
print(X_test_rf.shape)

(55999, 784)
(14000, 784)


In [33]:
print(y_train_rf.shape)
print(y_test_rf.shape)

(55999, 10)
(14000, 10)


In [34]:
random_f_clf = RandomForestClassifier(n_estimators=50,
                                     n_jobs = 2,
                                     random_state=random_state)
random_f_clf.fit(X_train_rf, y_train_rf)
y_pred_rf = random_f_clf.predict(X_test_rf)
print(accuracy_score(y_test_rf, y_pred_rf))

0.8957857142857143


### XGBoost

In [35]:
# If needing to install xgboost (using conda)
# ! conda install py-xgboost

In [74]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [75]:
# Divide into train and test set 
X_train_xg, X_test_xg, y_train_xg, y_test_xg = get_data(one_hot_enc=False)

In [76]:
params = {
        'gamma': [0.5, 2],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 1.0],
        'max_depth': [3, 5],
        'n_estimators': [10]
        }

Fun to try with more parameters, but my poor CPU cannot take any more..

## Disclaimer: This takes a lot of time

In [79]:
%%capture 
xgb_clf = XGBClassifier()
rs = GridSearchCV(xgb_clf,
                  params,
                  cv=2,
                  scoring="accuracy",
                  n_jobs=1,
                  verbose=2)

rs.fit(X_train_xg, y_train_xg.values.ravel())

In [80]:
optim_est_xg = rs.best_estimator_
print(optim_est_xg)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.5, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=10, n_jobs=1, nthread=None, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1.0)


In [81]:
y_pred_xg = optim_est_xg.predict(X_test_xg)
print(accuracy_score(y_test_xg, y_pred_xg))

0.9046428571428572
