In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Data

In [2]:
DATA_PATH = ("../data/")

In [3]:
# Alternative way of downloading the dataset
#from keras.datasets import mnist
#(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [4]:
random_state = 42

In [5]:
file_list = os.listdir(DATA_PATH)

In [99]:
images_path = file_list[0]
labels_path = file_list[1]

In [100]:
images_path_full = os.path.join(DATA_PATH + images_path)
labels_path_full = os.path.join(DATA_PATH + labels_path)

In [101]:
X = pd.read_csv(images_path_full)
y = pd.read_csv(labels_path_full)

In [102]:
# Normalize data
X = X / 255

In [103]:
X.shape

(69999, 784)

In [104]:
y.shape

(69999, 1)

In [105]:
encoder = OneHotEncoder(sparse=False, categories='auto')
y = encoder.fit_transform(y)

In [106]:
y.shape

(69999, 10)

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

# Classifiers

### Feed Forward Neural Net

In [108]:
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam

In [109]:
X.shape

(69999, 784)

In [110]:
y.shape

(69999, 10)

In [47]:
model = Sequential()
model.add(Dense(256, input_dim=784, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [48]:
adam = Adam(lr=1e-3)

In [49]:
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [50]:
model.fit(X_train, y_train,validation_data=(X_test, y_test), epochs=12, batch_size=64)
scores = model.evaluate(X_test, y_test)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [51]:
print(scores)

[0.10872642755793363, 0.9750952380952381]


### CNN

In [85]:
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.datasets import mnist
from keras.utils import np_utils

In [88]:
(X_train_cnn, y_train_cnn), (X_test_cnn, y_test_cnn) = mnist.load_data()

In [89]:
# Reshape for CNN 
X_train_cnn = X_train_cnn.reshape(X_train_cnn.shape[0], 28, 28, 1).astype('float32')
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], 28, 28, 1).astype('float32')

In [90]:
np.random.seed(random_state)

In [91]:
# Normalize inputs from 0-255 to 0-1
X_train_cnn = X_train_cnn / 255
X_test_cnn = X_test_cnn / 255
# one hot encode outputs
y_train_cnn = np_utils.to_categorical(y_train_cnn)
y_test_cnn = np_utils.to_categorical(y_test_cnn)

In [111]:
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.4))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [112]:
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [113]:
model.fit(X_train_cnn, y_train_cnn, validation_data=(X_test_cnn, y_test_cnn), epochs=4, batch_size=200)
scores = model.evaluate(X_test_cnn, y_test_cnn)

Train on 60000 samples, validate on 10000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [114]:
print(scores)

[0.029671441762306495, 0.9902]


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
random_f_clf = RandomForestClassifier(n_estimators=100,
                                     n_jobs = 2,
                                     random_state=random_state)
random_f_clf.fit(X_train, y_train)
y_pred = random_f_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8909523809523809


### XGBoost