In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [82]:
""" Load data """
data = pd.read_csv('train.csv')

""" Sample data """
for i in range(10):
    idx_by_label = np.array(data.index[data['label']==i].tolist())
    idx_by_label_sample = np.random.choice(idx_by_label, size=1000, replace=False)
    if i == 0:
        idx_all = idx_by_label_sample
    else:
        idx_all = np.append(idx_all, idx_by_label_sample)

data_sample = data.iloc[np.sort(idx_all)]

""" Divide data to features and labels """
data_features = data_sample.iloc[:, 1:]
data_labels = data_sample.iloc[:, 0]

""" Convert Pandas to Array """
df_array = data_features.values
dl_array = data_labels.values

In [83]:
""" Task 1 """
""" Perform Decision Tree (DT) """
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(df_array, dl_array)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [86]:
""" Perform Multi-Layer Perceptron (MLP) """
mlp_clf = MLPClassifier()
mlp_clf.fit(df_array, dl_array)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [87]:
""" Task 2: Calculate the accuracy score """
def mapLabels(clust,trainLabel):
    labels = np.zeros_like(clust)
    for i in range(10):
        mask = (clust == i)
        labels[mask] = mode(trainLabel[mask])[0]
    return labels

dt_prediction = clf.predict(df_array)
mlp_prediction = mlp_clf.predict(df_array)

dt_label = mapLabels(dt_prediction, dl_array)
mlp_label = mapLabels(mlp_prediction, dl_array)

dt_acc_score = accuracy_score(dl_array, dt_label)
mlp_acc_score = accuracy_score(dl_array, mlp_label)

print('Decision Tree Accuracy Score is {}'.format(dt_acc_score))
print('Multi-Layer Perceptron Accuracy Score is {}'.format(mlp_acc_score))

Decision Tree Accuracy Score is 0.8962
Multi-Layer Perceptron Accuracy Score is 0.9912


In [75]:
""" Task 3: Perform cross-validation using DT and MLP methods """
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

#dt_cv_score = cross_val_score(clf, data_features, data_labels, cv=10)
#for i in range(len(dt_cv_score)):
#    print('[DT] CV Score of {} : {}'.format(i, dt_cv_score[i]))
#mlp_cv_score = cross_val_score(mlp_clf, data_features, data_labels, cv=10)
#for i in range(len(mlp_cv_score)):
#    print('[MLP] CV Score of {} : {}'.format(i, mlp_cv_score[i]))

dt_kf_two = KFold(n_splits=2)
dt_kf_two.get_n_splits(df_array)

for train_index, test_index in dt_kf_two.split(df_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = [df_array[ii] for ii in train_index], [df_array[ii] for ii in test_index]
    y_train, y_test = [dl_array[ii] for ii in train_index], [dl_array[ii] for ii in test_index]
    
    clf = tree.DecisionTreeClassifier() 
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    dt_cv_score = cross_val_score(clf, X_train, y_train, cv=10)
    for i in range(len(dt_cv_score)):
        print('[DT] CV Score of #{} : {}'.format(i, round(dt_cv_score[i],3)))
    print('[DT] CV Score Average: {}'.format(round(dt_cv_score.mean(),3)))    

TRAIN: [5000 5001 5002 ... 9997 9998 9999] TEST: [   0    1    2 ... 4997 4998 4999]
[DT] CV Score of #0 : 0.752
[DT] CV Score of #1 : 0.805
[DT] CV Score of #2 : 0.753
[DT] CV Score of #3 : 0.748
[DT] CV Score of #4 : 0.778
[DT] CV Score of #5 : 0.752
[DT] CV Score of #6 : 0.761
[DT] CV Score of #7 : 0.763
[DT] CV Score of #8 : 0.754
[DT] CV Score of #9 : 0.764
[DT] CV Score Average: 0.763
TRAIN: [   0    1    2 ... 4997 4998 4999] TEST: [5000 5001 5002 ... 9997 9998 9999]
[DT] CV Score of #0 : 0.764
[DT] CV Score of #1 : 0.77
[DT] CV Score of #2 : 0.771
[DT] CV Score of #3 : 0.753
[DT] CV Score of #4 : 0.747
[DT] CV Score of #5 : 0.725
[DT] CV Score of #6 : 0.744
[DT] CV Score of #7 : 0.781
[DT] CV Score of #8 : 0.771
[DT] CV Score of #9 : 0.766
[DT] CV Score Average: 0.759


In [78]:
dt_kf_four = KFold(n_splits=4)
dt_kf_four.get_n_splits(df_array)

for train_index, test_index in dt_kf_four.split(df_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = [df_array[ii] for ii in train_index], [df_array[ii] for ii in test_index]
    y_train, y_test = [dl_array[ii] for ii in train_index], [dl_array[ii] for ii in test_index]
    
    clf = tree.DecisionTreeClassifier() 
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    dt_cv_score = cross_val_score(clf, X_train, y_train, cv=10)
    for i in range(len(dt_cv_score)):
        print('[DT] CV Score #{} : {}'.format(i, round(dt_cv_score[i],3)))
    print('[DT] CV Score Average: {}'.format(round(dt_cv_score.mean(),3)))

TRAIN: [2500 2501 2502 ... 9997 9998 9999] TEST: [   0    1    2 ... 2497 2498 2499]
[DT] CV Score #0 : 0.784
[DT] CV Score #1 : 0.809
[DT] CV Score #2 : 0.794
[DT] CV Score #3 : 0.769
[DT] CV Score #4 : 0.776
[DT] CV Score #5 : 0.78
[DT] CV Score #6 : 0.776
[DT] CV Score #7 : 0.799
[DT] CV Score #8 : 0.795
[DT] CV Score #9 : 0.779
[DT] CV Score Average: 0.786
TRAIN: [   0    1    2 ... 9997 9998 9999] TEST: [2500 2501 2502 ... 4997 4998 4999]
[DT] CV Score #0 : 0.801
[DT] CV Score #1 : 0.797
[DT] CV Score #2 : 0.799
[DT] CV Score #3 : 0.81
[DT] CV Score #4 : 0.785
[DT] CV Score #5 : 0.785
[DT] CV Score #6 : 0.777
[DT] CV Score #7 : 0.778
[DT] CV Score #8 : 0.79
[DT] CV Score #9 : 0.78
[DT] CV Score Average: 0.79
TRAIN: [   0    1    2 ... 9997 9998 9999] TEST: [5000 5001 5002 ... 7497 7498 7499]
[DT] CV Score #0 : 0.75
[DT] CV Score #1 : 0.759
[DT] CV Score #2 : 0.76
[DT] CV Score #3 : 0.771
[DT] CV Score #4 : 0.806
[DT] CV Score #5 : 0.813
[DT] CV Score #6 : 0.798
[DT] CV Score #7 : 

In [81]:
mlp_kf_two = KFold(n_splits=2)
mlp_kf_two.get_n_splits(df_array)

for train_index, test_index in mlp_kf_two.split(df_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = [df_array[ii] for ii in train_index], [df_array[ii] for ii in test_index]
    y_train, y_test = [dl_array[ii] for ii in train_index], [dl_array[ii] for ii in test_index]
    
    clf = MLPClassifier()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    mlp_cv_score = cross_val_score(clf, X_train, y_train, cv=10)
    for i in range(len(mlp_cv_score)):
        print('[MLP] CV Score #{} : {}'.format(i, round(mlp_cv_score[i],3)))
    print('[MLP] CV Score Average: {}'.format(round(mlp_cv_score.mean(),3)))

TRAIN: [5000 5001 5002 ... 9997 9998 9999] TEST: [   0    1    2 ... 4997 4998 4999]
[MLP] CV Score #0 : 0.859
[MLP] CV Score #1 : 0.883
[MLP] CV Score #2 : 0.881
[MLP] CV Score #3 : 0.865
[MLP] CV Score #4 : 0.88
[MLP] CV Score #5 : 0.892
[MLP] CV Score #6 : 0.863
[MLP] CV Score #7 : 0.885
[MLP] CV Score #8 : 0.891
[MLP] CV Score #9 : 0.881
[MLP] CV Score Average: 0.878
TRAIN: [   0    1    2 ... 4997 4998 4999] TEST: [5000 5001 5002 ... 9997 9998 9999]
[MLP] CV Score #0 : 0.859
[MLP] CV Score #1 : 0.903
[MLP] CV Score #2 : 0.877
[MLP] CV Score #3 : 0.861
[MLP] CV Score #4 : 0.896
[MLP] CV Score #5 : 0.876
[MLP] CV Score #6 : 0.877
[MLP] CV Score #7 : 0.887
[MLP] CV Score #8 : 0.891
[MLP] CV Score #9 : 0.903
[MLP] CV Score Average: 0.883


In [89]:
mlp_kf_four = KFold(n_splits=4)
mlp_kf_four.get_n_splits(df_array)

for train_index, test_index in mlp_kf_four.split(df_array):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = [df_array[ii] for ii in train_index], [df_array[ii] for ii in test_index]
    y_train, y_test = [dl_array[ii] for ii in train_index], [dl_array[ii] for ii in test_index]
    
    clf = MLPClassifier()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    mlp_cv_score = cross_val_score(clf, X_train, y_train, cv=10)
    for i in range(len(mlp_cv_score)):
        print('[MLP] CV Score #{} : {}'.format(i, round(mlp_cv_score[i],3)))
    print('[MLP] CV Score Average: {}'.format(round(mlp_cv_score.mean(),3)))

TRAIN: [2500 2501 2502 ... 9997 9998 9999] TEST: [   0    1    2 ... 2497 2498 2499]
[MLP] CV Score #0 : 0.893
[MLP] CV Score #1 : 0.903
[MLP] CV Score #2 : 0.903
[MLP] CV Score #3 : 0.914
[MLP] CV Score #4 : 0.891
[MLP] CV Score #5 : 0.893
[MLP] CV Score #6 : 0.904
[MLP] CV Score #7 : 0.899
[MLP] CV Score #8 : 0.912
[MLP] CV Score #9 : 0.914
[MLP] CV Score Average: 0.903
TRAIN: [   0    1    2 ... 9997 9998 9999] TEST: [2500 2501 2502 ... 4997 4998 4999]
[MLP] CV Score #0 : 0.89
[MLP] CV Score #1 : 0.907
[MLP] CV Score #2 : 0.899
[MLP] CV Score #3 : 0.911
[MLP] CV Score #4 : 0.916
[MLP] CV Score #5 : 0.908
[MLP] CV Score #6 : 0.873
[MLP] CV Score #7 : 0.904
[MLP] CV Score #8 : 0.901
[MLP] CV Score #9 : 0.898
[MLP] CV Score Average: 0.901
TRAIN: [   0    1    2 ... 9997 9998 9999] TEST: [5000 5001 5002 ... 7497 7498 7499]
[MLP] CV Score #0 : 0.907
[MLP] CV Score #1 : 0.92
[MLP] CV Score #2 : 0.915
[MLP] CV Score #3 : 0.903
[MLP] CV Score #4 : 0.896
[MLP] CV Score #5 : 0.902
[MLP] CV Sc