In [1]:
# Random-forest experiments on the MNIST digit data (train.csv / test.csv)
#from keras.datasets import mnist
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load the labelled training table as a plain array. Per the slicing below,
# column 0 is used as the label and every remaining column as a feature.
data = pd.read_csv('train.csv').values
X, Y = data[:, 1:], data[:, 0]

In [2]:
def getData(X, Y, folds, normalize):
    """Split ``X``/``Y`` into K-fold train and test partitions.

    Args:
        X: 2-D feature array, one sample per row.
        Y: label array aligned with the rows of ``X``.
        folds: number of folds handed to ``KFold(n_splits=...)``.
        normalize: when truthy, divide ``X`` by its global maximum before
            splitting.

    Returns:
        ``(train_data, test_data)``: two lists with one ``(X_part, Y_part)``
        tuple per fold, in the order produced by ``KFold.split``.
    """
    if normalize:
        X = X / X.max()

    # The original also called kf.get_n_splits(X) and discarded the result;
    # that call had no effect and has been dropped.
    kf = KFold(n_splits=folds)

    # Kept on purpose: the notebook output shows the splitter configuration.
    print(kf)

    train_data = []
    test_data = []
    for train_index, test_index in kf.split(X):
        train_data.append((X[train_index], Y[train_index]))
        test_data.append((X[test_index], Y[test_index]))

    return train_data, test_data

def getTrainingData(X, Y, train_ratio, normalize=True):
    """Return the leading ``train_ratio`` fraction of the samples.

    Args:
        X: 2-D feature array, one sample per row.
        Y: label array aligned with the rows of ``X``.
        train_ratio: fraction of rows to keep; the resulting row count is
            truncated to an int.
        normalize: when truthy, divide ``X`` by its global maximum (computed
            over the whole array, not just the returned slice).

    Returns:
        ``(X_train, Y_train)``: the first ``int(len(X) * train_ratio)`` rows.
    """
    # Size the split from the array that was passed in rather than from the
    # notebook-level ``data`` global, so the function works on any input.
    training_size = int(len(X) * train_ratio)

    if normalize:
        X = X / X.max()

    return X[:training_size, :], Y[:training_size]

def getTestingData(X, Y, train_ratio, test_size, test_remaining=False, normalize=True):
    """Return test samples taken from just after the training portion.

    Args:
        X: 2-D feature array, one sample per row.
        Y: label array aligned with the rows of ``X``.
        train_ratio: fraction of rows reserved for training; the test slice
            starts at row ``int(len(X) * train_ratio)``.
        test_size: number of rows to return when ``test_remaining`` is falsy.
        test_remaining: when truthy, return every row after the training
            portion and ignore ``test_size``.
        normalize: when truthy, divide ``X`` by its global maximum (computed
            over the whole array, not just the returned slice).

    Returns:
        ``(X_test, Y_test)`` for the selected rows.
    """
    # Size the split from the array that was passed in rather than from the
    # notebook-level ``data`` global, so the function works on any input.
    training_size = int(len(X) * train_ratio)

    if normalize:
        X = X / X.max()

    # A stop of None slices through to the end of the array.
    stop = None if test_remaining else training_size + test_size

    return X[training_size:stop, :], Y[training_size:stop]

In [3]:
# Display names, in the same order as the raw/normalized entries of train_sets.
set_names = ["Non-normalized Data", "Normalized Data"]

# Build the 5-fold splits twice: once on the raw values and once with the
# normalize flag set. Each getData call also prints its KFold splitter.
rawTrainSets, rawTestSets = getData(X, Y, folds=5, normalize=False)
normTrainSets, normTestSets = getData(X, Y, folds=5, normalize=True)

train_sets = [rawTrainSets, normTrainSets]

KFold(n_splits=5, random_state=None, shuffle=False)
KFold(n_splits=5, random_state=None, shuffle=False)


In [4]:
# Hyper-parameter grid for the forests: every combination of tree count,
# split criterion and bootstrap flag. bootstrap varies fastest, then
# criterion, then n_estimators, giving 12 configurations in total.
params = [{'n_estimators': n_trees, 'criterion': criterion, 'bootstrap': bootstrap}
          for n_trees in (1, 10, 100)
          for criterion in ('gini', 'entropy')
          for bootstrap in (False, True)]
          
# Line styles for plotting: four colour/linestyle pairs, each repeated three
# times in a row (12 entries in total).
# NOTE(review): nothing in the visible cells uses plot_args, and its groups of
# three do not line up with the groups of four in params - confirm the intent.
plot_args = [{'c': colour, 'linestyle': linestyle}
             for colour, linestyle in (('red', '-'), ('green', '--'),
                                       ('blue', '-'), ('yellow', '--'))
             for _ in range(3)]


In [5]:
# Parameter sweep: fit one forest per entry of params, first on the raw data
# and then on the normalized data. rfs is therefore ordered by data set and,
# within each data set, in the same order as params.
rfs = []
for train_set, set_name in zip(train_sets, set_names):
    # Only the split stored at index 4 of each set is used for the sweep.
    X_train, y_train = train_set[4]
    print("Training models using %s " % set_name)
    for param in params:
        print("Training using: ", param)
        # fit() hands back the fitted estimator, so it can be stored directly.
        rfs.append(RandomForestClassifier(n_jobs=-1, **param).fit(X_train, y_train))
        print("Done training!")
print("Done all training!")

Training models using Non-normalized Data 
Training using:  {'n_estimators': 1, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 100, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 100, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training usi

In [98]:
# Score every model from the parameter sweep on the held-out part of the same
# split (index 4) it was trained on.
test_sets = [rawTestSets, normTestSets]

for set_idx, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    X_test, y_test = test_set[4]
    print("Testing models using %s " % set_name)
    # rfs holds 12 models per data set, laid out in the same order as params.
    for model_idx in range(12*set_idx, 12*(set_idx+1)):
        description = str(params[model_idx%12])
        accuracy = str(rfs[model_idx].score(X_test, y_test))
        print("%-70s \t %s\n" % (description, accuracy), end='')
        

Testing models using Non-normalized Data 
{'n_estimators': 1, 'criterion': 'gini', 'bootstrap': False}           	 0.8166666666666667
{'n_estimators': 1, 'criterion': 'gini', 'bootstrap': True}            	 0.7858333333333334
{'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': False}        	 0.8167857142857143
{'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': True}         	 0.800952380952381
{'n_estimators': 10, 'criterion': 'gini', 'bootstrap': False}          	 0.9519047619047619
{'n_estimators': 10, 'criterion': 'gini', 'bootstrap': True}           	 0.9367857142857143
{'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': False}       	 0.9503571428571429
{'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': True}        	 0.9407142857142857
{'n_estimators': 100, 'criterion': 'gini', 'bootstrap': False}         	 0.9689285714285715
{'n_estimators': 100, 'criterion': 'gini', 'bootstrap': True}          	 0.9651190476190477
{'n_estimators': 100, 'criterion': 'ent

In [57]:
# Baseline cross-validation: one forest per fold, first for the raw data and
# then for the normalized data (5 models per data set, in fold order).
rfs_cv = []
for train_set, set_name in zip(train_sets, set_names):
    print("Training models using %s " % set_name)
    for X_train, y_train in train_set:
        # The tree count is pinned instead of left to the library default,
        # which changed from 10 to 100 in scikit-learn 0.22.
        # NOTE(review): the recorded fold scores for this run (about
        # 0.93-0.94) match the 10-tree rows of the parameter sweep, so the
        # original presumably ran with 10 trees - confirm.
        rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
        rf.fit(X_train, y_train)
        rfs_cv.append(rf)
        print("Done training!")
print("Done all training!")

Training models using Non-normalized Data 




Done training!




Done training!




Done training!




Done training!




Done training!
Training models using Normalized Data 




Done training!




Done training!




Done training!




Done training!




Done training!
Done all training!


In [58]:
# Score each baseline cross-validation model on the test part of its own fold.
test_sets = [rawTestSets, normTestSets]
for set_idx, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    print("Testing models using %s " % set_name)
    # rfs_cv stores 5 models per data set, one per fold, in fold order.
    for fold in range(5):
        X_test, y_test = test_set[fold]
        label = "fold# " + str(fold)
        accuracy = str(rfs_cv[5*set_idx + fold].score(X_test, y_test))
        print("%s \t %s\n" % (label, accuracy), end='')
    

Testing models using Non-normalized Data 
fold# 0 	 0.9358333333333333
fold# 1 	 0.9382142857142857
fold# 2 	 0.9334523809523809
fold# 3 	 0.9403571428571429
fold# 4 	 0.9434523809523809
Testing models using Normalized Data 
fold# 0 	 0.9361904761904762
fold# 1 	 0.9341666666666667
fold# 2 	 0.9378571428571428
fold# 3 	 0.9355952380952381
fold# 4 	 0.9379761904761905


In [59]:
# Cross-validation with the configuration stored at params[8]: one forest per
# fold, first for the raw data and then for the normalized data.
rfs_cv_1 = []
for train_set, set_name in zip(train_sets, set_names):
    print("Training models using %s " % set_name)
    for X_train, y_train in train_set:
        # fit() hands back the fitted estimator, so it can be stored directly.
        rfs_cv_1.append(RandomForestClassifier(n_jobs=-1, **params[8]).fit(X_train, y_train))
        print("Done training!")
print("Done all training!")

Training models using Non-normalized Data 
Done training!
Done training!
Done training!
Done training!
Done training!
Training models using Normalized Data 
Done training!
Done training!
Done training!
Done training!
Done training!
Done all training!


In [60]:
# Score each params[8] cross-validation model on the test part of its own fold.
test_sets = [rawTestSets, normTestSets]
for set_idx, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    print("Testing models using %s " % set_name)
    # rfs_cv_1 stores 5 models per data set, one per fold, in fold order.
    for fold in range(5):
        X_test, y_test = test_set[fold]
        label = "fold# " + str(fold)
        accuracy = str(rfs_cv_1[5*set_idx + fold].score(X_test, y_test))
        print("%s \t %s\n" % (label, accuracy), end='')
    

Testing models using Non-normalized Data 
fold# 0 	 0.9672619047619048
fold# 1 	 0.9682142857142857
fold# 2 	 0.9676190476190476
fold# 3 	 0.9691666666666666
fold# 4 	 0.9714285714285714
Testing models using Normalized Data 
fold# 0 	 0.9671428571428572
fold# 1 	 0.9688095238095238
fold# 2 	 0.9657142857142857
fold# 3 	 0.9694047619047619
fold# 4 	 0.9694047619047619


In [61]:
# Load the samples to predict on as a plain array.
# NOTE(review): the array is passed whole to the classifier later, so the file
# presumably holds feature columns only (no label column) - confirm.
test_table = pd.read_csv('test.csv')
test = test_table.values

In [95]:
def writePredictions(clf, X, path="sample_submission.csv"):
    """Predict a label for every row of ``X`` and save the result as a CSV.

    Args:
        clf: fitted estimator exposing ``predict(X)``.
        X: 2-D array of samples, one per row.
        path: destination file. Defaults to the name that was previously
            hard-coded, so existing two-argument calls behave as before.

    The file starts with the header ``ImageID,Label`` and then has one
    ``id,label`` row per sample, with ids counting up from 1. Values are
    written with the ``%d`` format.
    """
    # NOTE(review): the Kaggle Digit Recognizer sample file may spell the
    # first column "ImageId" - confirm before submitting.
    n_samples = len(X)
    ids = np.arange(1, n_samples + 1)
    predictions = np.asarray(clf.predict(X)).reshape(n_samples)

    # column_stack turns the two 1-D arrays into the columns of one table.
    output = np.column_stack((ids, predictions))
    np.savetxt(path, output, delimiter=",", fmt='%d', header="ImageID,Label", comments='')
    

In [96]:
# Write the submission using the model stored at index 4 of rfs_cv_1, which
# falls in the first (non-normalized) group of that list; `test` is likewise
# passed without normalization.
writePredictions(clf=rfs_cv_1[4], X=test)