In [1]:
# Imports and data loading for the MNIST classification experiments
#from keras.datasets import mnist
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC

# Read the training CSV into a plain array. Column 0 is taken as the label
# vector and every remaining column as the feature matrix.
data = pd.read_csv('train.csv').values
X, Y = data[:, 1:], data[:, 0]

In [2]:
def getData(X, Y, folds, normalize):
    """Split (X, Y) into K-fold train/test partitions.

    Parameters
    ----------
    X : array supporting integer-array indexing (e.g. numpy.ndarray)
        Feature matrix, one sample per row.
    Y : array indexed the same way as X
        Labels aligned with the rows of X.
    folds : int
        Number of splits, passed to ``KFold(n_splits=folds)``.
    normalize : bool
        When truthy, X is divided by its global maximum before splitting.

    Returns
    -------
    train_data : list of (X_train, Y_train) tuples, one per fold
    test_data : list of (X_test, Y_test) tuples, one per fold
        ``train_data[i]`` and ``test_data[i]`` belong to the same split.
    """
    if normalize:
        # Scale by the maximum of the whole matrix, not per column.
        # NOTE: an all-zero X would divide by zero here.
        X = X / X.max()

    kf = KFold(n_splits=folds)
    # Echo the splitter configuration so the notebook output records it.
    print(kf)

    train_data = []
    test_data = []
    for train_index, test_index in kf.split(X):
        train_data.append((X[train_index], Y[train_index]))
        test_data.append((X[test_index], Y[test_index]))

    return train_data, test_data

def getTrainingData(X, Y, train_ratio, normalize=True):
    """Return the leading ``train_ratio`` fraction of (X, Y) as training data.

    Parameters
    ----------
    X : 2-D array
        Feature matrix, one sample per row.
    Y : array
        Labels aligned with the rows of X.
    train_ratio : float
        Fraction of the rows of X to keep; the row count is truncated to int.
    normalize : bool, default True
        When truthy, X is divided by its global maximum (computed over all
        rows, before slicing).

    Returns
    -------
    (X_train, Y_train) : the first ``int(len(X) * train_ratio)`` rows/labels.
    """
    # Size the split from the X that was passed in rather than from the
    # notebook-global `data`, so the function works for any input array.
    training_size = int(len(X) * train_ratio)

    if normalize:
        X = X / X.max()

    return X[:training_size, :], Y[:training_size]

def getTestingData(X, Y, train_ratio, test_size, test_remaining=False, normalize=True):
    """Return test data taken from the rows that follow the training split.

    Parameters
    ----------
    X : 2-D array
        Feature matrix, one sample per row.
    Y : array
        Labels aligned with the rows of X.
    train_ratio : float
        Fraction of the rows of X reserved for training; the test rows start
        at index ``int(len(X) * train_ratio)``.
    test_size : int
        Number of rows to return. Ignored when ``test_remaining`` is truthy.
    test_remaining : bool, default False
        When truthy, return every row after the training split.
    normalize : bool, default True
        When truthy, X is divided by its global maximum (computed over all
        rows, before slicing).

    Returns
    -------
    (X_test, Y_test)
    """
    # Size the split from the X that was passed in rather than from the
    # notebook-global `data`, so the function works for any input array.
    start = int(len(X) * train_ratio)

    if normalize:
        X = X / X.max()

    # A stop of None means "slice to the end".
    stop = None if test_remaining else start + test_size

    return X[start:stop, :], Y[start:stop]

In [None]:
# Build the 5-fold splits twice, first on the raw features and then on the
# max-scaled features. The order here must match `set_names`.
set_names = ["Non-normalized Data", "Normalized Data"]
(rawTrainSets, rawTestSets), (normTrainSets, normTestSets) = (
    getData(X, Y, 5, normalize) for normalize in (False, True)
)
train_sets = [rawTrainSets, normTrainSets]

In [None]:
# Hyper-parameter grid for RandomForestClassifier, in row-major order:
# n_estimators varies slowest, then criterion, then bootstrap.
# Generating the grid keeps every dict on the same three keys, in the same
# key order (the dicts are printed verbatim when results are reported).
params = [{'n_estimators': n_estimators, 'criterion': criterion, 'bootstrap': bootstrap}
          for n_estimators in (1, 10, 100)
          for criterion in ('gini', 'entropy')
          for bootstrap in (False, True)]

# One matplotlib style dict per entry of `params` (12 in total).
# NOTE(review): styles repeat in runs of 3 while `params` groups by
# n_estimators in runs of 4 -- confirm the intended grouping.
plot_args = [{'c': colour, 'linestyle': linestyle}
             for colour, linestyle in (('red', '-'), ('green', '--'),
                                       ('blue', '-'), ('yellow', '--'))
             for _ in range(3)]


In [None]:
# Grid search: fit one forest per entry of `params` on each data variant.
# Models are appended in (data set, parameter) order, so the model for
# params[k] trained on train_sets[i] ends up at rfs[i * len(params) + k].
rfs = []
for train_set, set_name in zip(train_sets, set_names):
    # Only the training portion of split index 4 is used for the grid search.
    X_train, y_train = train_set[4]
    print("Training models using %s " % set_name)
    for param in params:
        print("Training using: ", param)
        # Fixed seed: forests are stochastic, and without it the scores of
        # two parameter settings cannot be compared reproducibly across runs.
        rf = RandomForestClassifier(**param, n_jobs=-1, random_state=0)
        rf.fit(X_train, y_train)
        rfs.append(rf)
        print("Done training!")
print("Done all training!")

In [None]:
# Score every grid-search forest on the held-out part of split index 4
# (the same split index whose training part the models were fitted on).
# `rfs` holds one consecutive run of len(params) models per data set, in
# `set_names` order, so the offsets are derived from len(params) instead of
# a hard-coded count.
test_sets = [rawTestSets, normTestSets]

assert len(rfs) == len(test_sets) * len(params), \
    "rfs does not hold one model per (data set, parameter) pair; re-run the training cell"

for i, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    X_test, y_test = test_set[4]
    print("Testing models using %s " % set_name)
    offset = i * len(params)
    for k, param in enumerate(params):
        score = rfs[offset + k].score(X_test, y_test)
        print("%-70s \t %s\n" % (str(param), str(score)), end='')
        

In [None]:
# Cross-validation: fit one default-parameter forest per fold of each data
# variant. Models are appended in (data set, fold) order, so the model for
# fold k of train_sets[i] ends up at rfs_cv[i * n_folds + k].
rfs_cv = []
for train_set, set_name in zip(train_sets, set_names):
    print("Training models using %s " % set_name)
    for X_train, y_train in train_set:
        # Fixed seed so fold-to-fold score differences come from the data
        # split and not from the forest's own randomness.
        rf = RandomForestClassifier(n_jobs=-1, random_state=0)
        rf.fit(X_train, y_train)
        rfs_cv.append(rf)
        print("Done training!")
print("Done all training!")

In [None]:
# Score each cross-validation forest on the test part of its own fold.
# `rfs_cv` holds one consecutive run of models per data set (one per fold,
# in `set_names` order), so the offset is derived from the number of folds
# in the set instead of a hard-coded count.
test_sets = [rawTestSets, normTestSets]

for i, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    print("Testing models using %s " % set_name)
    offset = i * len(test_set)
    for fold, (X_test, y_test) in enumerate(test_set):
        score = rfs_cv[offset + fold].score(X_test, y_test)
        print("%s \t %s\n" % ("fold# " + str(fold), str(score)), end='')
    