In [1]:
# Imports, then load train.csv and split it into label (Y) and feature (X) arrays
#from keras.datasets import mnist
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Read the whole training table into one array: column 0 becomes Y,
# every remaining column becomes X.
data = pd.read_csv('train.csv').values
Y, X = data[:, 0], data[:, 1:]

In [2]:
def getData(X, Y, folds, normalize):
    """Split X and Y into K-fold train/test partitions.

    X, Y: arrays indexed by sample along their first axis.
    folds: number of splits handed to KFold.
    normalize: when equal to True, X is divided by its global maximum
        before splitting; otherwise X is used as given.

    Returns (train_data, test_data): two lists holding one
    (X_part, Y_part) tuple per fold. The KFold object is printed as a
    side effect.
    """
    features = X / X.max() if normalize == True else X

    splitter = KFold(n_splits=folds)
    print(splitter)

    train_data, test_data = [], []
    for train_idx, test_idx in splitter.split(features):
        train_data.append((features[train_idx], Y[train_idx]))
        test_data.append((features[test_idx], Y[test_idx]))

    return train_data, test_data

def getTrainingData(X, Y, train_ratio, normalize=True):
    """Return the leading `train_ratio` fraction of X and Y as a training set.

    X: 2-D array with samples along the first axis.
    Y: array aligned with X along its first axis.
    train_ratio: fraction of the samples, counted from the start, to keep.
    normalize: when equal to True, X is divided by its global maximum
        (taken over all of X, not just the training slice) before slicing.

    Returns (X_train, Y_train).
    """
    # Size the split from the X that was passed in rather than from the
    # notebook-level `data` array, so the function works on any input.
    training_size = int(len(X) * train_ratio)

    if normalize == True:
        X = X / X.max()

    return X[:training_size, :], Y[:training_size]

def getTestingData(X, Y, train_ratio, test_size, test_remaining=False, normalize=True):
    """Return a test slice of X and Y that starts where the training slice ends.

    X: 2-D array with samples along the first axis.
    Y: array aligned with X along its first axis.
    train_ratio: fraction of the samples reserved for training; the test
        slice begins at int(len(X) * train_ratio).
    test_size: number of samples to take when test_remaining is not True.
    test_remaining: when equal to True, take everything after the training
        slice and ignore test_size.
    normalize: when equal to True, X is divided by its global maximum
        (taken over all of X) before slicing.

    Returns (X_test, Y_test).
    """
    # Size the split from the X that was passed in rather than from the
    # notebook-level `data` array, so the function works on any input.
    training_size = int(len(X) * train_ratio)

    if normalize == True:
        X = X / X.max()

    if test_remaining == True:
        stop = None  # slice to the end
    else:
        stop = training_size + test_size

    return X[training_size:stop, :], Y[training_size:stop]

In [3]:
# Build the 5-fold splits twice, first on the raw features and then on the
# max-normalized ones. set_names lists the display names in the same order
# as train_sets.
set_names = ["Non-normalized Data", "Normalized Data"]
rawTrainSets, rawTestSets = getData(X, Y, folds=5, normalize=False)
normTrainSets, normTestSets = getData(X, Y, folds=5, normalize=True)
train_sets = [rawTrainSets, normTrainSets]

KFold(n_splits=5, random_state=None, shuffle=False)
KFold(n_splits=5, random_state=None, shuffle=False)


In [61]:
# Hyper-parameter grid: every combination of forest size, split criterion and
# bootstrap flag. Bootstrap varies fastest, then criterion, then n_estimators.
params = [
    {'n_estimators': n_trees, 'criterion': criterion, 'bootstrap': bootstrap}
    for n_trees in (1, 10, 100)
    for criterion in ('gini', 'entropy')
    for bootstrap in (False, True)
]

# One matplotlib style dict per entry of params (12 in total), changing
# colour/linestyle every three entries.
# NOTE(review): styles change every 3 entries while params changes
# n_estimators every 4 - confirm the intended pairing.
plot_args = [
    {'c': colour, 'linestyle': line}
    for colour, line in (('red', '-'), ('green', '--'), ('blue', '-'), ('yellow', '--'))
    for _ in range(3)
]


In [50]:
# Fit one random forest per (data set, hyper-parameter) combination.
# Models are appended to rfs in a fixed order: every entry of params for the
# first item of train_sets, then every entry of params for the second. Code
# that reads rfs relies on that layout.
RANDOM_STATE = 42  # fixed seed so re-running this cell reproduces the same forests

rfs = []
for train_set, set_name in zip(train_sets, set_names):
    X_train, y_train = train_set[4]  # only the split at index 4 is used for fitting
    print("Training models using %s " % set_name)
    for param in params:
        print("Training using: ", param)
        rf = RandomForestClassifier(**param, n_jobs=-1, random_state=RANDOM_STATE)
        rf.fit(X_train, y_train)
        rfs.append(rf)
        print("Done training!")
print("Done all training!")

Training models using Non-normalized Data 
Training using:  {'n_estimators': 1, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': True}
Done training!
Training using:  {'n_estimators': 100, 'criterion': 'gini', 'bootstrap': False}
Done training!
Training using:  {'n_estimators': 100, 'criterion': 'gini', 'bootstrap': True}
Done training!
Training usi

In [62]:
# Score every trained forest on the held-out part of the split it was fitted on.
# rfs is expected to hold one consecutive run of len(params) models per data
# set, in the same order as set_names.
test_sets = [rawTestSets, normTestSets]
n_configs = len(params)

# Cells can be run out of order, so fail loudly if rfs does not match the
# current params instead of silently pairing models with the wrong labels.
expected_models = n_configs * len(test_sets)
if len(rfs) != expected_models:
    raise RuntimeError(
        "rfs holds %d models but %d were expected; re-run the training cell"
        % (len(rfs), expected_models)
    )

for i, (test_set, set_name) in enumerate(zip(test_sets, set_names)):
    X_test, y_test = test_set[4]  # same split index the training cell fits on
    print("Testing models using %s " % set_name)
    models = rfs[i * n_configs:(i + 1) * n_configs]
    for param, rf in zip(params, models):
        print(param, "\t", rf.score(X_test, y_test))
        

Testing models using Non-normalized Data 
{'n_estimators': 1, 'criterion': 'gini', 'bootstrap': False} 	 0.8196428571428571
{'n_estimators': 1, 'criterion': 'gini', 'bootstrap': True} 	 0.7884523809523809
{'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': False} 	 0.825952380952381
{'n_estimators': 1, 'criterion': 'entropy', 'bootstrap': True} 	 0.8029761904761905
{'n_estimators': 10, 'criterion': 'gini', 'bootstrap': False} 	 0.9492857142857143
{'n_estimators': 10, 'criterion': 'gini', 'bootstrap': True} 	 0.9388095238095238
{'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': False} 	 0.9515476190476191
{'n_estimators': 10, 'criterion': 'entropy', 'bootstrap': True} 	 0.9421428571428572
{'n_estimators': 100, 'criterion': 'gini', 'bootstrap': False} 	 0.9698809523809524
{'n_estimators': 100, 'criterion': 'gini', 'bootstrap': True} 	 0.9655952380952381
{'n_estimators': 100, 'criterion': 'entropy', 'bootstrap': False} 	 0.9696428571428571
{'n_estimators': 100, 'criterion': 'e