In [1]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import pandas as pd
from PIL import Image, ImageDraw
import numpy as np
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


data = pd.read_csv('train.csv').values
X = data[:,1:]
Y = data[:,0]


def getData(X, Y, folds, normalize):
    
    if(normalize==True): X = X/X.max()
    
    kf = KFold(n_splits=folds)
    kf.get_n_splits(X)
    
    print(kf)  
    train_data = []
    test_data = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        train_data.append((X_train,Y_train))
        test_data.append((X_test, Y_test))
        
    return train_data, test_data


# def getTrainingData(X, Y, train_ratio, folds = 5,normalize=True):
    
#     TRAINING_SIZE = (int)(len(data)*train_ratio)
#     training_sets = []
#     if(normalize==True): X = X/X.max()
#     for i in range (folds):
#         X_train = X[TRAINING_SIZE*i:TRAINING_SIZE*(i+1),:]
#         Y_train = Y[TRAINING_SIZE*i:TRAINING_SIZE*(i+1)]
#         training_sets.append(zip(X_train, Y_train))
#         print(len(X_train))
   
#     return training_sets

# def getTestingData(X, Y, train_ratio, test_size, test_remaining=False, normalize=True):
    
#     TRAINING_SIZE = (int)(len(data)*train_ratio)
    
#     if(normalize==True): X = X/X.max()
    
#     if test_remaining==True:
#         X_test = X[TRAINING_SIZE:,:]
#         Y_test = Y[TRAINING_SIZE:]
#     else:
#         X_test = X[TRAINING_SIZE:TRAINING_SIZE+test_size,:]
#         Y_test = Y[TRAINING_SIZE:TRAINING_SIZE+test_size]
    
#     return X_test, Y_test

names = [ "Neural Net", "Nearest Neighbors", "Random Forest", "Linear SVM"]

classifiers = [
    MLPClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    SVC(max_iter = 100)
  ]

TRAINING_RATIO = 0.8

set_names = ["Non-normalized Data", "Normalized Data"]
RTrain_sets, RTest_sets = getData(X, Y, 5, False)
NTrain_sets, NTest_sets = getData(X, Y, 5, True)
Train_sets = [RTrain_sets, NTrain_sets]


Automatically created module for IPython interactive environment
KFold(n_splits=5, random_state=None, shuffle=False)
KFold(n_splits=5, random_state=None, shuffle=False)


In [4]:
# print(len(RTrain_sets[0][1]))
# print(len(RTest_sets[0][1]))

33600
8400


In [5]:

#we have 5 non-normalized training data sets, and 5 normalized training data sets
#             if(name == "Linear SVM" and set_name == "Non-normalized Data"):
#                 print("model never converges")
#             else:

figure = plt.figure(figsize=(27, 9))
i = 1

clfs = []

for train_sets, set_name in zip(Train_sets, set_names):
    print("Training classifiers using: %s ..." % set_name)
    
    for name, clf in zip(names, classifiers):
        i = 1
        print("Training: %s using different training data ..." % name)
        for train_set in train_sets:
            print("Training classifier using fold#: %d ..." % i)
            # iterate over classifiers
            i+=1
            print("Training: %s ..." % name)
            X_train, Y_train = train_set
            clf.fit(X_train, Y_train)
            clfs.append(clf)
            print("Done training: %s!" % name)
        print("Done training %s using different training data!" % name)

    
       
    print("Done training classifiers using: %s ..." % set_name)    

Training classifiers using: Non-normalized Data ...
Training: Neural Net using different training data ...
Training classifier using fold#: 1 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 2 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 3 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 4 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 5 ...
Training: Neural Net ...
Done training: Neural Net!
Done training Neural Net using different training data!
Training: Nearest Neighbors using different training data ...
Training classifier using fold#: 1 ...
Training: Nearest Neighbors ...
Done training: Nearest Neighbors!
Training classifier using fold#: 2 ...
Training: Nearest Neighbors ...
Done training: Nearest Neighbors!
Training classifier using fold#: 3 ...
Training: Nearest Neighbors ...
Done training: Nearest Neighbors!
Train



Done training: Random Forest!
Training classifier using fold#: 2 ...
Training: Random Forest ...
Done training: Random Forest!
Training classifier using fold#: 3 ...
Training: Random Forest ...
Done training: Random Forest!
Training classifier using fold#: 4 ...
Training: Random Forest ...
Done training: Random Forest!
Training classifier using fold#: 5 ...
Training: Random Forest ...
Done training: Random Forest!
Done training Random Forest using different training data!
Training: Linear SVM using different training data ...
Training classifier using fold#: 1 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 2 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 3 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 4 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 5 ...
Training: Linear SVM ...




Done training: Linear SVM!
Done training Linear SVM using different training data!
Done training classifiers using: Non-normalized Data ...
Training classifiers using: Normalized Data ...
Training: Neural Net using different training data ...
Training classifier using fold#: 1 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 2 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 3 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 4 ...
Training: Neural Net ...
Done training: Neural Net!
Training classifier using fold#: 5 ...
Training: Neural Net ...
Done training: Neural Net!
Done training Neural Net using different training data!
Training: Nearest Neighbors using different training data ...
Training classifier using fold#: 1 ...
Training: Nearest Neighbors ...
Done training: Nearest Neighbors!
Training classifier using fold#: 2 ...
Training: Nearest Neighbors ...
Done tra



Done training: Linear SVM!
Training classifier using fold#: 2 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 3 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 4 ...
Training: Linear SVM ...




Done training: Linear SVM!
Training classifier using fold#: 5 ...
Training: Linear SVM ...




Done training: Linear SVM!
Done training Linear SVM using different training data!
Done training classifiers using: Normalized Data ...




<Figure size 1944x648 with 0 Axes>

In [6]:
Test_sets = [RTest_sets, NTest_sets]

# plot_args = [{'c': 'red', 'linestyle': '-'},{'c': 'green', 'linestyle': '-'},
#              {'c': 'blue', 'linestyle': '-'},{'c': 'yellow', 'linestyle': '-'}]
# print(len(clfs))
i = 1
# fig, axes = plt.subplots(figsize=(15, 10))
for test_sets, set_name in zip(Test_sets, set_names):
    print("Testing classifiers using: %s ..." % set_name)
    for test_set in test_sets:
        print("Testing classifiers using fold#: %d ..." % i)
        # iterate over classifiers
        i+=1
        X_test, Y_test = test_set
        print("Testing classifiers using: %s ..." % set_name)
        for clf, name in zip(clfs, names):
    #         if name != "Nearest Neighbors" and name != "Random Forest":
            print("Testing: %s" % name)
            print("Classifier score: %f" % clf.score(X_test, Y_test))
#         print("Training set loss: %f" % clf.loss_)
# #             print(Y_test[0:10])
# #             print(clf.predict(X_test[0:10]))
# #             axes.plot(clf.loss_curve_, label=name, **plot_arg)
#     print("Training set score: %f" % mlp.score(X_test, Y_test))
#     print("Training set loss: %f" % mlp.loss_)
#     print(mlp.loss_curve_)
#     axes.plot(mlp.loss_curve_, label=label, **plot_arg)

# fig.legend(axes.get_lines(), names, ncol = 2, loc="upper center")
# plt.show()

Testing classifiers using: Non-normalized Data ...
Testing classifiers using fold#: 1 ...
Testing classifiers using: Non-normalized Data ...
Testing: Neural Net
Classifier score: 1.000000
Testing: Nearest Neighbors
Classifier score: 1.000000
Testing: Random Forest
Classifier score: 1.000000
Testing: Linear SVM
Classifier score: 1.000000
Testing classifiers using fold#: 2 ...
Testing classifiers using: Non-normalized Data ...
Testing: Neural Net
Classifier score: 1.000000
Testing: Nearest Neighbors
Classifier score: 1.000000
Testing: Random Forest
Classifier score: 1.000000
Testing: Linear SVM
Classifier score: 1.000000
Testing classifiers using fold#: 3 ...
Testing classifiers using: Non-normalized Data ...
Testing: Neural Net
Classifier score: 1.000000
Testing: Nearest Neighbors
Classifier score: 1.000000
Testing: Random Forest
Classifier score: 1.000000
Testing: Linear SVM
Classifier score: 1.000000
Testing classifiers using fold#: 4 ...
Testing classifiers using: Non-normalized Data