In [1]:
import sklearn
import numpy as np
import pandas as pd
import json
from scipy import stats
from scipy.stats import ttest_1samp
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from random import randint
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Methods before acquiring data from models, methods to acquire and clean data.
#Scales all X data to ensure feature weight is all the same
scaler = MinMaxScaler(feature_range=(0, 1))

#The manual Test can be a lot, we use every entry from 0 to whatever random number was generated
def randomizeData(dataset):
    if(dataset == 'covtype.csv'):
        data = pd.read_csv('covtype.csv')
        randomNumber = randint(1,len(data.index) - 5000)
        randomDataSampling = data[randomNumber:randomNumber + 5000]
        randomTestSampling = data[0:randomNumber]
        return randomDataSampling,randomTestSampling
    if(dataset == 'letter-recognition.csv'):
        data = pd.read_csv('letter-recognition.csv')
        randomNumber = randint(1,len(data.index) - 5000)
        randomDataSampling = data[randomNumber:randomNumber + 5000]
        randomTestSampling = data[0:randomNumber]
        return randomDataSampling,randomTestSampling
    if(dataset == 'Adult.csv'):
        data = pd.read_csv('Adult.csv')
        randomNumber = randint(1,len(data.index) - 5000)
        randomDataSampling = data[randomNumber:randomNumber + 5000]
        randomTestSampling = data[0:randomNumber]
        return randomDataSampling,randomTestSampling
    elif(dataset == dataset):
        data = pd.read_csv(dataset)
        randomNumber = randint(1,len(data.index) - 5000)
        randomDataSampling = data[randomNumber:randomNumber + 5000]
        randomTestSampling = data[0:randomNumber]
        return randomDataSampling,randomTestSampling

#No method to return X set and Y set of any dataset as datasets may vary.
#Returns covertype data as X set and Y set.
#Output: two outputs, X and Y sets and manual testing sets
def covtypeData():
    covtypeData,covtypeTestData = randomizeData('covtype.csv')
    covtypeDataX = covtypeData.iloc[:, 0: 53]
    covtypeDataY = covtypeData.iloc[:, 54]
    covtypeDataX = scaler.fit_transform(covtypeDataX)
    covtypeTestDataX = covtypeTestData.iloc[:, 0: 53]
    covtypeTestDataY = covtypeTestData.iloc[:, 54]
    covtypeTestDataX = scaler.fit_transform(covtypeTestDataX)
    return covtypeDataX,covtypeDataY,covtypeTestDataX,covtypeTestDataY

#Returns letter data as X set and Y set
#Output: 4 outputs, X and Y sets and manual testing sets
def letterData():
    letterData,letterTestData = randomizeData('letter-recognition.csv')
    letterDataX = letterData.iloc[:,1:17]
    letterDataY = letterData.iloc[:, 0]
    letterDataX = scaler.fit_transform(letterDataX)
    #Change answer column to guess if the letter is within range of A-M on the alphabet. 1 if it is, 0 if not
    allowed = ['A','B','C','D','E','F','G','H','I','J','K','L','M','a','b','c','d','e','f','g','h','i','j','k','l','m']
    letterDataY = letterDataY.apply(lambda x: 1 if x in allowed else 0)
    letterTestDataX = letterTestData.iloc[:,1:17]
    letterTestDataY = letterTestData.iloc[:, 0]
    letterTestDataX = scaler.fit_transform(letterTestDataX)
    letterTestDataY = letterTestDataY.apply(lambda x: 1 if x in allowed else 0)
    return letterDataX, letterDataY, letterTestDataX, letterTestDataY

#Returns Adult data as one hot encoded X set and Y set
#Output: two outputs, X and Y sets
def adultData():
    #Clean Adult data set
    #adultDataX1, adultDataY1 = adultData()
    adultData,adultTestdata = randomizeData('Adult.csv')
    adultDataY = adultData.iloc[:, -1]
    adultDataX = adultData.iloc[:, 0: 14]
    over = [' <=50K']
    adultDataY = adultDataY.apply(lambda x: 1 if x in over else 0)
    one_hot = pd.get_dummies(adultDataX)
    return one_hot, adultDataY,


In [3]:
#Instantiate all estimator objects and it's parameters to be used for gridsearchCV

#Knn estimator object instantiated, try to choose parameters by having different number of neighbors
#Everything else is default
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
knn = KNeighborsClassifier()
knnParamGrid = {'n_neighbors': [5, 6, 7, 8, 9], 'weights' : ['uniform', 'distance']}

#svc estimator object instantiated
#Checks list of float values for regularization parameter C, kernel type for algorithm linear or rbf,
#and gamma the kernel coefficient for rbf
svc = svm.SVC()
svcParamGrid = {'C':[1, 5, 10, 20, 100],'gamma':[1,2,3], 'kernel':['linear','rbf']}

#Naive Bayesian estimator object instantiated
#Uses different alphas and to check if it's prior data should be is better
mnb = MultinomialNB()
mnbParamGrid = {'alpha':[1, 5, 10, 20, 100],'fit_prior':[True,False]}

#Kmeans estimator object instantiated
#n_clusters : The number of clusters to form as well as the number of centroids to generate 
#init : Has initial clustering and random clustering
km = KMeans()
kmParamGrid = {'n_clusters':[1, 5, 10, 15],'init':['random','k-means++'],'algorithm':['auto','full']}

In [4]:
#covtype.csv currently has three randomly sampled data sets split into X and Y datasets
#and into train and test sets to be used in gridsearchcv
covtypeDataX1, covtypeDataY1, covtypeDataX1_test, covtypeDataY1_test = covtypeData()
covtypeDataX2, covtypeDataY2, covtypeDataX2_test, covtypeDataY2_test = covtypeData()
covtypeDataX3, covtypeDataY3, covtypeDataX3_test, covtypeDataY3_test = covtypeData()

In [5]:
#Make gridsearchcv objects using our svc estimator object, svc parameters, and using accuracy for scoring
#Uses cross validation
#set 1
kmCovTypegrid1 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmCovTypegrid1.fit(covtypeDataX1, covtypeDataY1)
kmCovTypePrediction1 = kmCovTypegrid1.score(covtypeDataX1_test,covtypeDataY1_test)
#set 2
kmCovTypegrid2 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmCovTypegrid2.fit(covtypeDataX2, covtypeDataY2)
kmCovTypePrediction2 = kmCovTypegrid2.score(covtypeDataX2_test,covtypeDataY2_test)
#set 3
kmCovTypegrid3 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmCovTypegrid3.fit(covtypeDataX3, covtypeDataY3)
kmCovTypePrediction3 = kmCovTypegrid3.score(covtypeDataX3_test,covtypeDataY3_test)

In [6]:
#Make gridsearchcv objects using our knn estimator object, knn parameters, and using accuracy for scoring
#Uses cross validation
#set 1
knnCovTypegrid1 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnCovTypegrid1.fit(covtypeDataX1,covtypeDataY1)
knnCovTypePrediction1 = knnCovTypegrid1.score(covtypeDataX1_test,covtypeDataY1_test)
#set 2
knnCovTypegrid2 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnCovTypegrid2.fit(covtypeDataX2,covtypeDataY2)
knnCovTypePrediction2 = knnCovTypegrid2.score(covtypeDataX2_test,covtypeDataY2_test)
#set 3
knnCovTypegrid3 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnCovTypegrid3.fit(covtypeDataX1,covtypeDataY1)
knnCovTypePrediction3 = knnCovTypegrid3.score(covtypeDataX3_test,covtypeDataY3_test)

In [7]:
#Make gridsearchcv objects using our Multinomial NB estimator object, default parameters, and using accuracy for scoring
#Uses cross validation
#set 1
mnbCovTypegrid1 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbCovTypegrid1.fit(covtypeDataX1,covtypeDataY1)
mnbcovTypePrediction1 = mnbCovTypegrid1.score(covtypeDataX1_test,covtypeDataY1_test)
#set 2
mnbCovTypegrid2 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbCovTypegrid2.fit(covtypeDataX2,covtypeDataY2)
mnbcovTypePrediction2 = mnbCovTypegrid2.score(covtypeDataX2_test,covtypeDataY2_test)
#set 3
mnbCovTypegrid3 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbCovTypegrid3.fit(covtypeDataX1,covtypeDataY1)
mnbcovTypePrediction3 = mnbCovTypegrid3.score(covtypeDataX3_test,covtypeDataY3_test)

In [8]:
#letter.csv currently has three randomly sampled data sets split into X and Y datasets
#and into train and test sets to be used in gridsearchcv
letterDataX1, letterDataY1, letterDataX1_test, letterDataY1_test = letterData()
letterDataX2, letterDataY2, letterDataX2_test, letterDataY2_test = letterData()
letterDataX3, letterDataY3, letterDataX3_test, letterDataY3_test = letterData()

In [9]:
#Make gridsearchcv objects using our svc estimator object, svc parameters, and using accuracy for scoring
#Uses cross validation
#set 1
kmLetterGrid1 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmLetterGrid1.fit(letterDataX1, letterDataY1)
kmLetterPrediction1 = kmLetterGrid1.score(letterDataX1_test,letterDataY1_test)
#set 2
kmLetterGrid2 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmLetterGrid2.fit(letterDataX2, letterDataY2)
kmLetterPrediction2 = kmLetterGrid2.score(letterDataX2_test,letterDataY2_test)
#set 3
kmLetterGrid3 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmLetterGrid3.fit(letterDataX3, letterDataY3)
kmLetterPrediction3 = kmLetterGrid3.score(letterDataX3_test,letterDataY3_test)

In [10]:
#Make gridsearchcv objects using our knn estimator object, knn parameters, and using accuracy for scoring
#Uses cross validation
#set 1
knnLetterGrid1 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnLetterGrid1.fit(letterDataX1, letterDataY1)
knnLetterPrediction1 = knnLetterGrid1.score(letterDataX1_test,letterDataY1_test)
#set 2
knnLetterGrid2 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnLetterGrid2.fit(letterDataX2, letterDataY2)
knnLetterPrediction2 = knnLetterGrid2.score(letterDataX2_test,letterDataY2_test)
#set 3
knnLetterGrid3 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnLetterGrid3.fit(letterDataX3, letterDataY3)
knnLetterPrediction3 = knnLetterGrid3.score(letterDataX3_test,letterDataY3_test)

In [11]:
#Make gridsearchcv objects using our Multinomial NB estimator object, default parameters, and using accuracy for scoring
#Uses cross validation
#set 1
mnbLetterGrid1 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbLetterGrid1.fit(letterDataX1, letterDataY1)
mnbLetterPrediction1 = mnbLetterGrid1.score(letterDataX1_test,letterDataY1_test)
#set 2
mnbLetterGrid2 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbLetterGrid2.fit(letterDataX2, letterDataY2)
mnbLetterPrediction2 = mnbLetterGrid2.score(letterDataX2_test,letterDataY2_test)
#set 3
mnbLetterGrid3 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbLetterGrid3.fit(letterDataX3, letterDataY3)
mnbLetterPrediction3 = mnbLetterGrid3.score(letterDataX3_test,letterDataY3_test)

In [12]:
#letter.csv currently has three randomly sampled data sets split into X and Y datasets to be used in gridsearchCV
#Can't have predictions for this dataset because on hot encoding changed the number of features 
#so we couldn't use our model against a mantual test set
adultDataX1, adultDataY1 = adultData()
adultDataX2, adultDataY2 = adultData()
adultDataX3, adultDataY3 = adultData()

In [13]:
#Make gridsearchcv objects using our svc estimator object, svc parameters, and using accuracy for scoring
#set 1
kmAdultGrid1 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmAdultGrid1.fit(adultDataX1, adultDataY1)
#set 2
kmAdultGrid2 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmAdultGrid2.fit(adultDataX2, adultDataY2)
#set 3
kmAdultGrid3 = GridSearchCV(km,kmParamGrid,scoring='accuracy',return_train_score=True)
kmAdultGrid3.fit(adultDataX3, adultDataY3)
#This random variable is included to supress .fit from printing to console.
#It's not needed for other adult sets because we manually test our models too.
randomvariable = 2

In [14]:
#Make gridsearchcv objects using our knn estimator object, knn parameters, and using accuracy for scoring
#set 1
knnAdultGrid1 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnAdultGrid1.fit(adultDataX1, adultDataY1)
#set 2
knnAdultGrid2 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnAdultGrid2.fit(adultDataX2, adultDataY2)
#set 3
knnAdultGrid3 = GridSearchCV(knn,knnParamGrid,scoring='accuracy',return_train_score=True)
knnAdultGrid3.fit(adultDataX3, adultDataY3)
randomvariable = 2

In [15]:
#Make gridsearchcv objects using our Multinomial NB estimator object, default parameters, and using accuracy for scoring
#set 1
mnbAdultGrid1 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbAdultGrid1.fit(adultDataX1, adultDataY1)
#set 2
mnbAdultGrid2 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbAdultGrid2.fit(adultDataX2, adultDataY2)
#set 3
mnbAdultGrid3 = GridSearchCV(mnb,mnbParamGrid,scoring='accuracy',return_train_score=True)
mnbAdultGrid3.fit(adultDataX3, adultDataY3)
randomvariable = 2

In [16]:
#All Averages used within our methods to build tables
averagemnbLetterManualTest = (mnbLetterPrediction1+mnbLetterPrediction2+mnbLetterPrediction3)/3
averageknnLetterManualTest = (knnLetterPrediction1+knnLetterPrediction2+knnLetterPrediction3)/3
averagekmLetterManualTest = (kmLetterPrediction1+kmLetterPrediction2+kmLetterPrediction3)/3
averagemnbCovTypeManualTest = (mnbcovTypePrediction1+mnbcovTypePrediction2+mnbcovTypePrediction3)/3
averageknnCovTypeManualTest = (knnCovTypePrediction1+knnCovTypePrediction2+knnCovTypePrediction3)/3
averagekmCovTypeManualTest = (kmCovTypePrediction1+kmCovTypePrediction2+kmCovTypePrediction3)/3
averagemnbAdultTrain = (mnbAdultGrid1.cv_results_['mean_train_score'].mean(axis=0)+mnbAdultGrid2.cv_results_['mean_train_score'].mean(axis=0)+mnbAdultGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averageknnAdultTrain = (knnAdultGrid1.cv_results_['mean_train_score'].mean(axis=0)+knnAdultGrid2.cv_results_['mean_train_score'].mean(axis=0)+knnAdultGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagekmAdultTrain = (kmAdultGrid1.cv_results_['mean_train_score'].mean(axis=0)+kmAdultGrid2.cv_results_['mean_train_score'].mean(axis=0)+kmAdultGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagemnbCovTypeTrain = (mnbCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0)+mnbCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0)+mnbCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averageknnCovTypeTrain = (knnCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0)+knnCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0)+knnCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagekmCovTypeTrain = (kmCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0)+kmCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0)+kmCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagemnbLetterTrain = (mnbLetterGrid1.cv_results_['mean_train_score'].mean(axis=0)+mnbLetterGrid2.cv_results_['mean_train_score'].mean(axis=0)+mnbLetterGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averageknnLetterTrain = (knnLetterGrid1.cv_results_['mean_train_score'].mean(axis=0)+knnLetterGrid2.cv_results_['mean_train_score'].mean(axis=0)+knnLetterGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagekmLetterTrain = (kmLetterGrid1.cv_results_['mean_train_score'].mean(axis=0)+kmLetterGrid2.cv_results_['mean_train_score'].mean(axis=0)+kmLetterGrid3.cv_results_['mean_train_score'].mean(axis=0))/3
averagemnbAdultTest = (mnbAdultGrid1.cv_results_['mean_test_score'].mean(axis=0)+mnbAdultGrid2.cv_results_['mean_test_score'].mean(axis=0)+mnbAdultGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averageknnAdultTest = (knnAdultGrid1.cv_results_['mean_test_score'].mean(axis=0)+knnAdultGrid2.cv_results_['mean_test_score'].mean(axis=0)+knnAdultGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagekmAdultTest = (kmAdultGrid1.cv_results_['mean_test_score'].mean(axis=0)+kmAdultGrid2.cv_results_['mean_test_score'].mean(axis=0)+kmAdultGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagemnbCovTypeTest = (mnbCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0)+mnbCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0)+mnbCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averageknnCovTypeTest = (knnCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0)+knnCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0)+knnCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagekmCovTypeTest = (kmCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0)+kmCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0)+kmCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagemnbLetterTest = (mnbLetterGrid1.cv_results_['mean_test_score'].mean(axis=0)+mnbLetterGrid2.cv_results_['mean_test_score'].mean(axis=0)+mnbLetterGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averageknnLetterTest = (knnLetterGrid1.cv_results_['mean_test_score'].mean(axis=0)+knnLetterGrid2.cv_results_['mean_test_score'].mean(axis=0)+knnLetterGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagekmLetterTest = (kmLetterGrid1.cv_results_['mean_test_score'].mean(axis=0)+kmLetterGrid2.cv_results_['mean_test_score'].mean(axis=0)+kmLetterGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagemnbTest = (averagemnbLetterTest + averagemnbAdultTest + averagemnbCovTypeTest)/3
averageknnTest = (averageknnLetterTest+ averageknnAdultTest + averageknnCovTypeTest)/3
averagekmTest = (averagekmLetterTest + averagekmAdultTest + averagekmCovTypeTest)/3



In [17]:
#Methods to make all tables
#Method to make P values from null hypothesis
def makePValuesandTValues():
    #Null hypothesis is that the each model will on average of all trials perform with a score of 70% or .7 or better on all datasets
    mnbResults = [averagemnbAdultTest,averagemnbLetterTest,averagemnbCovTypeTest]
    knnResults = [averageknnAdultTest,averageknnLetterTest,averageknnCovTypeTest]
    kmResults = [averagekmAdultTest,averagekmLetterTest,averagekmCovTypeTest]
    mnbtstatstic, mnbpval = ttest_1samp(mnbResults, .7)
    knntstatstic, knnpval = ttest_1samp(knnResults, .7)
    kmtstatstic, kmpval = ttest_1samp(kmResults, .7)
    return mnbtstatstic,knntstatstic,kmtstatstic,mnbpval,knnpval,kmpval

#Table one combines the main Table 1 with raw test set scores combining secondary results.
#Table of mean of test set performance for each algorithm/dataset
def MakeTable1():
    MainTable1Data = np.array([
             [mnbAdultGrid1.cv_results_['mean_test_score'].mean(axis=0),mnbAdultGrid2.cv_results_['mean_test_score'].mean(axis=0),mnbAdultGrid3.cv_results_['mean_test_score'].mean(axis=0),averagemnbAdultTest]
            ,[knnAdultGrid1.cv_results_['mean_test_score'].mean(axis=0),knnAdultGrid2.cv_results_['mean_test_score'].mean(axis=0),knnAdultGrid3.cv_results_['mean_test_score'].mean(axis=0),averageknnAdultTest]
            ,[kmAdultGrid1.cv_results_['mean_test_score'].mean(axis=0),kmAdultGrid2.cv_results_['mean_test_score'].mean(axis=0),kmAdultGrid3.cv_results_['mean_test_score'].mean(axis=0),averagekmAdultTest]
            ,[mnbLetterGrid1.cv_results_['mean_test_score'].mean(axis=0),mnbLetterGrid2.cv_results_['mean_test_score'].mean(axis=0),mnbLetterGrid3.cv_results_['mean_test_score'].mean(axis=0),averagemnbLetterTest]
            ,[knnLetterGrid1.cv_results_['mean_test_score'].mean(axis=0),knnLetterGrid2.cv_results_['mean_test_score'].mean(axis=0),knnLetterGrid3.cv_results_['mean_test_score'].mean(axis=0),averageknnLetterTest]
            ,[kmLetterGrid1.cv_results_['mean_test_score'].mean(axis=0),kmLetterGrid2.cv_results_['mean_test_score'].mean(axis=0),kmLetterGrid3.cv_results_['mean_test_score'].mean(axis=0),averagekmLetterTest]
            ,[mnbCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0),mnbCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0),mnbCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0),averagemnbCovTypeTest]
            ,[knnCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0),knnCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0),knnCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0),averageknnCovTypeTest]
            ,[kmCovTypegrid1.cv_results_['mean_test_score'].mean(axis=0),kmCovTypegrid2.cv_results_['mean_test_score'].mean(axis=0),kmCovTypegrid3.cv_results_['mean_test_score'].mean(axis=0),averagekmCovTypeTest]
            ])
    MainTable1 = pd.DataFrame(MainTable1Data, columns=['1', '2', '3','AverageAmongstAllTests'])
    MainTable1['TypeOfAlgo/Data'] = ['mnbAdultTest','knnAdultTest','kmAdultTest','mnbLetterTest','knnLetterTest','kmLetterTest','mnbCovTypeTest','knnCovTypeTest','kmCovTypeTest']
    MainTable1 = MainTable1.set_index('TypeOfAlgo/Data')
    return MainTable1

#Table two combines average of all algorithms for all datasets and P values if P value > .05 accept null hypothesis,
#If P value < .05 accept alternate hypothesis
def MakeTable2():
    mnbtstatstic,knntstatstic,kmtstatstic,mnbpval,knnpval,kmpval = makePValuesandTValues()
    MainTable2Data = np.array([
             [averagemnbCovTypeTest,averagemnbAdultTest,averagemnbLetterTest,averagemnbTest,mnbpval,mnbtstatstic]
            ,[averageknnCovTypeTest,averageknnAdultTest,averageknnLetterTest,averageknnTest,knnpval,knntstatstic]
            ,[averagekmCovTypeTest,averagekmAdultTest,averagekmLetterTest,averagekmTest,kmpval,kmtstatstic]
            ])
    MainTable2 = pd.DataFrame(MainTable2Data, columns=['Covt', 'Adult', 'Letter','AverageAmongstAllTests','PValue','TTestValue'])
    MainTable2['TypeOfModel'] = ['mnb','knn','km']
    MainTable2 = MainTable2.set_index('TypeOfModel')
    return MainTable2

#Table of mean of training set performance for each algorithm/dataset for each trial and average amongst all trials
def MakeTable3():
    SecondaryTable1Data = np.array([
             [mnbAdultGrid1.cv_results_['mean_train_score'].mean(axis=0),mnbAdultGrid2.cv_results_['mean_train_score'].mean(axis=0),mnbAdultGrid3.cv_results_['mean_train_score'].mean(axis=0),averagemnbAdultTrain]
            ,[knnAdultGrid1.cv_results_['mean_train_score'].mean(axis=0),knnAdultGrid2.cv_results_['mean_train_score'].mean(axis=0),knnAdultGrid3.cv_results_['mean_train_score'].mean(axis=0),averageknnAdultTrain]
            ,[kmAdultGrid1.cv_results_['mean_train_score'].mean(axis=0),kmAdultGrid2.cv_results_['mean_train_score'].mean(axis=0),kmAdultGrid3.cv_results_['mean_train_score'].mean(axis=0),averagekmAdultTrain]
            ,[mnbLetterGrid1.cv_results_['mean_train_score'].mean(axis=0),mnbLetterGrid2.cv_results_['mean_train_score'].mean(axis=0),mnbLetterGrid3.cv_results_['mean_train_score'].mean(axis=0),averagemnbLetterTrain]
            ,[knnLetterGrid1.cv_results_['mean_train_score'].mean(axis=0),knnLetterGrid2.cv_results_['mean_train_score'].mean(axis=0),knnLetterGrid3.cv_results_['mean_train_score'].mean(axis=0),averageknnLetterTrain]
            ,[kmLetterGrid1.cv_results_['mean_train_score'].mean(axis=0),kmLetterGrid2.cv_results_['mean_train_score'].mean(axis=0),kmLetterGrid3.cv_results_['mean_train_score'].mean(axis=0),averagekmLetterTrain]
            ,[mnbCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0),mnbCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0),mnbCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0),averagemnbCovTypeTrain]
            ,[knnCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0),knnCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0),knnCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0),averageknnCovTypeTrain]
            ,[kmCovTypegrid1.cv_results_['mean_train_score'].mean(axis=0),kmCovTypegrid2.cv_results_['mean_train_score'].mean(axis=0),kmCovTypegrid3.cv_results_['mean_train_score'].mean(axis=0),averagekmCovTypeTrain]
            ])
    MainTable3 = pd.DataFrame(SecondaryTable1Data, columns=['1', '2', '3','AverageAmongstAllTraining'])
    MainTable3['TypeOfAlgo/Data'] = ['mnbAdultTrain','knnAdultTrain','kmAdultTrain','mnbLetterTrain','knnLetterTrain','kmLetterTrain','mnbCovTypeTrain','knnCovTypeTrain','kmCovTypeTrain']
    MainTable3 = MainTable3.set_index('TypeOfAlgo/Data')
    return MainTable3

#Manual test set checking to compare to table 1 overfitting
def MakeTable4():
    ExtraTableData = np.array([
            [mnbLetterPrediction1,mnbLetterPrediction2,mnbLetterPrediction3,averagemnbLetterManualTest]
            ,[knnLetterPrediction1,knnLetterPrediction2,knnLetterPrediction3,averageknnLetterManualTest]
            ,[kmLetterPrediction1,kmLetterPrediction2,kmLetterPrediction3,averagekmLetterManualTest]
            ,[mnbcovTypePrediction1,mnbcovTypePrediction2,mnbcovTypePrediction3,averagemnbCovTypeManualTest]
            ,[knnCovTypePrediction1,knnCovTypePrediction2,knnCovTypePrediction3,averageknnCovTypeManualTest]
            ,[kmCovTypePrediction1,kmCovTypePrediction2,kmCovTypePrediction3,averagekmCovTypeManualTest]
            ])
    ExtraTable = pd.DataFrame(ExtraTableData, columns=['1', '2', '3','AverageAmongstAllManualTesting'])
    ExtraTable['TypeOfAlgo/Data'] = ['mnbLetterTrain','knnLetterTrain','kmLetterTrain','mnbCovTypeTrain','knnCovTypeTrain','kmCovTypeTrain']
    ExtraTable = ExtraTable.set_index('TypeOfAlgo/Data')
    return ExtraTable

#Returns the best parameters for each algorithm/dataset combination
def ParameterTable():
    mnbAdultParams = ('Best parameters for each mnbAdult trials are' + json.dumps(mnbAdultGrid1.best_params_)+', '+json.dumps(mnbAdultGrid2.best_params_)+', '+json.dumps(mnbAdultGrid3.best_params_))
    knnAdultParams = ('Best parameters for each knnAdult trials are' + json.dumps(knnAdultGrid1.best_params_)+', '+json.dumps(knnAdultGrid2.best_params_)+', '+json.dumps(knnAdultGrid3.best_params_))
    kmAdultParams = ('Best parameters for each kmAdult trials are' + json.dumps(kmAdultGrid1.best_params_)+', '+json.dumps(kmAdultGrid2.best_params_)+', '+json.dumps(kmAdultGrid3.best_params_))
    mnbLettersParam = ('Best parameters for each mnbLetter trials are' + json.dumps(mnbLetterGrid1.best_params_)+', '+json.dumps(mnbLetterGrid2.best_params_)+', '+json.dumps(mnbLetterGrid3.best_params_))
    knnLettersParam = ('Best parameters for each knnLetter trials are' + json.dumps(knnLetterGrid1.best_params_)+', '+json.dumps(knnLetterGrid2.best_params_)+', '+json.dumps(knnLetterGrid3.best_params_))
    kmLettersParams = ('Best parameters for each kmLetter trials are' + json.dumps(kmLetterGrid1.best_params_)+', '+json.dumps(kmLetterGrid2.best_params_)+', '+json.dumps(kmLetterGrid3.best_params_))
    mnbCovTypeParams = ('Best parameters for each mnbCovType trials are' + json.dumps(mnbCovTypegrid1.best_params_)+', '+json.dumps(mnbCovTypegrid2.best_params_)+', '+json.dumps(mnbCovTypegrid3.best_params_))
    knnCovTypeParams = ('Best parameters for each knnCovType trials are' + json.dumps(knnCovTypegrid1.best_params_)+', '+json.dumps(knnCovTypegrid2.best_params_)+', '+json.dumps(knnCovTypegrid3.best_params_))
    kmCovTypeParams = ('Best parameters for each kmCovType trials are' + json.dumps(kmLetterGrid1.best_params_)+', '+json.dumps(kmLetterGrid2.best_params_)+', '+json.dumps(kmLetterGrid3.best_params_))
    df = pd.DataFrame({'' : [
                (mnbAdultParams),
                (knnAdultParams),
                (kmAdultParams),
                (mnbLettersParam),
                (knnLettersParam),
                (kmLettersParams),
                (mnbCovTypeParams),
                (knnCovTypeParams),
                (kmCovTypeParams)
                        ],
                'Algorithm/Data': ['mnbAdult','knnAdult','kmAdult','mnbLetter','knnLetter','kmLetter','mnbCovType','knnCovType','kmCovType']
                  })
    df = df.set_index('Algorithm/Data')
    return df

In [18]:
MainTable1 = MakeTable1()
display(MainTable1)

Unnamed: 0_level_0,1,2,3,AverageAmongstAllTests
TypeOfAlgo/Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mnbAdultTest,0.7838,0.7876,0.7816,0.784333
knnAdultTest,0.76208,0.76508,0.75316,0.760107
kmAdultTest,0.15615,0.158925,0.159988,0.158354
mnbLetterTest,0.65332,0.65526,0.6696,0.659393
knnLetterTest,0.93162,0.93898,0.93212,0.93424
kmLetterTest,0.220525,0.214637,0.221025,0.218729
mnbCovTypeTest,0.80752,0.74388,0.80752,0.786307
knnCovTypeTest,0.95388,0.9059,0.95388,0.937887
kmCovTypeTest,0.1164,0.102938,0.106925,0.108754


In [19]:
MainTable2 = MakeTable2()
display(MainTable2)

Unnamed: 0_level_0,Covt,Adult,Letter,AverageAmongstAllTests,PValue,TTestValue
TypeOfModel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mnb,0.786307,0.784333,0.659393,0.743344,0.410336,1.032516
knn,0.937887,0.760107,0.93424,0.877411,0.094146,3.024311
km,0.108754,0.158354,0.218729,0.161946,0.003474,-16.921113


In [20]:
SecondaryTable1 = MakeTable3()
display(SecondaryTable1)

Unnamed: 0_level_0,1,2,3,AverageAmongstAllTraining
TypeOfAlgo/Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mnbAdultTrain,0.78432,0.78787,0.78153,0.784573
knnAdultTrain,0.90591,0.90748,0.9043,0.905897
kmAdultTrain,0.156106,0.159497,0.161975,0.159193
mnbLetterTrain,0.65429,0.657465,0.67084,0.660865
knnLetterTrain,0.97982,0.981075,0.97884,0.979912
kmLetterTrain,0.2205,0.214403,0.220178,0.21836
mnbCovTypeTrain,0.809475,0.75,0.809475,0.78965
knnCovTypeTrain,0.988015,0.98351,0.988015,0.986513
kmCovTypeTrain,0.115697,0.104659,0.104884,0.108414


In [21]:
ExtraTable = MakeTable4()
display(ExtraTable)

Unnamed: 0_level_0,1,2,3,AverageAmongstAllManualTesting
TypeOfAlgo/Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mnbLetterTrain,0.663981,0.667134,0.707646,0.679587
knnLetterTrain,0.945409,0.948121,0.917541,0.937024
kmLetterTrain,0.495485,0.500748,0.488756,0.494996
mnbCovTypeTrain,0.23309,0.455951,0.231165,0.306735
knnCovTypeTrain,0.364501,0.530932,0.363127,0.41952
kmCovTypeTrain,0.309452,0.191579,0.089297,0.196776


In [22]:
pd.options.display.max_colwidth
parameterTable1 = ParameterTable()
display(parameterTable1)

Algorithm/Data,Unnamed: 1
mnbAdult,"Best parameters for each mnbAdult trials are{""..."
knnAdult,"Best parameters for each knnAdult trials are{""..."
kmAdult,"Best parameters for each kmAdult trials are{""a..."
mnbLetter,Best parameters for each mnbLetter trials are{...
knnLetter,Best parameters for each knnLetter trials are{...
kmLetter,"Best parameters for each kmLetter trials are{""..."
mnbCovType,Best parameters for each mnbCovType trials are...
knnCovType,Best parameters for each knnCovType trials are...
kmCovType,Best parameters for each kmCovType trials are{...


In [23]:
#This is all extra SVC testing, originally planned to use svm but took too long for adult data to process

In [24]:
#Make gridsearchcv objects using our svc estimator object, svc parameters, and using accuracy for scoring
#Uses cross validation
#set 1
svcCovTypeGrid1 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcCovTypeGrid1.fit(covtypeDataX1, covtypeDataY1)
svcCovTypePrediction1 = svcCovTypeGrid1.score(covtypeDataX1_test,covtypeDataY1_test)
#set 2
svcCovTypeGrid2 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcCovTypeGrid2.fit(covtypeDataX2, covtypeDataY2)
svcCovTypePrediction2 = svcCovTypeGrid2.score(covtypeDataX2_test,covtypeDataY2_test)
#set 3
svcCovTypeGrid3 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcCovTypeGrid3.fit(covtypeDataX3, covtypeDataY3)
svcCovTypePrediction3 = svcCovTypeGrid3.score(covtypeDataX3_test,covtypeDataY3_test)

In [25]:
#Make gridsearchcv objects using our svc estimator object, svc parameters, and using accuracy for scoring
#Uses cross validation
#set 1
svcLetterGrid1 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcLetterGrid1.fit(letterDataX1, letterDataY1)
svcLetterPrediction1 = svcLetterGrid1.score(letterDataX1_test,letterDataY1_test)
#set 2
svcLetterGrid2 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcLetterGrid2.fit(letterDataX2, letterDataY2)
svcLetterPrediction2 = svcLetterGrid2.score(letterDataX2_test,letterDataY2_test)
#set 3
svcLetterGrid3 = GridSearchCV(svc,svcParamGrid,scoring='accuracy',return_train_score=True)
svcLetterGrid3.fit(letterDataX3, letterDataY3)
svcLetterPrediction3 = svcLetterGrid3.score(letterDataX3_test,letterDataY3_test)

In [26]:
#Here we Make all tables associated with svc
pd.options.display.max_colwidth = 10000
svcCovTypeParams = ('Best parameters for each svcCovType trials are' + json.dumps(svcCovTypeGrid1.best_params_)+', '+json.dumps(svcCovTypeGrid2.best_params_)+', '+json.dumps(svcCovTypeGrid3.best_params_))
svcLetterParams = ('Best parameters for each svcCovType trials are' + json.dumps(svcLetterGrid1.best_params_)+', '+json.dumps(svcLetterGrid2.best_params_)+', '+json.dumps(svcLetterGrid3.best_params_))
svcParams = pd.DataFrame({'Best Parameters for svc' : [(svcCovTypeParams),(svcLetterParams),],'Algorithm/Dataset':['svcCovType','svcLetter']})
svcParams = svcParams.set_index('Algorithm/Dataset')

averagesvcCovType = (svcCovTypeGrid1.cv_results_['mean_test_score'].mean(axis=0)+svcCovTypeGrid2.cv_results_['mean_test_score'].mean(axis=0)+svcCovTypeGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
averagesvcLetter = (svcLetterGrid1.cv_results_['mean_test_score'].mean(axis=0)+svcLetterGrid2.cv_results_['mean_test_score'].mean(axis=0)+svcLetterGrid3.cv_results_['mean_test_score'].mean(axis=0))/3
ExtraData = np.array([
             [svcCovTypeGrid1.cv_results_['mean_test_score'].mean(axis=0),svcCovTypeGrid2.cv_results_['mean_test_score'].mean(axis=0),svcCovTypeGrid3.cv_results_['mean_test_score'].mean(axis=0),averagesvcCovType]
            ,[svcLetterGrid1.cv_results_['mean_test_score'].mean(axis=0),svcLetterGrid2.cv_results_['mean_test_score'].mean(axis=0),svcLetterGrid3.cv_results_['mean_test_score'].mean(axis=0),averagesvcLetter]
            ])
ExtraTable = pd.DataFrame(ExtraData, columns=['1', '2', '3','AverageAmongstAllTrials'])
ExtraTable['TypeOfAlgo/Data'] = ['svcCovTypeTest','svcLetterTest']
ExtraTable = ExtraTable.set_index('TypeOfAlgo/Data')

In [27]:
display(ExtraTable)

Unnamed: 0_level_0,1,2,3,AverageAmongstAllTrials
TypeOfAlgo/Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
svcCovTypeTest,0.92652,0.901553,0.928627,0.9189
svcLetterTest,0.820147,0.820627,0.82722,0.822664


In [28]:
display(svcParams)

Unnamed: 0_level_0,Best Parameters for svc
Algorithm/Dataset,Unnamed: 1_level_1
svcCovType,"Best parameters for each svcCovType trials are{""C"": 100, ""gamma"": 2, ""kernel"": ""rbf""}, {""C"": 5, ""gamma"": 1, ""kernel"": ""rbf""}, {""C"": 100, ""gamma"": 1, ""kernel"": ""rbf""}"
svcLetter,"Best parameters for each svcCovType trials are{""C"": 20, ""gamma"": 3, ""kernel"": ""rbf""}, {""C"": 20, ""gamma"": 3, ""kernel"": ""rbf""}, {""C"": 20, ""gamma"": 3, ""kernel"": ""rbf""}"
