# Classification models and cross validation
This section is focused on the following points:
- Reading the data from the file
- Train the models and test them with a Stratified Cross-Validation
- Train the models and test them with a Normal Cross-Validation
- Train the models and test them with a Stratified Cross-Validation with Standardized Data
- Train the models and test them with a Normal Cross-Validation with Standardized Data
- Train the models and test them with a Stratified Cross-Validation with PCA transformed data
- Train the models and test them with a Normal Cross-Validation with PCA transformed data
    

### Imports section

In [1]:
# Imports section
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

### Reading he data from the file

In [2]:
# Read the file in the format given
text=""
nLines=0
nAttributes=0
nClasses=0
attributesName = []
data = []
dataPerAttribute = []
classes = []
try:
    with open("Slice409.txt","r") as file:
        count=0
        for line in file:
            if count < 3:
                if count == 0:
                    nLines = int(line.strip())
                else:
                    if count == 1:
                        nAttributes = int(line.strip())
                        for i in range(1,nAttributes+1):
                            attributesName.append("att"+str(i))
                            dataPerAttribute.append([])
                    else:
                        if count == 2:
                            nClasses = int(line.strip())
            else:
                split_string_S = line.strip().split(',')
                
                count_split = 0
                split_string_n = []
                for split in split_string_S:
                    if count_split >= nAttributes:
                        split_string_n.append(int(split))
                    else:
                        split_string_n.append(float(split))
                    count_split += 1
                    
                data.append(split_string_n)
                classes.append(split_string_n[nAttributes])
                n_attribute = 0
                for attribute in split_string_n:
                    if n_attribute >= nAttributes:
                        break
                    else:
                        dataPerAttribute[n_attribute].append(attribute)
                        n_attribute += 1
            count += 1
        print("EOF reached")
except FileNotFoundError:
    text="Archivo no existe"
    exit()
finally:
    file.close()
    # print(str(nLines) + "\n")
    # print(str(nAttributes) + "\n")
    # print(str(nClasses) + "\n")
    # print(attributesName)
    # print(data)

EOF reached


### Printing the readed DataFrame

In [3]:
# Turning the data into a DataFrame python object
columns_ = attributesName[:]
columns_.append("target")

dataFrame = pd.DataFrame(data=data, columns=columns_)
dataFrame

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att377,att378,att379,att380,att381,att382,att383,att384,att385,target
0,0.622490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.00,0.00,27.937052,4
1,0.630137,0.400524,0.421793,0.850483,0.734300,0.000000,0.000000,-0.250000,-0.250000,-0.25,...,0.0,0.97244,0.995384,0.997934,0.000000,0.0,-0.25,-0.25,11.468284,4
2,0.657718,0.773077,0.850657,0.753205,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25,...,0.0,0.95305,0.986386,0.993740,0.000000,0.0,-0.25,-0.25,13.492684,4
3,0.683544,0.573705,0.323887,0.824176,0.728665,0.000000,0.000000,-0.250000,-0.250000,-0.25,...,0.0,0.00000,0.995114,0.997632,0.000000,0.0,-0.25,-0.25,11.413571,4
4,0.000000,0.000000,0.000000,0.000000,0.196364,0.947141,0.000000,-0.250000,-0.250000,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,-0.25,-0.25,9.939380,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,0.142292,0.299413,0.000000,0.000000,0.000000,0.790510,0.910816,0.927898,0.972053,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.00,0.00,29.476450,0
405,0.142857,0.000000,0.000000,0.000000,0.000000,0.581470,0.977459,0.779539,0.000000,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.00,0.00,28.895193,0
406,0.153846,0.334586,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.250000,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,-0.25,-0.25,22.210732,0
407,0.154762,0.000000,0.000000,0.000000,0.000000,0.000000,0.641105,0.000000,0.000000,-0.25,...,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.00,0.00,28.488313,0


In [4]:
# Separating out the features
x = dataFrame.loc[:, attributesName].values
# Separating out the target
y = dataFrame.loc[:,['target']].values


### Train the models and test them with a Stratified Cross-Validation
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Stratified Cross-Validation.
The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [5]:
def get_score(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train.ravel())
    return model.score(x_test,y_test)
folds = int(input("Enter number of K-Folds for Cross Validation: "))
neighbors = int(input("Enter value of K-Nearest Neighbors: "))
kk = KNeighborsClassifier(n_neighbors=neighbors)
nb = GaussianNB()
hLayers = int(input("Enter number of Hidden Layers for the Neural Network: "))
layersSize=[]
for i in range(1,hLayers+1):
    layersSize.append(int(input("Number of perceptrons for the "+str(i)+"-th layer: ")))
activationFunction=input("activation{identity, logistic, tanh, relu}: ")
epochs = int(input("Number of iterations for the Neural Network: "))
moment = float(input("Momentum for the Neural Network[0-1]: "))
nn = MLPClassifier(hidden_layer_sizes=layersSize,solver='sgd',activation=activationFunction,max_iter=epochs,momentum=moment)
scores_KK = []
scores_NB = []
scores_NN = []
kk_df = []
nb_df = []
nn_df = []
avg_df = []
split_names = []
i=1
KFolds = StratifiedKFold(n_splits=folds)
for train_index,test_index in KFolds.split(X=x,y=y):
    X_train,X_test,y_train,y_test = x[train_index],x[test_index],y[train_index],y[test_index]
    scores_KK.append(get_score(kk,X_train,X_test,y_train,y_test))
    scores_NB.append(get_score(nb,X_train,X_test,y_train,y_test))
    scores_NN.append(get_score(nn,X_train,X_test,y_train,y_test))
    split_names.append("Fold "+str(i))
    i += 1
print("---------Stratified Cross Validation---------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df = pd.DataFrame(scores_KK,
                   columns=['Stratified Cross Validation'],
                     index=split_names)
nb_df = pd.DataFrame(scores_NB,
                   columns=['Stratified Cross Validation'],
                     index=split_names)
nn_df = pd.DataFrame(scores_NN,
                   columns=['Stratified Cross Validation'],
                     index=split_names)
strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df = pd.DataFrame(strat_avg,
                      columns=['Stratified Cross Validation'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
print("----Cross Validation Average----")
avg_df


Enter number of K-Folds for Cross Validation: 5
Enter value of K-Nearest Neighbors: 5
Enter number of Hidden Layers for the Neural Network: 3
Number of perceptrons for the 1-th layer: 10
Number of perceptrons for the 2-th layer: 15
Number of perceptrons for the 3-th layer: 10
activation{identity, logistic, tanh, relu}: tanh
Number of iterations for the Neural Network: 3000
Momentum for the Neural Network[0-1]: 0.8
---------Stratified Cross Validation---------
K-Nearest Neighbors
[0.8536585365853658, 0.9878048780487805, 1.0, 0.9512195121951219, 0.5555555555555556]
Naive Bayer
[0.8170731707317073, 0.7073170731707317, 0.8292682926829268, 0.7926829268292683, 0.8271604938271605]
Neural Network
[0.926829268292683, 0.9390243902439024, 0.975609756097561, 0.9634146341463414, 0.5802469135802469]
----Cross Validation Average----


Unnamed: 0,Stratified Cross Validation
K-Neighbors,0.869648
Naive-Bayes,0.7947
Neural Network,0.877025


### Train the models and test them with a Normal Cross-Validation
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Normal Cross-Validation.
The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [6]:
scores_KK = []
scores_NB = []
scores_NN = []
scores_KK = cross_val_score(kk, x, y.ravel(), cv=folds)
scores_NB = cross_val_score(nb, x, y.ravel(), cv=folds)
scores_NN = cross_val_score(nn, x, y.ravel(), cv=folds)
print("------Normal Cross Validation-------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df_aux = pd.DataFrame(scores_KK,
                   columns=['Normal Cross Validation'],
                         index=split_names)
kk_df_result = pd.concat([kk_df_aux,kk_df],axis=1)
kk_df = kk_df_result


nb_df_aux = pd.DataFrame(scores_NB,
                   columns=['Normal Cross Validation'],
                     index=split_names)
nb_df_result = pd.concat([nb_df_aux,nb_df],axis=1)
nb_df = nb_df_result


nn_df_aux = pd.DataFrame(scores_NN,
                   columns=['Normal Cross Validation'],
                         index=split_names)
nn_df_result = pd.concat([nn_df_aux,nn_df],axis=1)
nn_df = nn_df_result

strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df_aux = pd.DataFrame(strat_avg,
                      columns=['Normal Cross Validation'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
avg_df_result = pd.concat([avg_df_aux,avg_df],axis=1)
avg_df=avg_df_result
print("----Cross Validation Average----")
avg_df

------Normal Cross Validation-------
K-Nearest Neighbors
[0.85365854 0.98780488 1.         0.95121951 0.55555556]
Naive Bayer
[0.81707317 0.70731707 0.82926829 0.79268293 0.82716049]
Neural Network
[0.92682927 0.93902439 0.98780488 0.93902439 0.56790123]
----Cross Validation Average----


Unnamed: 0,Normal Cross Validation,Stratified Cross Validation
K-Neighbors,0.869648,0.869648
Naive-Bayes,0.7947,0.7947
Neural Network,0.872117,0.877025


### Train the models and test them with a Stratified Cross-Validation with Standardized Data
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Stratified Cross-Validation with Standardized Data.
The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [7]:
# Standardizing the features
x = StandardScaler().fit_transform(x)
scores_KK = []
scores_NB = []
scores_NN = []
KFolds = StratifiedKFold(n_splits=folds)
for train_index,test_index in KFolds.split(X=x,y=y):
    X_train,X_test,y_train,y_test = x[train_index],x[test_index],y[train_index],y[test_index]
    scores_KK.append(get_score(kk,X_train,X_test,y_train,y_test))
    scores_NB.append(get_score(nb,X_train,X_test,y_train,y_test))
    scores_NN.append(get_score(nn,X_train,X_test,y_train,y_test))
print("------Stratified Cross Validation with Standardized data-------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df_aux = pd.DataFrame(scores_KK,
                   columns=['Stratified Cross Validation w/SD'],
                         index=split_names)
kk_df_result = pd.concat([kk_df_aux,kk_df],axis=1)
kk_df = kk_df_result


nb_df_aux = pd.DataFrame(scores_NB,
                   columns=['Stratified Cross Validation w/SD'],
                     index=split_names)
nb_df_result = pd.concat([nb_df_aux,nb_df],axis=1)
nb_df = nb_df_result


nn_df_aux = pd.DataFrame(scores_NN,
                   columns=['Stratified Cross Validation w/SD'],
                         index=split_names)
nn_df_result = pd.concat([nn_df_aux,nn_df],axis=1)
nn_df = nn_df_result

strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df_aux = pd.DataFrame(strat_avg,
                      columns=['Stratified Cross Validation w/SD'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
avg_df_result = pd.concat([avg_df_aux,avg_df],axis=1)
avg_df=avg_df_result
print("----Stratified Cross Validation w/SD average----")
avg_df

------Stratified Cross Validation with Standardized data-------
K-Nearest Neighbors
[0.8536585365853658, 1.0, 0.9878048780487805, 0.9390243902439024, 0.5679012345679012]
Naive Bayer
[0.8170731707317073, 0.7439024390243902, 0.8292682926829268, 0.7926829268292683, 0.8148148148148148]
Neural Network
[0.8902439024390244, 0.8902439024390244, 0.9878048780487805, 0.9512195121951219, 0.5802469135802469]
----Stratified Cross Validation w/SD average----


Unnamed: 0,Stratified Cross Validation w/SD,Normal Cross Validation,Stratified Cross Validation
K-Neighbors,0.869678,0.869648,0.869648
Naive-Bayes,0.799548,0.7947,0.7947
Neural Network,0.859952,0.872117,0.877025


### Train the models and test them with a Normal Cross-Validation with Standardized Data
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Normal Cross-Validation with Standardized Data.
The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [8]:
scores_KK = []
scores_NB = []
scores_NN = []
scores_KK = cross_val_score(kk, x, y.ravel(), cv=folds)
scores_NB = cross_val_score(nb, x, y.ravel(), cv=folds)
scores_NN = cross_val_score(nn, x, y.ravel(), cv=folds)
print("------Normal Cross Validation with Standardized data-------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df_aux = pd.DataFrame(scores_KK,
                   columns=['Normal Cross Validation w/SD'],
                         index=split_names)
kk_df_result = pd.concat([kk_df_aux,kk_df],axis=1)
kk_df = kk_df_result


nb_df_aux = pd.DataFrame(scores_NB,
                   columns=['Normal Cross Validation w/SD'],
                     index=split_names)
nb_df_result = pd.concat([nb_df_aux,nb_df],axis=1)
nb_df = nb_df_result


nn_df_aux = pd.DataFrame(scores_NN,
                   columns=['Normal Cross Validation w/SD'],
                         index=split_names)
nn_df_result = pd.concat([nn_df_aux,nn_df],axis=1)
nn_df = nn_df_result

strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df_aux = pd.DataFrame(strat_avg,
                      columns=['Normal Cross Validation w/SD'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
avg_df_result = pd.concat([avg_df_aux,avg_df],axis=1)
avg_df=avg_df_result
print("----Normal Cross Validation w/SD average----")
avg_df

------Normal Cross Validation with Standardized data-------
K-Nearest Neighbors
[0.85365854 1.         0.98780488 0.93902439 0.56790123]
Naive Bayer
[0.81707317 0.74390244 0.82926829 0.79268293 0.81481481]
Neural Network
[0.93902439 0.91463415 0.98780488 0.92682927 0.58024691]
----Normal Cross Validation w/SD average----


Unnamed: 0,Normal Cross Validation w/SD,Stratified Cross Validation w/SD,Normal Cross Validation,Stratified Cross Validation
K-Neighbors,0.869678,0.869678,0.869648,0.869648
Naive-Bayes,0.799548,0.799548,0.7947,0.7947
Neural Network,0.869708,0.859952,0.872117,0.877025


### Applying PCA to the orignal dataset

In [9]:
# PCA
pcn = int(input('Number of principal components:'))
if pcn > nAttributes:
    print(str(nAttributes) + " is the maximum number of principal components")
    pcn = min([nAttributes,pcn])
else:
    if pcn <= 0:
        print("1 is the minimum number of principal components")
        pcn = max([1,pcn])
        
print ("Creating PCA with " + str(pcn) + " components")
pca = PCA(n_components=pcn)
principalComponents = pca.fit_transform(x)
principalComponents

Number of principal components:30
Creating PCA with 30 components


array([[-3.82657033,  9.14916927, -1.19006751, ...,  0.09687072,
        -0.43165342,  1.33453662],
       [ 5.86754502, -3.93589181, 10.39285271, ...,  0.55656731,
        -0.07877821,  1.14493696],
       [ 5.7942836 , -5.28834434, 18.5209348 , ..., -3.60607757,
        -0.0330234 ,  0.28762829],
       ...,
       [10.50845397, -3.05413093, -5.72717923, ..., -0.59308823,
        -0.52393668, -0.34698198],
       [-7.00235438, 11.06483274,  2.80468752, ..., -1.61105123,
         5.06067893, -0.56502414],
       [ 3.27860785, -5.91037113, 16.12292689, ...,  0.38654337,
         0.32496377,  0.8102596 ]])

### Train the models and test them with a Stratified Cross-Validation with PCA transformed data
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Stratified Cross-Validation with PCA transformed data. The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [10]:
scores_KK = []
scores_NB = []
scores_NN = []
KFolds = StratifiedKFold(n_splits=folds)
for train_index,test_index in KFolds.split(X=x,y=y):
    X_train,X_test,y_train,y_test = principalComponents[train_index],principalComponents[test_index],y[train_index],y[test_index]
    scores_KK.append(get_score(kk,X_train,X_test,y_train,y_test))
    scores_NB.append(get_score(nb,X_train,X_test,y_train,y_test))
    scores_NN.append(get_score(nn,X_train,X_test,y_train,y_test))
print("------Stratified Cross Validation with PCA-------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df_aux = pd.DataFrame(scores_KK,
                   columns=['Stratified Cross Validation with PCA'],
                         index=split_names)
kk_df_result = pd.concat([kk_df_aux,kk_df],axis=1)
kk_df = kk_df_result


nb_df_aux = pd.DataFrame(scores_NB,
                   columns=['Stratified Cross Validation with PCA'],
                     index=split_names)
nb_df_result = pd.concat([nb_df_aux,nb_df],axis=1)
nb_df = nb_df_result


nn_df_aux = pd.DataFrame(scores_NN,
                   columns=['Stratified Cross Validation with PCA'],
                         index=split_names)
nn_df_result = pd.concat([nn_df_aux,nn_df],axis=1)
nn_df = nn_df_result

strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df_aux = pd.DataFrame(strat_avg,
                      columns=['Stratified Cross Validation with PCA'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
avg_df_result = pd.concat([avg_df_aux,avg_df],axis=1)
avg_df=avg_df_result
print("----Stratified Cross Validation with PCA average----")
avg_df

------Stratified Cross Validation with PCA-------
K-Nearest Neighbors
[0.8780487804878049, 1.0, 1.0, 0.9390243902439024, 0.5555555555555556]
Naive Bayer
[0.926829268292683, 0.9878048780487805, 1.0, 0.8170731707317073, 0.5679012345679012]
Neural Network
[0.8414634146341463, 0.8414634146341463, 0.975609756097561, 0.9146341463414634, 0.5925925925925926]
----Stratified Cross Validation with PCA average----


Unnamed: 0,Stratified Cross Validation with PCA,Normal Cross Validation w/SD,Stratified Cross Validation w/SD,Normal Cross Validation,Stratified Cross Validation
K-Neighbors,0.874526,0.869678,0.869678,0.869648,0.869648
Naive-Bayes,0.859922,0.799548,0.799548,0.7947,0.7947
Neural Network,0.833153,0.869708,0.859952,0.872117,0.877025


### Train the models and test them with a Normal Cross-Validation with PCA transformed data
Here we train our three classification models: Naive - Bayes, K - Nearest Neighbors and Neural Network and test them with a Normal Cross-Validation with PCA transformed data. The printed results are the accuracy scores in every fold for every classification model and then a table with the average accuracy score for every classification model

In [11]:
scores_KK = []
scores_NB = []
scores_NN = []
scores_KK = cross_val_score(kk, principalComponents, y.ravel(), cv=folds)
scores_NB = cross_val_score(nb, principalComponents, y.ravel(), cv=folds)
scores_NN = cross_val_score(nn, principalComponents, y.ravel(), cv=folds)
print("------Normal Cross Validation with PCA-------")
print("K-Nearest Neighbors")
print(scores_KK)
print("Naive Bayer")
print(scores_NB)
print("Neural Network")
print(scores_NN)
kk_df_aux = pd.DataFrame(scores_KK,
                   columns=['Normal Cross Validation with PCA'],
                         index=split_names)
kk_df_result = pd.concat([kk_df_aux,kk_df],axis=1)
kk_df = kk_df_result


nb_df_aux = pd.DataFrame(scores_NB,
                   columns=['Normal Cross Validation with PCA'],
                     index=split_names)
nb_df_result = pd.concat([nb_df_aux,nb_df],axis=1)
nb_df = nb_df_result


nn_df_aux = pd.DataFrame(scores_NN,
                   columns=['Normal Cross Validation with PCA'],
                         index=split_names)
nn_df_result = pd.concat([nn_df_aux,nn_df],axis=1)
nn_df = nn_df_result

strat_avg = [np.mean(scores_KK),
             np.mean(scores_NB),
             np.mean(scores_NN)]
avg_df_aux = pd.DataFrame(strat_avg,
                      columns=['Normal Cross Validation with PCA'],index=['K-Neighbors','Naive-Bayes','Neural Network'])
avg_df_result = pd.concat([avg_df_aux,avg_df],axis=1)
avg_df=avg_df_result
print("----Normal Cross Validation with PCA average----")
avg_df

------Normal Cross Validation with PCA-------
K-Nearest Neighbors
[0.87804878 1.         1.         0.93902439 0.55555556]
Naive Bayer
[0.92682927 0.98780488 1.         0.81707317 0.56790123]
Neural Network
[0.87804878 0.91463415 0.90243902 0.90243902 0.54320988]
----Normal Cross Validation with PCA average----


Unnamed: 0,Normal Cross Validation with PCA,Stratified Cross Validation with PCA,Normal Cross Validation w/SD,Stratified Cross Validation w/SD,Normal Cross Validation,Stratified Cross Validation
K-Neighbors,0.874526,0.874526,0.869678,0.869678,0.869648,0.869648
Naive-Bayes,0.859922,0.859922,0.799548,0.799548,0.7947,0.7947
Neural Network,0.828154,0.833153,0.869708,0.859952,0.872117,0.877025
