In [1]:
# Imports section
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import statistics 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss


In [2]:
class Ensemble:
    """Voting ensemble (k-NN, Gaussian Naive Bayes, MLP) on PCA-reduced data.

    Typical call order:
        readFile -> transformData -> buildModels -> getScoreModels
        -> buildEnsemble -> trainEnsemble

    NOTE(review): transformData fits the scaler and the PCA on the complete
    data set, and the cross-validation in getScoreModels/trainEnsemble then
    runs on those already-transformed features, so the held-out folds have
    influenced the preprocessing. Wrap the steps in a Pipeline if unbiased
    estimates are required.
    """

    def __init__(self):
        """Initialise every attribute with its default/empty value."""
        self.dataFrame = []
        self.nPC = 2
        self.nKNN = 2
        self.nKFolds = 5
        self.hiddenLayers = [10,15,10]
        self.activFunction = "tanh"
        self.nIterations = 2500
        self.momentum = 0.9
        self.randomState = None
        self.x = []
        self.y = []
        self.principalComponents = []
        self.text=""
        self.nLines=0
        self.nAttributes=0
        self.nClasses=0
        self.attributesName = []
        self.data = []
        self.dataPerAttribute = []
        self.classes = []
        self.principalDf = []
        self.kn = []
        self.nb = []
        self.nn = []
        # scores_KK is never written by this class; it is kept only so that
        # external code reading the attribute keeps working.
        self.scores_KK = []
        self.scores_KN = []
        self.scores_NB = []
        self.scores_NN = []
        self.split_names = []
        self.model_names =['K-Nearest Neighbors','Naive Bayes','Neural Network']
        self.eclf = []
        self.eclf_scores = []

    def setDF(self,dataFrame):
        """Store ``dataFrame`` as the working data set."""
        self.dataFrame = dataFrame

    def _foldNames(self):
        """Return the row labels "Fold 1" .. "Fold <nKFolds>"."""
        return ["Fold "+str(i) for i in range(1,self.nKFolds+1)]

    def readFile(self,filename):
        """Parse ``filename`` and store its content as a DataFrame.

        Expected layout: line 1 = number of rows, line 2 = number of
        attributes, line 3 = number of classes, then one comma-separated
        record per line. The first ``nAttributes`` fields of a record are
        parsed as floats, every remaining field as an int; the field at
        index ``nAttributes`` is taken as the class label.

        Blank data lines (for example a trailing newline) are skipped.

        Raises:
            FileNotFoundError: if ``filename`` does not exist. The message
                "Archivo no existe" is stored in ``self.text`` first.
            ValueError: if a field cannot be converted to a number.
        """
        # Reset everything a previous call may have filled in.
        self.text=""
        self.nLines=0
        self.nAttributes=0
        self.nClasses=0
        self.attributesName = []
        self.data = []
        self.dataPerAttribute = []
        self.classes = []
        try:
            # The context manager closes the file on every path; no explicit
            # close() is needed (and none is possible when open() fails).
            with open(filename,"r") as file:
                for count, line in enumerate(file):
                    stripped = line.strip()
                    if count == 0:
                        self.nLines = int(stripped)
                    elif count == 1:
                        self.nAttributes = int(stripped)
                        for i in range(1,self.nAttributes+1):
                            self.attributesName.append("att"+str(i))
                            self.dataPerAttribute.append([])
                    elif count == 2:
                        self.nClasses = int(stripped)
                    else:
                        if not stripped:
                            # Tolerate empty lines between/after records.
                            continue
                        row = [float(field) if position < self.nAttributes else int(field)
                               for position, field in enumerate(stripped.split(','))]
                        self.data.append(row)
                        self.classes.append(row[self.nAttributes])
                        # zip stops after nAttributes columns, so the label
                        # is not copied into the per-attribute lists.
                        for column, value in zip(self.dataPerAttribute, row):
                            column.append(value)
        except FileNotFoundError:
            # Record the message and let the caller decide what to do;
            # terminating the interpreter from a library method is too drastic.
            self.text = "Archivo no existe"
            raise
        print("EOF reached")
        # Turning the data into a DataFrame python object
        columns_ = self.attributesName[:]
        columns_.append("target")
        self.setDF(pd.DataFrame(data=self.data, columns=columns_))

    def transformData(self,nPC):
        """Standardise the attributes and project them with PCA.

        ``nPC`` is clamped to the range [1, nAttributes]; a message is
        printed whenever clamping happens. Results are stored in
        ``self.x`` (standardised features), ``self.y`` (target column),
        ``self.principalComponents`` and ``self.principalDf``.
        """
        self.nPC = nPC
        if self.nPC > self.nAttributes:
            print(str(self.nAttributes) + " is the maximum number of principal components")
            self.nPC = self.nAttributes
        elif self.nPC <= 0:
            print("1 is the minimum number of principal components")
            self.nPC = 1

        # Separating out the features
        self.x = self.dataFrame.loc[:, self.attributesName].values
        # Separating out the target (kept 2-D; callers ravel() it)
        self.y = self.dataFrame.loc[:,['target']].values
        # Standardizing the features
        self.x = StandardScaler().fit_transform(self.x)

        print ("Creating PCA with " + str(self.nPC) + " components")
        pca = PCA(n_components=self.nPC)
        self.principalComponents = pca.fit_transform(self.x)
        pc_names = ["principal component "+ str(i) for i in range(1,self.nPC+1)]

        self.principalDf = pd.DataFrame(data = self.principalComponents
                     , columns = pc_names)

    def buildModels(self,nKFolds=5,nKNN=5,hiddenLayers=None,activFunction='tanh',nIterations=2500,momentum=0.9,randomState=None):
        """Create the three base classifiers.

        Args:
            nKFolds: number of cross-validation folds used later.
            nKNN: ``n_neighbors`` of the k-NN classifier.
            hiddenLayers: hidden layer sizes of the MLP; ``None`` selects
                [10, 15, 10] (a fresh list per call, so the default is
                never shared between instances).
            activFunction: MLP activation function.
            nIterations: MLP ``max_iter``.
            momentum: MLP SGD momentum.
            randomState: forwarded to the MLP as ``random_state``; pass an
                int to get repeatable scores. ``None`` keeps the library
                default.
        """
        if hiddenLayers is None:
            hiddenLayers = [10,15,10]
        #Building models for the Ensemble
        self.nKFolds = nKFolds
        self.nKNN = nKNN
        self.kn = KNeighborsClassifier(n_neighbors=nKNN)
        self.nb = GaussianNB()
        self.hiddenLayers = hiddenLayers
        self.activFunction = activFunction
        self.nIterations = nIterations
        self.momentum = momentum
        self.randomState = randomState
        self.nn = MLPClassifier(hidden_layer_sizes=self.hiddenLayers,
                                solver='sgd',
                                activation=self.activFunction,
                                max_iter=self.nIterations,
                                momentum=self.momentum,
                                random_state=self.randomState)

    def getScoreModels(self):
        """Cross-validate each base model on the principal components.

        Returns:
            Four DataFrames: per-fold scores of k-NN, Naive Bayes and the
            neural network, followed by the mean score of each model.
        """
        target = self.y.ravel()
        self.scores_KN = cross_val_score(self.kn, self.principalComponents, target, cv=self.nKFolds)
        self.scores_NB = cross_val_score(self.nb, self.principalComponents, target, cv=self.nKFolds)
        self.scores_NN = cross_val_score(self.nn, self.principalComponents, target, cv=self.nKFolds)
        self.split_names = self._foldNames()

        kk_df = pd.DataFrame(self.scores_KN,
                           columns=['K-Nearest Neighbors'],
                             index=self.split_names)
        nb_df = pd.DataFrame(self.scores_NB,
                           columns=['Naive Bayes'],
                             index=self.split_names)
        nn_df = pd.DataFrame(self.scores_NN,
                           columns=['Neural Network'],
                             index=self.split_names)
        avg_models = [np.mean(self.scores_KN),
                     np.mean(self.scores_NB),
                     np.mean(self.scores_NN)]
        avg_df = pd.DataFrame(avg_models,
                              columns=['Normal Cross Validation Average'],index=self.model_names)
        return kk_df,nb_df,nn_df,avg_df

    def buildEnsemble(self,vote='hard'):
        """Combine the three base models in a VotingClassifier."""
        self.eclf = VotingClassifier(estimators=[('kn', self.kn),
                                                 ('nb', self.nb),
                                                 ('nnet', self.nn)],
                                     voting=vote)

    def trainEnsemble(self):
        """Cross-validate the ensemble (accuracy).

        Returns:
            (DataFrame of per-fold scores, mean score, standard deviation).
        """
        self.eclf_scores = cross_val_score(self.eclf,
                                           self.principalComponents,
                                           self.y.ravel(),
                                           scoring='accuracy',
                                           cv=self.nKFolds)
        # Derive the labels here so this method also works when
        # getScoreModels() has not been called beforehand.
        self.split_names = self._foldNames()
        eclf_df = pd.DataFrame(self.eclf_scores,
                           columns=['Ensemble'],
                             index=self.split_names)
        return eclf_df,self.eclf_scores.mean(),self.eclf_scores.std()
# Notebook-level Ensemble instance, created with the defaults set in __init__.
basic_ensemble = Ensemble()

In [3]:
# Load the data set; the file must follow the header + CSV layout parsed by readFile.
basic_ensemble.readFile("archivo.txt")
# Request 8 principal components (transformData clamps this to the attribute count).
basic_ensemble.transformData(8)
# Build k-NN, Naive Bayes and the MLP with the default hyper-parameters.
basic_ensemble.buildModels()
# d1/d2/d3: per-fold scores of k-NN / Naive Bayes / neural network; a: mean score per model.
d1,d2,d3,a = basic_ensemble.getScoreModels()
# Default voting mode is 'hard'.
basic_ensemble.buildEnsemble()
# Per-fold ensemble accuracy plus its mean and standard deviation.
e_df,e_mean,e_std = basic_ensemble.trainEnsemble()
# Last expression of the cell: rendered as the cell output.
e_df

EOF reached
Creating PCA with 8 components


Unnamed: 0,Ensemble
Fold 1,0.857143
Fold 2,0.916667
Fold 3,0.928571
Fold 4,0.916667
Fold 5,0.904762
