<a href="https://colab.research.google.com/github/Niveditathakur/Classification/blob/master/Classification_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Load all the libraries 

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing 
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder,  StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from keras.utils import np_utils
# Use scikit-learn to grid search the batch size and epochs
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
#ensemble classsifier
from mlxtend.classifier import EnsembleVoteClassifier

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Conv1D, BatchNormalization, MaxPooling1D, Activation, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

import tensorflow as tf
import keras

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [0]:
# number of folds required for cross validataion
num_folds = 3

        
#classifiers used -list of all the classifiers used
classifiers = [
    KNeighborsClassifier(5),
    SVC(gamma='scale',kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(penalty = 'l2', C = 10, random_state=0),
    RidgeClassifier()]


#min max scalar - normalize the features to range -1 and 1 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))


#Fxn to find confidence interval for cv classification
def CI_find(acc, alpha=0.95):
    # confidence intervals alpha = 0.95
    
    p         =   ((1.0-alpha)/2.0) * 100
    lower     =   max(0.0, np.percentile(acc, p))
    p         =   (alpha+((1.0-alpha)/2.0)) * 100
    upper     =   min(1.0, np.percentile(acc, p))
    acc_std   =   ((upper - lower) / 2)*100

    return( np.mean(acc)*100, acc_std )


# Function to do  cross validation
def Fxn_CV(Input, Output): 
    
    # scale the input 
    x_scaled = min_max_scaler.fit_transform(Input) 
    
    # Logging for Visual Comparison use a  pandas dataframe 
    log_cols=["Classifier Name","Acc (MS)", "Acc-SD (MS)", "AUROC (MS)", "AUROC-SD (MS)" ,
                                "Acc", "Acc-SD", "AUROC", "AUROC-SD"] 
    log = pd.DataFrame(columns=log_cols)
    
    #KFolds using sklearn
    kfold = model_selection.KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    
    
    for clf in classifiers:
        name     = clf.__class__.__name__
        
        # use sklearn' in build cross_val_score for cross validation
        #use accuracy as metric for performance
        acc_res  = model_selection.cross_val_score(clf, Input, Output, cv=kfold, 
                                                   scoring='accuracy')
        
        #use Area under roc curve as metric for performance
        auc_res  = model_selection.cross_val_score(clf,  Input, Output, cv=kfold, 
                                                  scoring='roc_auc') 
        
        #create empty lists for Acc and AUROC derived using only kfold
        score = list()
        acc   = list()
        
        for train_index, test_index in kfold.split(Input):
          
            X_train, X_test    = Input[ train_index, :], Input[test_index,:]
            y_train, y_test    = Output[train_index],    Output[test_index]
            
            clf                = clf.fit(X_train, y_train)

            y_pred             = clf.predict(X_test)
            acc1               = accuracy_score(y_test, y_pred)
            acc.append(acc1)
             
            tt           = y_pred.astype(int)
            org          = y_test.astype(int) 
            
            #one hot encoding for AUROC
            one_hot_pred = np_utils.to_categorical(tt) ;
            one_hot_org  = np_utils.to_categorical(org) ;
            
            score.append(roc_auc_score(one_hot_org, one_hot_pred))
        

        acc_mean, acc_std     = CI_find(acc, alpha=0.95)
        auroc_mean, auroc_std = CI_find(score, alpha=0.95)
        
        
        #enter the data to the pandas dataframe
        log_entry  =  pd.DataFrame([[name, acc_res.mean()*100, acc_res.std()*100, 
                                     auc_res.mean(),  auc_res.std(), 
                                     acc_mean, acc_std, 
                                     auroc_mean/100, auroc_std/100 ]], columns=log_cols)        
        log        =  log.append(log_entry)

    return(log)

In [0]:
#load the breast cancer data - two class classification problem

from sklearn.datasets import load_breast_cancer
cancer_data = load_breast_cancer()

data = cancer_data.data   # features 
labels = cancer_data.target # labels 

log = Fxn_CV(data, labels)
print(log)

                 Classifier Name   Acc (MS)  Acc-SD (MS)  AUROC (MS)  \
0           KNeighborsClassifier  92.618583     0.859667    0.953154   
0                            SVC  62.733686     4.251442    0.970136   
0         DecisionTreeClassifier  91.394226     2.430917    0.924584   
0             AdaBoostClassifier  95.958415     1.311157    0.990327   
0         RandomForestClassifier  94.902998     1.083060    0.984464   
0     GradientBoostingClassifier  95.960271     1.622955    0.992257   
0                     GaussianNB  93.846654     1.393031    0.986492   
0     LinearDiscriminantAnalysis  95.254804     0.011815    0.990850   
0  QuadraticDiscriminantAnalysis  95.607537     0.487026    0.991087   
0             LogisticRegression  94.903926     1.310681    0.990712   
0                RidgeClassifier  94.905783     0.983241    0.990913   

   AUROC-SD (MS)        Acc    Acc-SD     AUROC  AUROC-SD  
0       0.004775  92.618583  1.000000  0.915940  0.009094  
0       0.00759

In [0]:
#use ensemble of classifiers

clf1  = LogisticRegression(random_state=0)
clf2  = RandomForestClassifier(random_state=0)
clf3  = GradientBoostingClassifier()
clf4  = AdaBoostClassifier()


eclf1 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[2, 1, 1, 2], voting='soft')


classifiers = [ eclf1]
log = Fxn_CV(data, labels)
print(log)

          Classifier Name   Acc (MS)  Acc-SD (MS)  AUROC (MS)  AUROC-SD (MS)  \
0  EnsembleVoteClassifier  96.661097     1.080601    0.994579        0.00284   

         Acc  Acc-SD     AUROC  AUROC-SD  
0  96.661097    1.25  0.958589     0.011  
