In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [3]:
# define the random seed
global seed
seed = 43

load the dataset fs

In [4]:
# get the data from Feature-Selector
X = np.loadtxt('../dataset/matrix/X_fs.csv')
Y = np.loadtxt('../dataset/matrix/Y_str.csv')

In [5]:
# build the dataset dict
dataset_type = {'dataset': ['fs']}

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [12]:
# build the dict to save the result
confusion_matrix_dict = {}
roc_auc_score_dict = {}
f1_score_dict = {}
accuracy_dict = {}

Build Ensemble Models

In [20]:
# the realization of BaggingClassifier
def Bag_classifier(X_train, X_test, y_train, y_test):
    
    accuracy_svc = []
    confusion_matrix_svc = []
    roc_auc_score_svc = []
    f1_score_svc = []
    
    clf = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    accuracy = np.round(clf.score(X_test, y_test),6)
    roc_auc_score_value = np.round(roc_auc_score(y_test, y_pred),6)
    f1_score_value = np.round(f1_score(y_test, y_pred),6)
    
    print('The confusion matrix for BaggingClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for BaggingClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for BaggingClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [21]:
# create the list to save the different result from different dataset

value11, value12, value13, value14  = Bag_classifier(X_train, X_test, y_train, y_test)


confusion_matrix_dict['BaggingClassifier'] = value11
roc_auc_score_dict['BaggingClassifier'] = value12
f1_score_dict['BaggingClassifier'] = value13
accuracy_dict['BaggingClassifier'] = value14

The confusion matrix for BaggingClassifier model is [[26  3]
 [ 3 49]]

The roc_aux_score for BaggingClassifier model is 0.91943

The f1_score for BaggingClassifier model is 0.942308



In [22]:
# the realization of RandomForestClassifier
def RF_model(X_trian, X_test, y_train, y_test):
    clf = RandomForestClassifier(max_depth=2, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])

    accuracy = np.round(clf.score(X_test, y_test), 6)
    
    roc_auc_score_value = np.round(roc_auc_score(y_test, y_pred),6)
    f1_score_value = np.round(f1_score(y_test, y_pred),6)
    
    
    print('The confusion matrix for RandomForestClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for RandomForestClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for RandomForestClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [23]:
# create the list to save the different result from different dataset
value21, value22, value23, value24 = RF_model(X_train, X_test, y_train, y_test)
    
confusion_matrix_dict['RandomForestClassifier'] = value21
roc_auc_score_dict['RandomForestClassifier'] = value22
f1_score_dict['RandomForestClassifier'] = value23
accuracy_dict['RandomForestClassifier'] = value24


The confusion matrix for RandomForestClassifier model is [[25  4]
 [ 5 47]]

The roc_aux_score for RandomForestClassifier model is 0.882958

The f1_score for RandomForestClassifier model is 0.912621



In [24]:
# the realization of AdaBoostClassifier
def AdaBoost_model(X_trian, X_test, y_train, y_test):
    clf = AdaBoostClassifier(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = np.round(clf.score(X_test, y_test), 6)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = np.round(roc_auc_score(y_test, y_pred),6)
    f1_score_value = np.round(f1_score(y_test, y_pred),6)
        
    print('The confusion matrix for AdaBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for AdaBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for AdaBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [25]:
    
value31, value32, value33, value34 = RF_model(X_train, X_test, y_train, y_test)
    
confusion_matrix_dict['AdaBoostClassifier'] = value31
roc_auc_score_dict['AdaBoostClassifier'] = value32
f1_score_dict['AdaBoostClassifier'] = value33
accuracy_dict['AdaBoostClassifier'] = value34

The confusion matrix for RandomForestClassifier model is [[25  4]
 [ 5 47]]

The roc_aux_score for RandomForestClassifier model is 0.882958

The f1_score for RandomForestClassifier model is 0.912621



In [26]:
# the realization of GradientBoostingClassifier
def GradientBoost_model(X_trian, X_test, y_train, y_test):
    clf = GradientBoostingClassifier(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = np.round(clf.score(X_test, y_test),6)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = np.round(roc_auc_score(y_test, y_pred), 6)
    f1_score_value = np.round(f1_score(y_test, y_pred),6)
    
    print('The confusion matrix for GradientBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for GradientBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for GradientBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [27]:
value41, value42, value43, value44 = GradientBoost_model(X_train, X_test, y_train, y_test)

confusion_matrix_dict['GradientBoostingClassifier'] = value41
roc_auc_score_dict['GradientBoostingClassifier'] = value42
f1_score_dict['GradientBoostingClassifier'] = value43
accuracy_dict['GradientBoostingClassifier'] = value44   


The confusion matrix for GradientBoost_model model is [[26  3]
 [ 1 51]]

The roc_aux_score for GradientBoost_model model is 0.9386604774535809

The f1_score for GradientBoost_model model is 0.9622641509433962



Build the metrics dataframe for ensemble classifier

In [28]:
# 1. Confusion_Matrix
df_confusion_matrix = pd.DataFrame([confusion_matrix_dict], index=[0])
df_confusion_matrix['data type'] = dataset_type['dataset']
df_confusion_matrix.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fs,"[[26, 3], [3, 49]]","[[25, 4], [5, 47]]","[[25, 4], [5, 47]]","[[26, 3], [1, 51]]"


In [29]:
# 2. Roc_Auc_Score
df_roc_auc_score = pd.DataFrame([roc_auc_score_dict])
df_roc_auc_score['data type'] = dataset_type['dataset']
df_roc_auc_score.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fs,0.91943,0.882958,0.882958,0.93866


In [31]:
# 3. F1_Score
df_f1_score = pd.DataFrame([f1_score_dict])
df_f1_score['data type'] = dataset_type['dataset']
df_f1_score.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fs,0.942308,0.912621,0.912621,0.962264


In [32]:
# 4. Accuracy
df_accuracy = pd.DataFrame([accuracy_dict])
df_accuracy['data type'] = dataset_type['dataset']
df_accuracy.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fs,0.925926,0.888889,0.888889,0.950617


In [33]:
# combine all the matrix
df = pd.DataFrame(columns=df_accuracy.columns.values)
# df = df.append(df_confusion_matrix, ignore_index=True)
df = df.append(df_roc_auc_score, ignore_index=True)
df = df.append(df_f1_score, ignore_index=True)
df = df.append(df_accuracy, ignore_index=True)


In [35]:
df.index= list([ 'roc_auc_score', 'f1_score', 'accuracy'])
# with pd.option_context('display.max_rows', None):
    # print(df['SVM'])

In [36]:
import dataframe_image as dfi
dfi.export(df.drop('data type', axis=1), 'matrix.png')