In [1]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV, SGDClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np

In [2]:
# import the evaluation libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
# import the processing libraries
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [3]:
# define the random seed
global seed
seed = 43

load the dataset fs

In [4]:
# get the data from Feature-Selector
X = np.loadtxt('../dataset/matrix/X_fs.csv')
Y = np.loadtxt('../dataset/matrix/Y_str.csv')


In [5]:
Y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [11]:
# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    

In [12]:
# build the dict to save evaluation results
confusion_matrix_dict = {}
roc_auc_score_dict = {}
f1_score_dict = {}
accuracy_dict = {}

In [13]:
dataset_type = {'dataset': ['fs']}

Create the classifier model based on sklearn

In [122]:
# 1. Create svm model
# The polynomial and RBF are especially useful when the data-points are not linearly separable.
def SVC_model(X_train, X_test, y_train, y_test):
    accuracy_svc = []
    confusion_matrix_svc = []
    roc_auc_score_svc = []
    f1_score_svc = []
    for kernel in ('linear', 'poly', 'rbf'):
        clf = SVC(kernel = kernel)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = clf.score(X_test, y_test)
        
        confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
        roc_auc_score_value = roc_auc_score(y_test, y_pred)
        f1_score_value = f1_score(y_test, y_pred)
        
        print('The confusion matrix for kernel {} is {}\n'.format(kernel, confusion_matrix_value))
        print('The roc_aux_score for kernel {} is {}\n'.format(kernel,roc_auc_score_value))
        print('The f1_score for kernel {} is {}\n'.format(kernel, f1_score_value))
        accuracy_svc.append(np.round(accuracy, 6))
        confusion_matrix_svc.append(confusion_matrix_value)
        roc_auc_score_svc.append(np.round(roc_auc_score_value,6))
        f1_score_svc.append(np.round(f1_score_value,6))
        
    return confusion_matrix_svc, roc_auc_score_svc, f1_score_svc, accuracy_svc

In [123]:
# create the list to save the different result from different dataset
value11, value12, value13, value14  = SVC_model(X_train, X_test, y_train, y_test)
    
# write the metrics to dict
confusion_matrix_dict['SVM'] = value11
roc_auc_score_dict['SVM'] = value12
f1_score_dict['SVM'] = value13
accuracy_dict['SVM'] = value14

The confusion matrix for kernel linear is [[33  6]
 [ 0 58]]

The roc_aux_score for kernel linear is 0.9230769230769231

The f1_score for kernel linear is 0.9508196721311475

The confusion matrix for kernel poly is [[31  8]
 [ 0 58]]

The roc_aux_score for kernel poly is 0.8974358974358975

The f1_score for kernel poly is 0.9354838709677419

The confusion matrix for kernel rbf is [[32  7]
 [ 0 58]]

The roc_aux_score for kernel rbf is 0.9102564102564102

The f1_score for kernel rbf is 0.9430894308943091



In [16]:
# The realization of Decision Trees
def decision_tree(X_train, X_test, y_train, y_test):
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for decision trees model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for decision trees model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for decision trees model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [17]:
value21, value22, value23, value24 = decision_tree(X_train, X_test, y_train, y_test)

    
confusion_matrix_dict['Decision Trees'] = value21
roc_auc_score_dict['Decision Trees'] = value22
f1_score_dict['Decision Trees'] = value23
accuracy_dict['Decision Trees'] = value24

The confusion matrix for decision trees model is [[33  6]
 [ 0 58]]

The roc_aux_score for decision trees model is 0.9230769230769231

The f1_score for decision trees model is 0.9508196721311475



In [18]:
# The realization of Naive Bayers 
def naive_bayers_model(X_train, X_test, y_train, y_test):
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for Naive Bayers model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for Naive Bayers model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for Naive Bayers model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [19]:
# create the list to save the different result from different dataset
value31, value32, value33, value34 = naive_bayers_model(X_train, X_test, y_train, y_test)
    
confusion_matrix_dict['Naive Bayers'] = value31
roc_auc_score_dict['Naive Bayers'] = value32
f1_score_dict['Naive Bayers'] = value33
accuracy_dict['Naive Bayers'] = value34

The confusion matrix for Naive Bayers model is [[34  5]
 [27 31]]

The roc_aux_score for Naive Bayers model is 0.7031388152077807

The f1_score for Naive Bayers model is 0.6595744680851064



In [20]:
# The realization of LogisticRegressionCV 
def LR_cv_model(X_train, X_test, y_train, y_test):
    clf = LogisticRegressionCV(random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for LRcv model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for LRcv model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for LRcv model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [21]:
# create the list to save the different result from different dataset
value41, value42, value43, value44 = LR_cv_model(X_train, X_test, y_train, y_test)

confusion_matrix_dict['LogisticRegressionCV'] = value41
roc_auc_score_dict['LogisticRegressionCV'] = value42
f1_score_dict['LogisticRegressionCV'] = value43
accuracy_dict['LogisticRegressionCV'] = value44

The confusion matrix for LRcv model is [[32  7]
 [ 0 58]]

The roc_aux_score for LRcv model is 0.9102564102564102

The f1_score for LRcv model is 0.9430894308943091



In [22]:
# The realization of RidgeClassifierCV
def RC_cv_model(X_train, X_test, y_train, y_test):
    clf = RidgeClassifierCV().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for RidgeClassifierCV model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for RidgeClassifierCV model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for RidgeClassifierCV model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [23]:
# create the list to save the different result from different dataset
value51, value52, value53, value54 = RC_cv_model(X_train, X_test, y_train, y_test)

confusion_matrix_dict['RidgeClassifierCV'] = value51
roc_auc_score_dict['RidgeClassifierCV'] = value52
f1_score_dict['RidgeClassifierCV'] = value53
accuracy_dict['RidgeClassifierCV'] = value54

The confusion matrix for RidgeClassifierCV model is [[32  7]
 [ 0 58]]

The roc_aux_score for RidgeClassifierCV model is 0.9102564102564102

The f1_score for RidgeClassifierCV model is 0.9430894308943091



In [24]:
# The realization of SGDClassifier
def SGD_model(X_train, X_test, y_train, y_test):
    clf = SGDClassifier(random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for SGDClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for SGDClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for SGDClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [25]:
# create the list to save the different result from different dataset
value61, value62, value63, value64 = SGD_model(X_train, X_test, y_train, y_test)
    
confusion_matrix_dict['SGDClassifier'] = value61
roc_auc_score_dict['SGDClassifier'] = value62
f1_score_dict['SGDClassifier'] = value63
accuracy_dict['SGDClassifier'] = value64

The confusion matrix for SGDClassifier model is [[35  4]
 [ 2 56]]

The roc_aux_score for SGDClassifier model is 0.931476569407604

The f1_score for SGDClassifier model is 0.9491525423728815



In [26]:
# The realization of MLP model
def MLP_model(X_train, X_test, y_train, y_test):
    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=seed,
                                              learning_rate_init=.1).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for MLP model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for MLP model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for MLP model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy


In [27]:
# create the list to save the different result from different dataset
value71, value72, value73, value74 = MLP_model(X_train, X_test, y_train, y_test)
    
confusion_matrix_dict['MLP'] = value71
roc_auc_score_dict['MLP'] = value72
f1_score_dict['MLP'] = value73
accuracy_dict['MLP'] = value74

The confusion matrix for MLP model is [[32  7]
 [ 0 58]]

The roc_aux_score for MLP model is 0.9102564102564102

The f1_score for MLP model is 0.9430894308943091





Create the Dataframe for different Metrics Method

In [124]:
# build the total dict
print(dataset_type) 
print(confusion_matrix_dict)
for key, value in confusion_matrix_dict.items():
    confusion_matrix_dict[key] = np.array(value)

{'dataset': ['ts']}
{'SVM': [array([[33,  6],
       [ 0, 58]], dtype=int64), array([[31,  8],
       [ 0, 58]], dtype=int64), array([[32,  7],
       [ 0, 58]], dtype=int64)], 'Decision Trees': array([[33,  6],
       [ 0, 58]], dtype=int64), 'Naive Bayers': array([[34,  5],
       [27, 31]], dtype=int64), 'LogisticRegressionCV': array([[32,  7],
       [ 0, 58]], dtype=int64), 'RidgeClassifierCV': array([[32,  7],
       [ 0, 58]], dtype=int64), 'SGDClassifier': array([[35,  4],
       [ 2, 56]], dtype=int64), 'MLP': array([[32,  7],
       [ 0, 58]], dtype=int64)}


In [125]:
# 1. Confusion_Matrix
df_confusion_matrix = pd.DataFrame([confusion_matrix_dict], index=[0])
df_confusion_matrix['data type'] = dataset_type['dataset']
df_confusion_matrix.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ts,"[[[33, 6], [0, 58]], [[31, 8], [0, 58]], [[32,...","[[33, 6], [0, 58]]","[[34, 5], [27, 31]]","[[32, 7], [0, 58]]","[[32, 7], [0, 58]]","[[35, 4], [2, 56]]","[[32, 7], [0, 58]]"


In [126]:
df_confusion_matrix['SVM'].values

array([array([[[33,  6],
               [ 0, 58]],

              [[31,  8],
               [ 0, 58]],

              [[32,  7],
               [ 0, 58]]], dtype=int64)], dtype=object)

In [127]:
# 2. Roc_Auc_Score
df_roc_auc_score = pd.DataFrame([roc_auc_score_dict])
df_roc_auc_score['data type'] = dataset_type['dataset']
df_roc_auc_score.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ts,"[0.923077, 0.897436, 0.910256]",0.923077,0.703139,0.910256,0.910256,0.931477,0.910256


In [128]:
# 3. F1_Score
df_f1_score = pd.DataFrame([f1_score_dict])
df_f1_score['data type'] = dataset_type['dataset']
df_f1_score.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ts,"[0.95082, 0.935484, 0.943089]",0.95082,0.659574,0.943089,0.943089,0.949153,0.943089


In [129]:
# 4. Accuracy
df_accuracy = pd.DataFrame([accuracy_dict])
df_accuracy['data type'] = dataset_type['dataset']
df_accuracy.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ts,"[0.938144, 0.917526, 0.927835]",0.938144,0.670103,0.927835,0.927835,0.938144,0.927835


In [130]:
# combine all the matrix
df = pd.DataFrame(columns=df_accuracy.columns.values)
# df = df.append(df_confusion_matrix, ignore_index=True)
df = df.append(df_roc_auc_score, ignore_index=True)
df = df.append(df_f1_score, ignore_index=True)
df = df.append(df_accuracy, ignore_index=True)


In [131]:
df

Unnamed: 0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP,data type
0,"[0.923077, 0.897436, 0.910256]",0.923077,0.703139,0.910256,0.910256,0.931477,0.910256,ts
1,"[0.95082, 0.935484, 0.943089]",0.95082,0.659574,0.943089,0.943089,0.949153,0.943089,ts
2,"[0.938144, 0.917526, 0.927835]",0.938144,0.670103,0.927835,0.927835,0.938144,0.927835,ts


In [132]:
df.index= list([ 'roc_auc_score', 'f1_score', 'accuracy'])

In [133]:
with pd.option_context('display.max_rows', None):
    print(df['SVM'])

roc_auc_score    [0.923077, 0.897436, 0.910256]
f1_score          [0.95082, 0.935484, 0.943089]
accuracy         [0.938144, 0.917526, 0.927835]
Name: SVM, dtype: object


In [134]:
import dataframe_image as dfi

In [135]:
dfi.export(df.drop('data type', axis=1), 'matrix.png')

IndexError: tuple index out of range