In [1]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV, SGDClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# import the evaluation libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
# import the processing libraries
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [3]:
import pandas as pd

In [4]:
# define the random seed
global seed
seed = 43

## load the dataset, there are six types of dataset in total 

In [5]:
# get the data from Feature-Selector
selected_features = pd.read_pickle('../dataset/training_data/features_selected.pkl')

In [6]:
# extract the label for all dataset
label = selected_features['label']
# label.values

In [7]:
selected_features_all = selected_features.iloc[:,1:-1]
selected_features_all.values.shape

(333, 56)

In [8]:
# get the data from three-Gram
thr_gram_features = pd.read_pickle('../dataset/training_data/thr_gram_features.pkl')
# bypass the label column
thr_gram_features = thr_gram_features.iloc[:,:-1]
print(thr_gram_features.values.shape)
# get the data from four-Gram
four_gram_features = pd.read_pickle('../dataset/training_data/four_gram_features.pkl')
print(four_gram_features.values.shape)


(333, 2841)
(333, 1772)


In [9]:
# get the data from all TF-IDF features
tf_idf_all = pd.read_pickle('../dataset/training_data/features_ran_all.pkl')
# bypass the label column
tf_idf_all = tf_idf_all.iloc[:,:-1]
print(tf_idf_all.values.shape)
# get the data from partial TF-IDF features
tf_idf_part = pd.read_pickle('../dataset/training_data/features_ran_part.pkl')
# bypass the label column
tf_idf_part = tf_idf_part.iloc[:,:-1]
print(tf_idf_part.values.shape)

(333, 1049)
(333, 688)


In [11]:
# get the united features
united_features = pd.read_pickle('../dataset/training_data/united_features.pkl')
united_features.values.shape

(333, 2297)

In [12]:
# build the dataset dict
dataset_dict = {
    'selected_features': selected_features_all,
    'thr_gram_features': thr_gram_features,
    'four_gram_features': four_gram_features,
    'tf_idf_all': tf_idf_all,
    'tf_idf_part': tf_idf_part,
    'united_features': united_features
}

## convert the features into numpy data from model

In [55]:
# module to create training and testing x and y from data
def data_split(dataframe, label):
    X = dataframe.values
    y = label.values
    # shuffle the data
    X, y = shuffle(X, y, random_state = seed)
    # split the data into training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    return X_train, X_test, y_train, y_test
    

## build the dict to save the result

In [1]:
confusion_matrix_dict = {}
roc_auc_score_dict = {}
f1_score_dict = {}
accuracy_dict = {}

In [2]:
dataset_type = {'dataset': ['feature_selection', 'thr_gram_features', 'four_gram_features', 'tf_idf_all', 'tf_idf_part', 'united_features']}

## Create the classifier model based on sklearn

### 1. Create svm model

In [58]:
# The realization of SVM
# The polynomial and RBF are especially useful when the data-points are not linearly separable.
def SVC_model(X_train, X_test, y_train, y_test):
    accuracy_svc = []
    confusion_matrix_svc = []
    roc_auc_score_svc = []
    f1_score_svc = []
    for kernel in ('linear', 'poly', 'rbf'):
        clf = SVC(kernel = kernel)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = clf.score(X_test, y_test)
        
        confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
        roc_auc_score_value = roc_auc_score(y_test, y_pred)
        f1_score_value = f1_score(y_test, y_pred)
        
        print('The confusion matrix for kernel {} is {}\n'.format(kernel, confusion_matrix_value))
        print('The roc_aux_score for kernel {} is {}\n'.format(kernel,roc_auc_score_value))
        print('The f1_score for kernel {} is {}\n'.format(kernel, f1_score_value))
        accuracy_svc.append(accuracy)
        confusion_matrix_svc.append(confusion_matrix_value)
        roc_auc_score_svc.append(roc_auc_score_value)
        f1_score_svc.append(f1_score_value)
        
    return confusion_matrix_svc, roc_auc_score_svc, f1_score_svc, accuracy_svc

In [59]:
# create the list to save the different result from different dataset
value110, value120, value130, value140 = [], [], [], []
for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value11, value12, value13, value14  = SVC_model(X_train, X_test, y_train, y_test)
    
    value110.append(value11)
    value120.append(value12)
    value130.append(value13)
    value140.append(value14)
    print('The performace of features {} on the three kernels of scv is {}\n'.format(key, value14))

# write the metrics to dict
confusion_matrix_dict['SVM'] = value110
roc_auc_score_dict['SVM'] = value120
f1_score_dict['SVM'] = value130
accuracy_dict['SVM'] = value140

The confusion matrix for kernel linear is [[42  0]
 [ 0 58]]

The roc_aux_score for kernel linear is 1.0

The f1_score for kernel linear is 1.0

The confusion matrix for kernel poly is [[41  1]
 [ 0 58]]

The roc_aux_score for kernel poly is 0.988095238095238

The f1_score for kernel poly is 0.9914529914529915

The confusion matrix for kernel rbf is [[41  1]
 [ 0 58]]

The roc_aux_score for kernel rbf is 0.988095238095238

The f1_score for kernel rbf is 0.9914529914529915

The performace of features selected_features on the three kernels of scv is [1.0, 0.99, 0.99]

The confusion matrix for kernel linear is [[42  0]
 [ 0 58]]

The roc_aux_score for kernel linear is 1.0

The f1_score for kernel linear is 1.0

The confusion matrix for kernel poly is [[37  5]
 [ 0 58]]

The roc_aux_score for kernel poly is 0.9404761904761905

The f1_score for kernel poly is 0.9586776859504132

The confusion matrix for kernel rbf is [[42  0]
 [ 0 58]]

The roc_aux_score for kernel rbf is 1.0

The f1_score 

### 2. Decision Trees model

In [60]:
# The realization of Decision Trees
def decision_tree(X_train, X_test, y_train, y_test):
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for decision trees model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for decision trees model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for decision trees model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [61]:
# create the list to save the different result from different dataset
value210, value220, value230, value240 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value21, value22, value23, value24 = decision_tree(X_train, X_test, y_train, y_test)
    
    value210.append(value21)
    value220.append(value22)
    value230.append(value23)
    value240.append(value24)
    
    print('The performace of features {} on the decision trees model is {}\n'.format(key, value24))

    
confusion_matrix_dict['Decision Trees'] = value210
roc_auc_score_dict['Decision Trees'] = value220
f1_score_dict['Decision Trees'] = value230
accuracy_dict['Decision Trees'] = value240

The confusion matrix for decision trees model is [[42  0]
 [ 0 58]]

The roc_aux_score for decision trees model is 1.0

The f1_score for decision trees model is 1.0

The performace of features selected_features on the decision trees model is 1.0

The confusion matrix for decision trees model is [[42  0]
 [ 0 58]]

The roc_aux_score for decision trees model is 1.0

The f1_score for decision trees model is 1.0

The performace of features thr_gram_features on the decision trees model is 1.0

The confusion matrix for decision trees model is [[42  0]
 [ 0 58]]

The roc_aux_score for decision trees model is 1.0

The f1_score for decision trees model is 1.0

The performace of features four_gram_features on the decision trees model is 1.0

The confusion matrix for decision trees model is [[42  0]
 [ 0 58]]

The roc_aux_score for decision trees model is 1.0

The f1_score for decision trees model is 1.0

The performace of features tf_idf_all on the decision trees model is 1.0

The confusion matr

### 3. Naive Bayers model

In [62]:
# The realization of Naive Bayers 
def naive_bayers_model(X_train, X_test, y_train, y_test):
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for Naive Bayers model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for Naive Bayers model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for Naive Bayers model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [63]:
# create the list to save the different result from different dataset
value310, value320, value330, value340 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value31, value32, value33, value34 = naive_bayers_model(X_train, X_test, y_train, y_test)
    
    value310.append(value31)
    value320.append(value32)
    value330.append(value33)
    value340.append(value34)
    
    print('The performace of features {} on the naive bayers model is {}\n'.format(key, value34))

confusion_matrix_dict['Naive Bayers'] = value310
roc_auc_score_dict['Naive Bayers'] = value320
f1_score_dict['Naive Bayers'] = value330
accuracy_dict['Naive Bayers'] = value340

The confusion matrix for Naive Bayers model is [[42  0]
 [ 0 58]]

The roc_aux_score for Naive Bayers model is 1.0

The f1_score for Naive Bayers model is 1.0

The performace of features selected_features on the naive bayers model is 1.0

The confusion matrix for Naive Bayers model is [[42  0]
 [ 0 58]]

The roc_aux_score for Naive Bayers model is 1.0

The f1_score for Naive Bayers model is 1.0

The performace of features thr_gram_features on the naive bayers model is 1.0

The confusion matrix for Naive Bayers model is [[42  0]
 [ 0 58]]

The roc_aux_score for Naive Bayers model is 1.0

The f1_score for Naive Bayers model is 1.0

The performace of features four_gram_features on the naive bayers model is 1.0

The confusion matrix for Naive Bayers model is [[42  0]
 [ 0 58]]

The roc_aux_score for Naive Bayers model is 1.0

The f1_score for Naive Bayers model is 1.0

The performace of features tf_idf_all on the naive bayers model is 1.0

The confusion matrix for Naive Bayers model is [[4

### 4. Linear Models: LogisticRegressionCV,  RidgeClassifierCV,  SGDClassifer

In [64]:
# The realization of LogisticRegressionCV 
def LR_cv_model(X_train, X_test, y_train, y_test):
    clf = LogisticRegressionCV(random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for LRcv model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for LRcv model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for LRcv model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [65]:
# create the list to save the different result from different dataset
value410, value420, value430, value440 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value41, value42, value43, value44 = LR_cv_model(X_train, X_test, y_train, y_test)
    
    value410.append(value41)
    value420.append(value42)
    value430.append(value43)
    value440.append(value44)
    
    print('The performace of features {} on the LR_cv_model is {}\n'.format(key, value44))

confusion_matrix_dict['LogisticRegressionCV'] = value410
roc_auc_score_dict['LogisticRegressionCV'] = value420
f1_score_dict['LogisticRegressionCV'] = value430
accuracy_dict['LogisticRegressionCV'] = value440

The confusion matrix for LRcv model is [[41  1]
 [ 0 58]]

The roc_aux_score for LRcv model is 0.988095238095238

The f1_score for LRcv model is 0.9914529914529915

The performace of features selected_features on the LR_cv_model is 0.99

The confusion matrix for LRcv model is [[40  2]
 [ 0 58]]

The roc_aux_score for LRcv model is 0.9761904761904762

The f1_score for LRcv model is 0.983050847457627

The performace of features thr_gram_features on the LR_cv_model is 0.98

The confusion matrix for LRcv model is [[40  2]
 [ 0 58]]

The roc_aux_score for LRcv model is 0.9761904761904762

The f1_score for LRcv model is 0.983050847457627

The performace of features four_gram_features on the LR_cv_model is 0.98

The confusion matrix for LRcv model is [[40  2]
 [ 0 58]]

The roc_aux_score for LRcv model is 0.9761904761904762

The f1_score for LRcv model is 0.983050847457627

The performace of features tf_idf_all on the LR_cv_model is 0.98

The confusion matrix for LRcv model is [[41  1]
 [ 0 5

In [66]:
# The realization of RidgeClassifierCV
def RC_cv_model(X_train, X_test, y_train, y_test):
    clf = RidgeClassifierCV().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for RidgeClassifierCV model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for RidgeClassifierCV model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for RidgeClassifierCV model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [67]:
# create the list to save the different result from different dataset
value510, value520, value530, value540 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value51, value52, value53, value54 = RC_cv_model(X_train, X_test, y_train, y_test)

    value510.append(value51)
    value520.append(value52)
    value530.append(value53)
    value540.append(value54)
    
    print('The performace of features {} on the RC_cv_model is {}\n'.format(key, value54))

confusion_matrix_dict['RidgeClassifierCV'] = value510
roc_auc_score_dict['RidgeClassifierCV'] = value520
f1_score_dict['RidgeClassifierCV'] = value530
accuracy_dict['RidgeClassifierCV'] = value540

The confusion matrix for RidgeClassifierCV model is [[42  0]
 [ 0 58]]

The roc_aux_score for RidgeClassifierCV model is 1.0

The f1_score for RidgeClassifierCV model is 1.0

The performace of features selected_features on the RC_cv_model is 1.0

The confusion matrix for RidgeClassifierCV model is [[42  0]
 [ 0 58]]

The roc_aux_score for RidgeClassifierCV model is 1.0

The f1_score for RidgeClassifierCV model is 1.0

The performace of features thr_gram_features on the RC_cv_model is 1.0

The confusion matrix for RidgeClassifierCV model is [[42  0]
 [ 0 58]]

The roc_aux_score for RidgeClassifierCV model is 1.0

The f1_score for RidgeClassifierCV model is 1.0

The performace of features four_gram_features on the RC_cv_model is 1.0

The confusion matrix for RidgeClassifierCV model is [[42  0]
 [ 0 58]]

The roc_aux_score for RidgeClassifierCV model is 1.0

The f1_score for RidgeClassifierCV model is 1.0

The performace of features tf_idf_all on the RC_cv_model is 1.0

The confusion matr

In [68]:
# The realization of SGDClassifier
def SGD_model(X_train, X_test, y_train, y_test):
    clf = SGDClassifier(random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for SGDClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for SGDClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for SGDClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [69]:
# create the list to save the different result from different dataset
value610, value620, value630, value640 = [], [], [], []


for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value61, value62, value63, value64 = SGD_model(X_train, X_test, y_train, y_test)
    
    value610.append(value61)
    value620.append(value62)
    value630.append(value63)
    value640.append(value64)
    
    print('The performace of features {} on the SGD_model is {}\n'.format(key, value64))

confusion_matrix_dict['SGDClassifier'] = value610
roc_auc_score_dict['SGDClassifier'] = value620
f1_score_dict['SGDClassifier'] = value630
accuracy_dict['SGDClassifier'] = value640

The confusion matrix for SGDClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for SGDClassifier model is 1.0

The f1_score for SGDClassifier model is 1.0

The performace of features selected_features on the SGD_model is 1.0

The confusion matrix for SGDClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for SGDClassifier model is 1.0

The f1_score for SGDClassifier model is 1.0

The performace of features thr_gram_features on the SGD_model is 1.0

The confusion matrix for SGDClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for SGDClassifier model is 1.0

The f1_score for SGDClassifier model is 1.0

The performace of features four_gram_features on the SGD_model is 1.0

The confusion matrix for SGDClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for SGDClassifier model is 1.0

The f1_score for SGDClassifier model is 1.0

The performace of features tf_idf_all on the SGD_model is 1.0

The confusion matrix for SGDClassifier model is [[42  0]
 [ 0 58]]

The ro

### 5. MLP model

In [70]:
# The realization of MLP model
def MLP_model(X_train, X_test, y_train, y_test):
    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=seed,
                                              learning_rate_init=.1).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for MLP model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for MLP model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for MLP model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy


In [71]:
# create the list to save the different result from different dataset
value710, value720, value730, value740 = [], [], [], []


for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    value71, value72, value73, value74 = MLP_model(X_train, X_test, y_train, y_test)
    
    value710.append(value71)
    value720.append(value72)
    value730.append(value73)
    value740.append(value74)
    
    print('The performace of features {} on the MLP_model is {}\n'.format(key, value74))

confusion_matrix_dict['MLP'] = value710
roc_auc_score_dict['MLP'] = value720
f1_score_dict['MLP'] = value730
accuracy_dict['MLP'] = value740

The confusion matrix for MLP model is [[42  0]
 [ 0 58]]

The roc_aux_score for MLP model is 1.0

The f1_score for MLP model is 1.0

The performace of features selected_features on the MLP_model is 1.0

The confusion matrix for MLP model is [[42  0]
 [ 0 58]]

The roc_aux_score for MLP model is 1.0

The f1_score for MLP model is 1.0

The performace of features thr_gram_features on the MLP_model is 1.0

The confusion matrix for MLP model is [[42  0]
 [ 0 58]]

The roc_aux_score for MLP model is 1.0

The f1_score for MLP model is 1.0

The performace of features four_gram_features on the MLP_model is 1.0

The confusion matrix for MLP model is [[42  0]
 [ 0 58]]

The roc_aux_score for MLP model is 1.0

The f1_score for MLP model is 1.0

The performace of features tf_idf_all on the MLP_model is 1.0

The confusion matrix for MLP model is [[42  0]
 [ 0 58]]

The roc_aux_score for MLP model is 1.0

The f1_score for MLP model is 1.0

The performace of features tf_idf_part on the MLP_model is 1.

## create the data type dataframe

In [78]:
df_datatype = pd.DataFrame(dataset_type)
df_datatype

Unnamed: 0,dataset
0,feature_selection
1,thr_gram_features
2,four_gram_features
3,tf_idf_all
4,tf_idf_part
5,united_features


## Create the Dataframe for different Metrics Method

### 1. Confusion_Matrix

In [80]:
df_confusion_matrix = pd.DataFrame(confusion_matrix_dict)
df_confusion_matrix['data type'] = df_datatype['dataset']
df_confusion_matrix.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
feature_selection,"[[[42, 0], [0, 58]], [[41, 1], [0, 58]], [[41,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[41, 1], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
thr_gram_features,"[[[42, 0], [0, 58]], [[37, 5], [0, 58]], [[42,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[40, 2], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
four_gram_features,"[[[42, 0], [0, 58]], [[37, 5], [0, 58]], [[42,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[40, 2], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
tf_idf_all,"[[[42, 0], [0, 58]], [[36, 6], [0, 58]], [[42,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[40, 2], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
tf_idf_part,"[[[42, 0], [0, 58]], [[37, 5], [0, 58]], [[42,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[41, 1], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
united_features,"[[[42, 0], [0, 58]], [[37, 5], [0, 58]], [[42,...","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[40, 2], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"


### 2. Roc_Auc_Score

In [81]:
df_roc_auc_score = pd.DataFrame(roc_auc_score_dict)
df_roc_auc_score['data type'] = df_datatype['dataset']
df_roc_auc_score.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
feature_selection,"[1.0, 0.988095238095238, 0.988095238095238]",1.0,1.0,0.988095,1.0,1.0,1.0
thr_gram_features,"[1.0, 0.9404761904761905, 1.0]",1.0,1.0,0.97619,1.0,1.0,1.0
four_gram_features,"[1.0, 0.9404761904761905, 1.0]",1.0,1.0,0.97619,1.0,1.0,1.0
tf_idf_all,"[1.0, 0.9285714285714286, 1.0]",1.0,1.0,0.97619,1.0,1.0,1.0
tf_idf_part,"[1.0, 0.9404761904761905, 1.0]",1.0,1.0,0.988095,1.0,1.0,1.0
united_features,"[1.0, 0.9404761904761905, 1.0]",1.0,1.0,0.97619,1.0,1.0,1.0


### 3.F1_Score

In [82]:
df_f1_score = pd.DataFrame(f1_score_dict)
df_f1_score['data type'] = df_datatype['dataset']
df_f1_score.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
feature_selection,"[1.0, 0.9914529914529915, 0.9914529914529915]",1.0,1.0,0.991453,1.0,1.0,1.0
thr_gram_features,"[1.0, 0.9586776859504132, 1.0]",1.0,1.0,0.983051,1.0,1.0,1.0
four_gram_features,"[1.0, 0.9586776859504132, 1.0]",1.0,1.0,0.983051,1.0,1.0,1.0
tf_idf_all,"[1.0, 0.9508196721311475, 1.0]",1.0,1.0,0.983051,1.0,1.0,1.0
tf_idf_part,"[1.0, 0.9586776859504132, 1.0]",1.0,1.0,0.991453,1.0,1.0,1.0
united_features,"[1.0, 0.9586776859504132, 1.0]",1.0,1.0,0.983051,1.0,1.0,1.0


### 4.Accuracy

In [83]:
df_accuracy = pd.DataFrame(accuracy_dict)
df_accuracy['data type'] = df_datatype['dataset']
df_accuracy.set_index('data type')

Unnamed: 0_level_0,SVM,Decision Trees,Naive Bayers,LogisticRegressionCV,RidgeClassifierCV,SGDClassifier,MLP
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
feature_selection,"[1.0, 0.99, 0.99]",1.0,1.0,0.99,1.0,1.0,1.0
thr_gram_features,"[1.0, 0.95, 1.0]",1.0,1.0,0.98,1.0,1.0,1.0
four_gram_features,"[1.0, 0.95, 1.0]",1.0,1.0,0.98,1.0,1.0,1.0
tf_idf_all,"[1.0, 0.94, 1.0]",1.0,1.0,0.98,1.0,1.0,1.0
tf_idf_part,"[1.0, 0.95, 1.0]",1.0,1.0,0.99,1.0,1.0,1.0
united_features,"[1.0, 0.95, 1.0]",1.0,1.0,0.98,1.0,1.0,1.0
