In [54]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [55]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pandas as pd

In [56]:
# define the random seed
global seed
seed = 43

## load the dataset, there are six types of dataset in total 

In [57]:
# data from feature selector
selected_features = pd.read_pickle('../dataset/training_data/features_selected.pkl')
selected_features_all = selected_features.iloc[:, 1:-1]

# extract the label data
label = selected_features['label']

# data from three-gram
thr_gram_features = pd.read_pickle('../dataset/training_data/thr_gram_features.pkl')
thr_gram_features = thr_gram_features.iloc[:,:-1]

# data from four-gram
four_gram_features = pd.read_pickle('../dataset/training_data/four_gram_features.pkl')

# data from all TF-IDF features
tf_idf_all = pd.read_pickle('../dataset/training_data/features_ran_all.pkl')
tf_idf_all = tf_idf_all.iloc[:,:-1]

# data from partial TF_IDF features
tf_idf_part = pd.read_pickle('../dataset/training_data/features_ran_part.pkl')
tf_idf_part = tf_idf_part.iloc[:,:-1]

# data from united features
united_features = pd.read_pickle('../dataset/training_data/united_features.pkl')


In [58]:
# build the dataset dict
dataset_dict = {
    'selected_features': selected_features_all,
    'thr_gram_features': thr_gram_features,
    'four_gram_features': four_gram_features,
    'tf_idf_all': tf_idf_all,
    'tf_idf_part': tf_idf_part,
    'united_features': united_features,
}

## convert the features into numpy data from model

In [59]:
# module to create training and testing x and y from data
def data_split(dataframe, label):
    X = dataframe.values
    y = label.values
    # shuffle the data
    X, y = shuffle(X, y, random_state = seed)
    # split the data into training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    return X_train, X_test, y_train, y_test
    

## build the dict to save the result

In [60]:
confusion_matrix_dict = {}
roc_auc_score_dict = {}
f1_score_dict = {}
accuracy_dict = {}

In [61]:
dataset_type = {'dataset': ['feature_selection', 'thr_gram_features', 'four_gram_features', 'tf_idf_all', 'tf_idf_part', 'united_features']}

## Build Ensemble Models

### 1. BaggingClassifier

In [62]:
# the realization of BaggingClassifier
def Bag_classifier(X_train, X_test, y_train, y_test):
    
    accuracy_svc = []
    confusion_matrix_svc = []
    roc_auc_score_svc = []
    f1_score_svc = []
    
    clf = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for BaggingClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for BaggingClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for BaggingClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [63]:
# create the list to save the different result from different dataset
value110, value120, value130, value140 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    
    value11, value12, value13, value14  = Bag_classifier(X_train, X_test, y_train, y_test)

    value110.append(value11)
    value120.append(value12)
    value130.append(value13)
    value140.append(value14)
    
    print('The performace of features {} on the BaggingClassifier model is {}\n'.format(key, value14))

confusion_matrix_dict['BaggingClassifier'] = value110
roc_auc_score_dict['BaggingClassifier'] = value120
f1_score_dict['BaggingClassifier'] = value130
accuracy_dict['BaggingClassifier'] = value140

The confusion matrix for BaggingClassifier model is [[41  1]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 0.988095238095238

The f1_score for BaggingClassifier model is 0.9914529914529915

The performace of features selected_features on the BaggingClassifier model is 0.99

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of features thr_gram_features on the BaggingClassifier model is 1.0

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of features four_gram_features on the BaggingClassifier model is 1.0

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of

### 2.RandomForestClassifier

In [64]:
# the realization of RandomForestClassifier
def RF_model(X_trian, X_test, y_train, y_test):
    clf = RandomForestClassifier(max_depth=2, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    
    print('The confusion matrix for RandomForestClassifier model is {}\n'.format(confusion_matrix_value))
    print('The roc_aux_score for RandomForestClassifier model is {}\n'.format(roc_auc_score_value))
    print('The f1_score for RandomForestClassifier model is {}\n'.format(f1_score_value))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [65]:
# create the list to save the different result from different dataset
value210, value220, value230, value240 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
        
    value21, value22, value23, value24 = RF_model(X_train, X_test, y_train, y_test)
    
    value210.append(value21)
    value220.append(value22)
    value230.append(value23)
    value240.append(value24) 
    
    print('The performace of features {} on the RandomForestClassifier model is {}\n'.format(key, value24))
    
confusion_matrix_dict['RandomForestClassifier'] = value210
roc_auc_score_dict['RandomForestClassifier'] = value220
f1_score_dict['RandomForestClassifier'] = value230
accuracy_dict['RandomForestClassifier'] = value240


The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features selected_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features thr_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features four_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomFor

### 3.AdaBoostClassifier

In [66]:
# the realization of AdaBoostClassifier
def AdaBoost_model(X_trian, X_test, y_train, y_test):
    clf = AdaBoostClassifier(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
        
    print('The confusion matrix for AdaBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for AdaBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for AdaBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [67]:
# create the list to save the different result from different dataset
value310, value320, value330, value340 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    
    value31, value32, value33, value34 = RF_model(X_train, X_test, y_train, y_test)
    
    value310.append(value31)
    value320.append(value32)
    value330.append(value33)
    value340.append(value34)
    
    print('The performace of features {} on the RandomForestClassifier model is {}\n'.format(key, value34))
    
confusion_matrix_dict['AdaBoostClassifier'] = value310
roc_auc_score_dict['AdaBoostClassifier'] = value320
f1_score_dict['AdaBoostClassifier'] = value330
accuracy_dict['AdaBoostClassifier'] = value340

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features selected_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features thr_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features four_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomFor

### 4.GradientBoostingClassifier

In [68]:
# the realization of GradientBoostingClassifier
def GradientBoost_model(X_trian, X_test, y_train, y_test):
    clf = GradientBoostingClassifier(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = clf.score(X_test, y_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred, labels=[0,1])
    roc_auc_score_value = roc_auc_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    
    print('The confusion matrix for GradientBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for GradientBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for GradientBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))
    
    return confusion_matrix_value, roc_auc_score_value, f1_score_value, accuracy

In [69]:
# create the list to save the different result from different dataset
value410, value420, value430, value440 = [], [], [], []

for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)

    value41, value42, value43, value44 = GradientBoost_model(X_train, X_test, y_train, y_test)
    
    value410.append(value41)
    value420.append(value42)
    value430.append(value43)
    value440.append(value44)
    
    print('The performace of features {} on the GradientBoost_model model is {}\n'.format(key, value44))

confusion_matrix_dict['GradientBoostingClassifier'] = value410
roc_auc_score_dict['GradientBoostingClassifier'] = value420
f1_score_dict['GradientBoostingClassifier'] = value430
accuracy_dict['GradientBoostingClassifier'] = value440   


The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features selected_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features thr_gram_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features four_gram_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of

## Build the metrics dataframe for ensemble classifier

In [70]:
df_datatype = pd.DataFrame(dataset_type)
df_datatype

Unnamed: 0,dataset
0,feature_selection
1,thr_gram_features
2,four_gram_features
3,tf_idf_all
4,tf_idf_part
5,united_features


### 1. Confusion_Matrix

In [71]:
df_confusion_matrix = pd.DataFrame(confusion_matrix_dict)
df_confusion_matrix['data type'] = df_datatype['dataset']
df_confusion_matrix.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
feature_selection,"[[41, 1], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
thr_gram_features,"[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
four_gram_features,"[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
tf_idf_all,"[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
tf_idf_part,"[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"
united_features,"[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]","[[42, 0], [0, 58]]"


### 2. Roc_Auc_Score

In [72]:
df_roc_auc_score = pd.DataFrame(roc_auc_score_dict)
df_roc_auc_score['data type'] = df_datatype['dataset']
df_roc_auc_score.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
feature_selection,0.988095,1.0,1.0,1.0
thr_gram_features,1.0,1.0,1.0,1.0
four_gram_features,1.0,1.0,1.0,1.0
tf_idf_all,1.0,1.0,1.0,1.0
tf_idf_part,1.0,1.0,1.0,1.0
united_features,1.0,1.0,1.0,1.0


### 3.F1_Score

In [73]:
df_f1_score = pd.DataFrame(f1_score_dict)
df_f1_score['data type'] = df_datatype['dataset']
df_f1_score.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
feature_selection,0.991453,1.0,1.0,1.0
thr_gram_features,1.0,1.0,1.0,1.0
four_gram_features,1.0,1.0,1.0,1.0
tf_idf_all,1.0,1.0,1.0,1.0
tf_idf_part,1.0,1.0,1.0,1.0
united_features,1.0,1.0,1.0,1.0


### 4.Accuracy

In [74]:
df_accuracy = pd.DataFrame(accuracy_dict)
df_accuracy['data type'] = df_datatype['dataset']
df_accuracy.set_index('data type')

Unnamed: 0_level_0,BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
data type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
feature_selection,0.99,1.0,1.0,1.0
thr_gram_features,1.0,1.0,1.0,1.0
four_gram_features,1.0,1.0,1.0,1.0
tf_idf_all,1.0,1.0,1.0,1.0
tf_idf_part,1.0,1.0,1.0,1.0
united_features,1.0,1.0,1.0,1.0
