In [41]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [42]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pandas as pd

In [43]:
# Single random seed reused by the shuffle, the train/test split and the estimators.
# A module-level name is already global, so no `global` statement is required.
seed = 43

## Load the datasets (six feature sets in total)

In [44]:
# NOTE(review): read_pickle can execute arbitrary code when given an untrusted file;
# these pickles are presumably produced locally by earlier pipeline steps - confirm.

# Feature-selector output; this frame also provides the 'label' column.
selected_features = pd.read_pickle('../dataset/training_data/features_selected.pkl')

# Target vector shared by every feature set below.
label = selected_features['label']

# Keep every column except the first and the last one.
selected_features_all = selected_features.iloc[:, 1:-1]

# Three-gram features, last column dropped.
thr_gram_features = pd.read_pickle('../dataset/training_data/thr_gram_features.pkl').iloc[:, :-1]

# Four-gram features, used exactly as stored.
# NOTE(review): unlike the sets above, no trailing column is dropped here -
# confirm this frame does not contain the label, otherwise the models see the target.
four_gram_features = pd.read_pickle('../dataset/training_data/four_gram_features.pkl')

# All TF-IDF features, last column dropped.
tf_idf_all = pd.read_pickle('../dataset/training_data/features_ran_all.pkl').iloc[:, :-1]

# Partial TF-IDF features, last column dropped.
tf_idf_part = pd.read_pickle('../dataset/training_data/features_ran_part.pkl').iloc[:, :-1]

# United features, used exactly as stored.
# NOTE(review): no trailing column is dropped here either - confirm there is no label column.
united_features = pd.read_pickle('../dataset/training_data/united_features.pkl')


In [45]:
# Map each feature-set name to its DataFrame; the insertion order below is the
# order in which the evaluation loops visit the feature sets.
dataset_dict = dict([
    ('selected_features', selected_features_all),
    ('thr_gram_features', thr_gram_features),
    ('four_gram_features', four_gram_features),
    ('tf_idf_all', tf_idf_all),
    ('tf_idf_part', tf_idf_part),
    ('united_features', united_features),
])

## Convert the features into NumPy arrays for the models

In [46]:
# helper that turns a feature frame and its labels into train/test arrays
def data_split(dataframe, label):
    """Shuffle the samples and split them 70/30 into train and test arrays.

    Parameters
    ----------
    dataframe : object exposing ``.values`` (the feature table)
    label : object exposing ``.values`` (the targets, one per row of ``dataframe``)

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Both the shuffle and the split are driven by the module-level ``seed``.
    """
    # Shuffle features and targets together so the rows stay aligned.
    features, targets = shuffle(dataframe.values, label.values, random_state=seed)

    # Hold out 30% of the shuffled samples for testing.
    train_features, test_features, train_targets, test_targets = train_test_split(
        features,
        targets,
        test_size=0.3,
        random_state=seed,
    )
    return train_features, test_features, train_targets, test_targets
    

## Build Ensemble Models

### 1. BaggingClassifier

In [47]:
# the realization of BaggingClassifier
def Bag_classifier(X_train, X_test, y_train, y_test):
    """Fit a bagging ensemble of SVCs and report its test-set metrics.

    Trains a ``BaggingClassifier`` of 10 ``SVC`` estimators on the training
    split, prints the confusion matrix (for labels 0 and 1), the ROC AUC and
    the F1 score of the hard predictions on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    The mean accuracy on the test split, as returned by ``clf.score``.
    """
    # The base estimator is passed positionally on purpose: its keyword was renamed
    # from ``base_estimator`` to ``estimator`` in scikit-learn 1.2 and the old name
    # was removed in 1.4, while the first positional slot is the same in both.
    clf = BaggingClassifier(SVC(), n_estimators=10, random_state=seed)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)

    # NOTE(review): the ROC AUC below is computed from hard class predictions,
    # not from scores or probabilities.
    print('The confusion matrix for BaggingClassifier model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for BaggingClassifier model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for BaggingClassifier model is {}\n'.format(f1_score(y_test, y_pred)))

    return accuracy

In [48]:
# Evaluate the BaggingClassifier on every feature set, one after the other.
for key in dataset_dict:
    value = dataset_dict[key]
    X_train, X_test, y_train, y_test = data_split(value, label)
    accuracy = Bag_classifier(X_train, X_test, y_train, y_test)
    print(
        'The performace of features {} on the BaggingClassifier model is {}\n'.format(
            key, accuracy
        )
    )

The confusion matrix for BaggingClassifier model is [[41  1]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 0.988095238095238

The f1_score for BaggingClassifier model is 0.9914529914529915

The performace of features selected_features on the BaggingClassifier model is 0.99

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of features thr_gram_features on the BaggingClassifier model is 1.0

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of features four_gram_features on the BaggingClassifier model is 1.0

The confusion matrix for BaggingClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for BaggingClassifier model is 1.0

The f1_score for BaggingClassifier model is 1.0

The performace of

### 2.RandomForestClassifier

In [49]:
# the realization of RandomForestClassifier
def RF_model(X_trian, X_test, y_train, y_test):
    """Fit a shallow random forest and report its test-set metrics.

    Trains a ``RandomForestClassifier`` (``max_depth=2``) on the training
    split, prints the confusion matrix (for labels 0 and 1), the ROC AUC and
    the F1 score of the hard predictions on the test split.

    Parameters
    ----------
    X_trian : training features (the misspelled name is kept so existing
        keyword callers keep working)
    y_train : training labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    The mean accuracy on the test split, as returned by ``clf.score``.
    """
    # Fit on the argument that was passed in. The previous body referenced
    # ``X_train``, which is not a parameter of this function, so it silently
    # trained on whatever notebook-level ``X_train`` happened to exist.
    clf = RandomForestClassifier(max_depth=2, random_state=seed).fit(X_trian, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)

    print('The confusion matrix for RandomForestClassifier model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for RandomForestClassifier model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for RandomForestClassifier model is {}\n'.format(f1_score(y_test, y_pred)))

    return accuracy

In [50]:
# Evaluate the RandomForestClassifier on every feature set, one after the other.
for key in dataset_dict:
    value = dataset_dict[key]
    X_train, X_test, y_train, y_test = data_split(value, label)
    accuracy = RF_model(X_train, X_test, y_train, y_test)
    print(
        'The performace of features {} on the RandomForestClassifier model is {}\n'.format(
            key, accuracy
        )
    )

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features selected_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features thr_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features four_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomFor

### 3.AdaBoostClassifier

In [51]:
# the realization of AdaBoostClassifier
def AdaBoost_model(X_trian, X_test, y_train, y_test):
    clf = AdaBoost_model(n_estimators=100, random_state=seed).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    
    print('The confusion matrix for AdaBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for AdaBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for AdaBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))
    
    return accuracy

In [52]:
for key, value in dataset_dict.items():
    X_train, X_test, y_train, y_test = data_split(value, label)
    accuracy = RF_model(X_train, X_test, y_train, y_test)
    print('The performace of features {} on the RandomForestClassifier model is {}\n'.format(key, accuracy))

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features selected_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features thr_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomForestClassifier model is 1.0

The performace of features four_gram_features on the RandomForestClassifier model is 1.0

The confusion matrix for RandomForestClassifier model is [[42  0]
 [ 0 58]]

The roc_aux_score for RandomForestClassifier model is 1.0

The f1_score for RandomFor

### 4.GradientBoostingClassifier

In [53]:
# the realization of GradientBoostingClassifier
def GradientBoost_model(X_trian, X_test, y_train, y_test):
    """Fit a gradient-boosting ensemble and report its test-set metrics.

    Trains a ``GradientBoostingClassifier`` with 100 estimators on the
    training split, prints the confusion matrix (for labels 0 and 1), the ROC
    AUC and the F1 score of the hard predictions on the test split.

    Parameters
    ----------
    X_trian : training features (the misspelled name is kept so existing
        keyword callers keep working)
    y_train : training labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    The mean accuracy on the test split, as returned by ``clf.score``.
    """
    # Fit on the argument that was passed in. The previous body referenced
    # ``X_train``, which is not a parameter of this function, so it silently
    # trained on whatever notebook-level ``X_train`` happened to exist.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=seed).fit(X_trian, y_train)
    y_pred = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)

    print('The confusion matrix for GradientBoost_model model is {}\n'.format(confusion_matrix(y_test, y_pred, labels=[0,1])))
    print('The roc_aux_score for GradientBoost_model model is {}\n'.format(roc_auc_score(y_test, y_pred)))
    print('The f1_score for GradientBoost_model model is {}\n'.format(f1_score(y_test, y_pred)))

    return accuracy

In [54]:
# Evaluate the GradientBoostingClassifier on every feature set, one after the other.
for key in dataset_dict:
    value = dataset_dict[key]
    X_train, X_test, y_train, y_test = data_split(value, label)
    accuracy = GradientBoost_model(X_train, X_test, y_train, y_test)
    print(
        'The performace of features {} on the GradientBoost_model model is {}\n'.format(
            key, accuracy
        )
    )

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features selected_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features thr_gram_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of features four_gram_features on the GradientBoost_model model is 1.0

The confusion matrix for GradientBoost_model model is [[42  0]
 [ 0 58]]

The roc_aux_score for GradientBoost_model model is 1.0

The f1_score for GradientBoost_model model is 1.0

The performace of

## Build the metrics dataframe for ensemble classifier