# Training the data

## Building a Random Forest Model

In [271]:
import nltk
import pandas as pd      # Importing the necessary files for the model
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [272]:
# original training data
train = pd.read_csv("data_with_features.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("train_data.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for RFM

In [273]:
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.2)


In [274]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [276]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')

In [277]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Random Forest Model: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[2802  352]
 [ 354 5227]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3154
           1       0.94      0.94      0.94      5581

    accuracy                           0.92      8735
   macro avg       0.91      0.91      0.91      8735
weighted avg       0.92      0.92      0.92      8735

Metric for Random Forest Model: Precision: 0.937 | Recall: 0.937 | Accuracy: 0.919


## Building a Support Vector Classifier (SVC) Model

In [278]:
import os
import pandas as pd
import numpy as np         # Importing the necessary files for the model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [279]:
# original training data
train = pd.read_csv("train_data.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("train_data.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for SVC

In [280]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.2)

In [281]:
from sklearn.svm import SVC
svclassifier = SVC()
svclassifer_model = svclassifier.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [282]:
y_pred = svclassifer_model.predict(X_test)


In [283]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')


In [284]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Support Vector Classifier: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[1963 1238]
 [ 816 4718]]
              precision    recall  f1-score   support

           0       0.71      0.61      0.66      3201
           1       0.79      0.85      0.82      5534

    accuracy                           0.76      8735
   macro avg       0.75      0.73      0.74      8735
weighted avg       0.76      0.76      0.76      8735

Metric for Support Vector Classifier: Precision: 0.792 | Recall: 0.853 | Accuracy: 0.765


## Building a Gradient Boosted Tree (GBT)  Model

In [285]:
import os
import pandas as pd                       # Importing the necessary files for the model
import numpy as np
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [286]:
# original training data
train = pd.read_csv("train_data.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("train_data.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for GBT

In [287]:
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.2)

In [288]:
gbt = GradientBoostingClassifier(n_estimators=50, learning_rate = 0.2, max_depth=20, max_features=2)
gbt_model = gbt.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [290]:
y_pred = gbt_model.predict(X_test)

In [291]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')

In [292]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Gradient Boosted Tree: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[2867  325]
 [ 429 5114]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      3192
           1       0.94      0.92      0.93      5543

    accuracy                           0.91      8735
   macro avg       0.91      0.91      0.91      8735
weighted avg       0.91      0.91      0.91      8735

Metric for Gradient Boosted Tree: Precision: 0.94 | Recall: 0.923 | Accuracy: 0.914
