# Training the data

## Building a Random Forest Model

In [1]:
import nltk
import pandas as pd      # Importing the necessary files for the model
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#GBT Imports
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

#SVC Imports
from collections import Counter
from sklearn.svm import SVC

In [2]:
# original training data
train = pd.read_csv("data_with_features.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("data_with_features.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Submitting Host','Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines','new_email','domain','new_date'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for RFM

In [3]:
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.3)

In [4]:
t

Unnamed: 0,hops,special_characters_exists_subject,number_of_words_subject,number_of_capitalized_words_subject,number_of_capitalized_characters_subject,number_of_digits_subject,number_of_characters_subject,number_of_spaces_subject,number_of_special_characters_subject,number_of_single_Quotes_subject,number_of_semiColon_subject,ratio_of_uppercase/lowercase_words,Total_number_of_upperCase,Max_word_length_in_subject,spf_valid,blackListed,validate_date,Subject_length
0,3,0,4,0,0,0,19,4,0,0.0,0,0.0,0,9,1,0,1,23
1,2,0,3,0,0,0,9,2,0,0.0,0,0.0,0,6,1,0,1,11
2,9,1,6,3,7,0,15,6,4,0.0,0,1.0,3,8,1,0,1,25
3,1,0,3,0,2,0,12,2,0,0.0,0,0.0,0,7,1,0,1,14
4,6,0,2,0,1,0,2,1,0,0.0,0,0.0,0,2,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62389,4,0,2,0,1,0,2,1,0,0.0,0,0.0,0,2,1,0,1,3
62390,1,1,4,0,1,7,7,3,1,0.0,0,0.0,0,6,1,0,1,18
62391,1,0,4,1,7,0,26,4,0,0.0,0,0.3333333333333333,1,11,1,0,1,30
62392,4,0,2,0,1,0,2,1,0,0.0,0,0.0,0,2,1,0,1,3


In [5]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [6]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')

In [7]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Random Forest Model: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[ 6286   581]
 [  579 11273]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      6867
           1       0.95      0.95      0.95     11852

    accuracy                           0.94     18719
   macro avg       0.93      0.93      0.93     18719
weighted avg       0.94      0.94      0.94     18719

Metric for Random Forest Model: Precision: 0.951 | Recall: 0.951 | Accuracy: 0.938


## Building a Support Vector Classifier (SVC) Model

In [8]:
# original training data
train = pd.read_csv("data_with_features.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("data_with_features.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Submitting Host','Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines','new_email','domain','new_date'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for SVC

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.3)

In [10]:
from sklearn.svm import SVC
svclassifier = SVC()
svclassifer_model = svclassifier.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [11]:
y_pred = svclassifer_model.predict(X_test)


In [12]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')


In [13]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Support Vector Classifier: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[ 5300  1607]
 [ 1466 10346]]
              precision    recall  f1-score   support

           0       0.78      0.77      0.78      6907
           1       0.87      0.88      0.87     11812

    accuracy                           0.84     18719
   macro avg       0.82      0.82      0.82     18719
weighted avg       0.84      0.84      0.84     18719

Metric for Support Vector Classifier: Precision: 0.866 | Recall: 0.876 | Accuracy: 0.836


## Building a Gradient Boosted Tree (GBT)  Model

In [14]:
# original training data
train = pd.read_csv("data_with_features.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("data_with_features.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Submitting Host','Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines','new_email','domain','new_date'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for GBT

In [15]:
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.3)

In [16]:
gbt = GradientBoostingClassifier(n_estimators=50, learning_rate = 0.2, max_depth=20, max_features=2)
gbt_model = gbt.fit(X_train, y_train)

#### Test for precision, recall and accuracy

In [17]:
y_pred = gbt_model.predict(X_test)

In [18]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')

In [19]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print('Metric for Gradient Boosted Tree: Precision: {} | Recall: {} | Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

[[ 6111   526]
 [  630 11452]]
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      6637
           1       0.96      0.95      0.95     12082

    accuracy                           0.94     18719
   macro avg       0.93      0.93      0.93     18719
weighted avg       0.94      0.94      0.94     18719

Metric for Gradient Boosted Tree: Precision: 0.956 | Recall: 0.948 | Accuracy: 0.938
