# Training the data

## Building a Random Forest Model

In [129]:
import nltk
import pandas as pd      # Importing the necessary files for the model
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [130]:
# original training data
train = pd.read_csv("train_data.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("train_data.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for RFM

In [131]:
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.2)

In [132]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [133]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.255660367880319, 'number_of_characters_subject'),
 (0.19004099110292866, 'Max_word_length_in_subject'),
 (0.15079796719932492, 'number_of_capitalized_characters_subject'),
 (0.08303399333323848, 'number_of_words_subject'),
 (0.07429548551575159, 'number_of_spaces_subject'),
 (0.06837214539477443, 'number_of_special_characters_subject'),
 (0.05641336356603787, 'ratio_of_uppercase/lowercase_words'),
 (0.04504091077618009, 'number_of_digits_subject'),
 (0.027970642153960088, 'Total_number_of_upperCase'),
 (0.026003595058311724, 'number_of_capitalized_words_subject')]

#### Test for precision, recall and accuracy

In [134]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')

In [135]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.941 / Recall: 0.926 / Accuracy: 0.915


In [136]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2804  327]
 [ 416 5188]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      3131
           1       0.94      0.93      0.93      5604

    accuracy                           0.91      8735
   macro avg       0.91      0.91      0.91      8735
weighted avg       0.92      0.91      0.92      8735



In [137]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.9149398969662278

## Building a SVM (Support Vector Machine)  Model

In [115]:
import os
import pandas as pd
import numpy as np         # Importing the necessary files for the model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC


In [116]:
# original training data
train = pd.read_csv("train_data.csv",dtype='unicode')

# training data without labels
t = pd.read_csv("train_data.csv",dtype='unicode')
labels = np.array(t['Label'])
t = t.drop(['Label','Return-Path','Message-ID','From','Reply-To','To','Subject','Date','X-Mailer','MIME-Version','Content-Type','X-Priority','X-MSMail-Priority','Status','Content-Length','Content-Transfer-Encoding','Lines'], axis = 1)

# get all the features
feature_list = list(t.columns)
features = np.array(t)

### Training the Data for SVM

In [117]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(t, train['Label'], test_size=0.2)

In [118]:
from sklearn.svm import SVC
svclassifier = SVC()
svclassifer_model = svclassifier.fit(X_train, y_train)




#### Test for precision, recall and accuracy

In [119]:
y_pred = svclassifer_model.predict(X_test)

In [120]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label='1', average='binary')


In [121]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.788 / Recall: 0.85 / Accuracy: 0.758


In [122]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1864 1277]
 [ 839 4755]]
              precision    recall  f1-score   support

           0       0.69      0.59      0.64      3141
           1       0.79      0.85      0.82      5594

    accuracy                           0.76      8735
   macro avg       0.74      0.72      0.73      8735
weighted avg       0.75      0.76      0.75      8735



In [123]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.7577561534058386

## Testing the SVM using Sigmoid Kernel

In [124]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [125]:
y_pred3 = svclassifier.predict(X_test)

In [126]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred3))

[[1864 1277]
 [ 839 4755]]
              precision    recall  f1-score   support

           0       0.54      0.58      0.56      3141
           1       0.75      0.73      0.74      5594

    accuracy                           0.67      8735
   macro avg       0.65      0.65      0.65      8735
weighted avg       0.68      0.67      0.67      8735

