In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score,confusion_matrix,precision_score,recall_score,accuracy_score
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import joblib

In [2]:
df = pd.read_csv('../../datasets/phishing_emails_kaggle/Phishing_Email.csv')
df.drop(columns=['Unnamed: 0'],inplace=True)
df = df.dropna()

In [3]:
df.columns = ["Content","Category"]
df["Category"] = df.Category.map({'Safe Email':0,'Phishing Email':1})

In [4]:
dataset = df.values
np.random.shuffle(dataset)

In [5]:
X = dataset[:,0]
Y = dataset[:,1]
Y= Y.astype('int32')

In [6]:
import sys
sys.path.append('../')
from processing.fix_contents import separate_email, remove_css

lemmatizer = WordNetLemmatizer()
for i in range(X.shape[0]):
    content = separate_email(X[i])[0]
    content = remove_css(content)
    X[i] = (" ").join([lemmatizer.lemmatize(word.lower(), pos='v') for word in content.split(" ")])

In [7]:
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [8]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy Score :" + str(accuracy_score(Y_true,Y_predicted)))
    print("Precision Score :" + str(precision_score(Y_true,Y_predicted)))
    print("Recall Score :" + str(recall_score(Y_true,Y_predicted)))
    print("ROC AUC Score :" + str(roc_auc_score(Y_true,Y_predicted)))
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true,Y_predicted))

In [19]:
svm_model = joblib.load('../../trained_models/SVM_PhishTrain.joblib')
naiveBayes_model = joblib.load('../../trained_models/NB_PhishTrain.joblib')
logReg_model = joblib.load('../../trained_models/logReg_PhishTrain.joblib')

In [20]:
predicted_y = (svm_model.predict(X_transformed)+naiveBayes_model.predict(X_transformed)+logReg_model.predict(X_transformed)) // 3

In [21]:
print_metrics(Y, predicted_y)

Accuracy Score :0.9751529462273264
Precision Score :0.9713695801789402
Recall Score :0.9651258205689278
ROC AUC Score :0.9733772540399843
Confusion Matrix : 

[[11114   208]
 [  255  7057]]
