In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,confusion_matrix,precision_score,recall_score,accuracy_score


from nltk.stem import WordNetLemmatizer

df = pd.read_csv('../../datasets/phishing_emails_kaggle/Phishing_Email.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [2]:
df.drop(columns=['Unnamed: 0'],inplace=True)
df.head()

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [3]:
df.isna().sum()

Email Text    16
Email Type     0
dtype: int64

In [4]:
df = df.dropna()
print(df.isna().sum())
df.shape

Email Text    0
Email Type    0
dtype: int64


(18634, 2)

In [5]:
df.columns = ["Content","Category"]

In [6]:
df["Category"] = df.Category.map({'Safe Email':0,'Phishing Email':1})
df.head()

Unnamed: 0,Content,Category
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,the other side of * galicismos * * galicismo *...,0
2,re : equistar deal tickets are you still avail...,0
3,\nHello I am your hot lil horny toy.\n I am...,1
4,software at incredibly low prices ( 86 % lower...,1


In [7]:
dataset = df.values
np.random.shuffle(dataset)

In [8]:
X = dataset[:,0]
Y = dataset[:,1]
Y= Y.astype('int32')

In [9]:
import sys
sys.path.append('../')
from processing.fix_contents import separate_email, remove_css

lemmatizer = WordNetLemmatizer()
for i in range(X.shape[0]):
    content = separate_email(X[i])[0]
    content = remove_css(content)
    X[i] = (" ").join([lemmatizer.lemmatize(word.lower(), pos='v') for word in content.split(" ")])

In [10]:
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [11]:
X_train = X_transformed[0:13000,:]
Y_train = Y[0:13000]
X_test = X_transformed[13000:,:]
Y_test = Y[13000:]

In [12]:
X_train.shape

(13000, 149031)

In [13]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy Score :" + str(accuracy_score(Y_true,Y_predicted)))
    print("Precision Score :" + str(precision_score(Y_true,Y_predicted)))
    print("Recall Score :" + str(recall_score(Y_true,Y_predicted)))
    print("ROC AUC Score :" + str(roc_auc_score(Y_true,Y_predicted)))
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true,Y_predicted))

In [15]:
model = LogisticRegression(max_iter=900)
model.fit(X_train,Y_train)

In [16]:
Y_predicted_LG = model.predict(X_test)

In [17]:
print_metrics(Y_test,Y_predicted_LG)

Accuracy Score :0.963258785942492
Precision Score :0.947069116360455
Recall Score :0.9617947578853843
ROC AUC Score :0.963013843619015
Confusion Matrix : 

[[3262  121]
 [  86 2165]]


In [21]:
import joblib

joblib.dump(model, '../../trained_models/logReg_PhishTrain.joblib') # saving model for later use
joblib.dump(vectorizer, '../../trained_models/vectorizer.joblib')

['../../trained_models/vectorizer.joblib']