In [1]:
import pandas as pd
from flask import Flask, render_template, request
import pickle
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# Load the pickled artifacts used for scoring further down: `model` is later
# called via predict()/predict_proba() and `vectorizer` via transform().
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so these .pkl files must come from a trusted source.
# The context managers guarantee each file handle is closed after loading.
with open('email_phishing_detection.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('count_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [3]:
# Load the full CEAS_08 corpus and report its column summary.
df = pd.read_csv('Datasets/CEAS_08.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("______________________________")

# Randomly sample 3% of the rows (fixed seed, so the sample is reproducible)
# and save the sampled data to a new CSV file.
df_sampled = df.sample(frac=0.03, random_state=1)
df_sampled.to_csv('CEAS_08_sample.csv', index=False)

# Release both frames; only the sampled CSV is used from here on.
del df_sampled
del df

# Reload the sample from disk so `df` now refers to the 3% subset.
# NOTE(review): an earlier note here stated "this dataset includes: 39595
# non-phishing emails, and 42891 phishing emails, based on the text content of
# the email". Those counts should be verified against the row total reported by
# info() for the full file and the value_counts() output below.
df = pd.read_csv("CEAS_08_sample.csv")
df.info()
count = df['label'].value_counts()
print(count)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB
None
______________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    1175 non-null   object
 1   receiver  1156 non-null   object
 2   date      1175 non-null   object
 3   subject   1174 non-null   object
 4   body      1175 non-null   object
 5   label     1175 non-null   int64 
 6   urls      1175 non-null   int64 
dtypes: int64(2)

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Luann Morrow <akstcaccompraguemnsdgs@accomprag...,user2.10@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 01:50:27 -0300",Exclusive Rx Reductions,\nExtraordinary RX Options\n\nhttp://anythingc...,1,1
1,herrmann chanshin <lambert@nsbienstock.com>,user2.10@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 07:54:22 +0000",Eden the sensual slut gets her pussy torn apart,TZHdJca\nLesbian Teen Hunter!! >>>\t,1,0
2,"""\\""Martin v. Löwis\\"""" <qpnysl@v.loewis.de>",Christian Heimes <wluhe@cheimes.de>,"Wed, 06 Aug 2008 01:53:53 +0100",Re: [Python-Dev] Fixing buildbot/external(-amd...,>> - vcbuild db-4.4.20\\build_win32\\Berkele...,0,1
3,Morgan Mayo <dwtmpm@tmp.dk>,user2.5@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 09:59:42 +0600",Get the cheapest software offer!,\n Brilliant opportunity to get software right...,1,1
4,Sherman Santos <Sherman@satiz.it>,user7@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 14:28:33 +0300",Fondle all her internal nerve endings,We are glad to introduce you the results of th...,1,1


In [4]:
# Text fields that are merged into the single string fed to the model.
text_columns = ['sender', 'receiver', 'date', 'subject', 'body']

# Replace missing values with empty strings so concatenation never yields NaN.
df[text_columns] = df[text_columns].fillna('')

# Join the fields left to right, separated by single spaces.
combined_text = df[text_columns[0]]
for field_name in text_columns[1:]:
    combined_text = combined_text + " " + df[field_name]
df['text_combined'] = combined_text

In [5]:
def preprocess(email):
    """Normalize one email's text into a space-joined string of stemmed tokens.

    Every character in ``string.punctuation`` is removed, the remainder is
    split on whitespace, tokens found in the global ``stopwords_set`` are
    dropped, and each surviving token is passed through the global
    ``stemmer``'s ``stem`` method.

    Args:
        email: Text of the email as a ``str``. No case folding is done here.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces (an empty
        string when no tokens remain).
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    tokens = email.translate(punctuation_stripper).split()

    stemmed_tokens = []
    for token in tokens:
        if token in stopwords_set:
            continue  # stopwords are discarded before stemming
        stemmed_tokens.append(stemmer.stem(token))

    return " ".join(stemmed_tokens)

In [6]:
# Resources that preprocess() looks up as module-level globals.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

total = len(df)

# Lower-case and clean every email up front so the vectorizer and the model
# are each invoked once on the whole batch rather than once per row.
preprocessed_emails = [preprocess(text.lower()) for text in df["text_combined"]]

if total:
    features = vectorizer.transform(preprocessed_emails)
    predictions = model.predict(features)
    # Column 1 of predict_proba is the value the per-row output reports.
    probabilities = model.predict_proba(features)[:, 1]
else:
    # Nothing to score; skip the model calls on an empty batch.
    predictions, probabilities = [], []

correct_predictions = 0
for prediction, probability, curr_label in zip(predictions, probabilities, df['label']):
    print(f"Model Prediction: {prediction} | Model Probability: {probability} | Current Label: {curr_label}")
    if prediction == curr_label:
        correct_predictions += 1

# Guard the ratio so an empty sample reports cleanly instead of raising
# ZeroDivisionError.
if total:
    print(f"correctness accuracy: {correct_predictions}/{total} or {correct_predictions/total}")
else:
    print("correctness accuracy: 0/0 (no emails to evaluate)")

Model Prediction: 1 | Model Probability: 0.9812602543007566 | Current Label: 1
Model Prediction: 1 | Model Probability: 0.9819523429880604 | Current Label: 1
Model Prediction: 0 | Model Probability: 0.00040655722325414373 | Current Label: 0
Model Prediction: 1 | Model Probability: 0.9955550637206892 | Current Label: 1
Model Prediction: 1 | Model Probability: 0.9913374137188772 | Current Label: 1
Model Prediction: 1 | Model Probability: 0.9961367719290697 | Current Label: 1
Model Prediction: 0 | Model Probability: 6.356028427205966e-15 | Current Label: 0
Model Prediction: 1 | Model Probability: 0.9876111310850748 | Current Label: 1
Model Prediction: 0 | Model Probability: 0.013171721622506334 | Current Label: 0
Model Prediction: 0 | Model Probability: 0.00793855065579992 | Current Label: 0
Model Prediction: 1 | Model Probability: 0.9741578056639922 | Current Label: 1
Model Prediction: 1 | Model Probability: 0.9950270519221761 | Current Label: 1
Model Prediction: 1 | Model Probability: 0