### Imports

In [1]:
import string
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shreya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load and Explore Data

In [3]:
import pandas as pd

# Load the CSV file
# this dataset includes: 39595 non-phishing emails, and 42891 phishing emails, based on the text content of the email
df = pd.read_csv('Datasets/CEAS_08.csv')
print(df.info())
print("______________________________")
# Randomly sample 5% of the rows
df_sampled = df.sample(frac=0.05, random_state=1)  # Adjust frac for the desired fraction
df_sampled.to_csv('CEAS_08_2.csv', index=False)

del df_sampled
del df

# Save the sampled data to a new CSV file
# this dataset includes: 39595 non-phishing emails, and 42891 phishing emails, based on the text content of the email
df = pd.read_csv("CEAS_08_2.csv")
print(df.info())
count = df['label'].value_counts()
print(count)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB
None
______________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    1958 non-null   object
 1   receiver  1931 non-null   object
 2   date      1958 non-null   object
 3   subject   1957 non-null   object
 4   body      1958 non-null   object
 5   label     1958 non-null   int64 
 6   urls      1958 non-null   int64 
dtypes: int64(2)

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Luann Morrow <akstcaccompraguemnsdgs@accomprag...,user2.10@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 01:50:27 -0300",Exclusive Rx Reductions,\nExtraordinary RX Options\n\nhttp://anythingc...,1,1
1,herrmann chanshin <lambert@nsbienstock.com>,user2.10@gvc.ceas-challenge.cc,"Thu, 07 Aug 2008 07:54:22 +0000",Eden the sensual slut gets her pussy torn apart,TZHdJca\nLesbian Teen Hunter!! >>>\t,1,0
2,"""\\""Martin v. Löwis\\"""" <qpnysl@v.loewis.de>",Christian Heimes <wluhe@cheimes.de>,"Wed, 06 Aug 2008 01:53:53 +0100",Re: [Python-Dev] Fixing buildbot/external(-amd...,>> - vcbuild db-4.4.20\\build_win32\\Berkele...,0,1
3,Morgan Mayo <dwtmpm@tmp.dk>,user2.5@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 09:59:42 +0600",Get the cheapest software offer!,\n Brilliant opportunity to get software right...,1,1
4,Sherman Santos <Sherman@satiz.it>,user7@gvc.ceas-challenge.cc,"Wed, 06 Aug 2008 14:28:33 +0300",Fondle all her internal nerve endings,We are glad to introduce you the results of th...,1,1


### Text and Preprocessing

In [4]:
for column in ['sender', 'receiver', 'date', 'subject', 'body']:
    df[column] = df[column].fillna('')

df['text_combined'] = df['sender'] + " " + df['receiver'] + " " + df['date'] + " " + df['subject'] + " " + df['body']

stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(df)):
    text = df["text_combined"].iloc[i].lower()
    text = text.translate(str.maketrans("", "", string.punctuation)).split()
    # stemming
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = " ".join(text)
    corpus.append(text)

In [5]:
corpus[0]

'luann morrow akstcaccompraguemnsdgsaccompraguecz user210gvcceaschallengecc wed 06 aug 2008 015027 0300 exclus rx reduct extraordinari rx option httpanythingcandospaceslivecomdefaultaspx wrong use rita r san francisco'

### Feature Extraction

In [6]:
# Vectorize
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
y = df.label

with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [7]:
del df
import gc
gc.collect()

0

### Model Selection

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Model Training and Evaluation

In [9]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

best_accuracy = 0
best_model = None
best_model_name = ""
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    if accuracy > best_accuracy:
        best_model_name = model_name
        best_model = model
        best_accuracy = accuracy


print(f"\nBest model: {best_model_name} with Accuracy: {best_accuracy:.4f}")

Logistic Regression Accuracy: 0.9796
Naive Bayes Accuracy: 0.9609
Support Vector Machine Accuracy: 0.8912
Random Forest Accuracy: 0.9745
K-Nearest Neighbors Accuracy: 0.6565

Best model: Logistic Regression with Accuracy: 0.9796


### Deployment

In [10]:
with open('email_phishing_detection.pkl', 'wb') as f:
    pickle.dump(best_model, f)