### Imports

In [1]:
import string
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shreya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load and Explore Data

In [3]:
df = pd.read_csv("phishing_email.csv")
print(df.info())
count = df['label'].value_counts()
print(count)
# this dataset includes: 39595 non-phishing emails, and 42891 phishing emails, based on the text content of the email
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82486 entries, 0 to 82485
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_combined  82486 non-null  object
 1   label          82486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None
label
1    42891
0    39595
Name: count, dtype: int64


Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0


In [4]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('phishing_email.csv')

# Randomly sample 50% of the rows
df_sampled = df.sample(frac=0.05, random_state=1)  # Adjust frac for the desired fraction
df_sampled.to_csv('phishing_email_2.csv', index=False)

del df_sampled
del df

# Save the sampled data to a new CSV file
df = pd.read_csv("phishing_email_2.csv")
print(df.info())
count = df['label'].value_counts()
print(count)
# this dataset includes: 39595 non-phishing emails, and 42891 phishing emails, based on the text content of the email
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4124 entries, 0 to 4123
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_combined  4124 non-null   object
 1   label          4124 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 64.6+ KB
None
label
1    2158
0    1966
Name: count, dtype: int64


Unnamed: 0,text_combined,label
0,godaddycom wjysgzgodaddycom dear tony meyer we...,0
1,fw duke transaction original message collonges...,0
2,fercinfo 2 whole picture sally fyi jeff hodge ...,0
3,usaa usasecuritywhosaycom ensure delivery inbo...,1
4,dave funk dinsytengineeringuiowaedu wed 27 feb...,0


### Text and Preprocessing

In [5]:
stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(df)):
    text = df["text_combined"].iloc[i].lower()
    text = text.translate(str.maketrans("", "", string.punctuation)).split()
    # stemming
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = " ".join(text)
    corpus.append(text)

In [6]:
corpus[0]

'godaddycom wjysgzgodaddycom dear toni meyer weekend treat someth special place internet youll find everyth need godaddycom host email shop cart secur certif weve got dont forget domain name come need build web site start blog creat onlin photo album much place order midnight pacif time sunday march 9 2008 save 10 40 minimum order get discount simpli enter sourc code gdp0309c shop cart mention code call thank alway go daddi custom sincer bob parson ceo founder godaddycom place sourc code gdp0309c cart order save share offer famili friend applic discount domain product discount membership mainten plan addit disk space bandwidth renew poster gift card discount reflect shop cart use conjunct offer promot quick blogcast free web site email account valu 9617yr purchas separ free host photo album quick blogcast servic adsupport comparison price accur 372008 subject chang without notic sure listen go daddi live formerli life onlin bob parsonst everi wednesday 1 pm pt4 pm et live via internet 

### Feature Extraction

In [7]:
# Vectorize
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
y = df.label

with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [8]:
del df
import gc
gc.collect()

0

### Model Selection

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Model Training and Evaluation

In [10]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

best_accuracy = 0
best_model = None
best_model_name = ""
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    if accuracy > best_accuracy:
        best_model_name = model_name
        best_model = model
        best_accuracy = accuracy


print(f"\nBest model: {best_model_name} with Accuracy: {best_accuracy:.4f}")

Logistic Regression Accuracy: 0.9588
Naive Bayes Accuracy: 0.9208
Support Vector Machine Accuracy: 0.8223
Random Forest Accuracy: 0.9483
K-Nearest Neighbors Accuracy: 0.7342

Best model: Logistic Regression with Accuracy: 0.9588


### Deployment

In [11]:
with open('email_phishing_detection.pkl', 'wb') as f:
    pickle.dump(best_model, f)