In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# PHASE 1: Data Collection

In [2]:
print("Loading dataset...")
file_path = "/kaggle/input/phishingemails/Phishing_Email.csv"  # Updated path
try:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip().str.lower()
    df['email text'] = df['email text'].fillna('')
    print("Dataset Loaded. Sample:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()


Loading dataset...
Dataset Loaded. Sample:
   unnamed: 0                                         email text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       email type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


# PHASE 2: Text Preprocessing

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

print("Preprocessing data...")
nltk.download('stopwords')
nltk.download('punkt')
df['email text'] = df['email text'].apply(preprocess_text)
print("Preprocessing completed.")

Preprocessing data...
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Preprocessing completed.


# PHASE 3: Feature Extraction

In [4]:
print("Extracting features using TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['email text'])
y = df['email type'].astype('category').cat.codes
print("Feature extraction completed.")

Extracting features using TF-IDF...
Feature extraction completed.


# PHASE 4: Model Training - Naïve Bayes & Logistic Regression

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
print("Training Naïve Bayes model...")
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Training Logistic Regression model...")
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

Training Naïve Bayes model...
Training Logistic Regression model...


# PHASE 5: Evaluation

In [6]:
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='weighted'))
    print("Recall:", recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))

evaluate_model(y_test, y_pred_nb, "Naïve Bayes")
evaluate_model(y_test, y_pred_lr, "Logistic Regression")


Naïve Bayes Performance:
Accuracy: 0.9533512064343164
Precision: 0.9533402545788052
Recall: 0.9533512064343164
F1 Score: 0.9533454439217011

Logistic Regression Performance:
Accuracy: 0.9654155495978552
Precision: 0.9654944404391096
Recall: 0.9654155495978552
F1 Score: 0.9654427597728343


# PHASE 6: Word2Vec Embedding for LSTM

In [7]:
print("Training Word2Vec model...")
sentences = [text.split() for text in df['email text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
embedding_matrix = np.zeros((len(w2v_model.wv) + 1, 100))
for i, word in enumerate(w2v_model.wv.index_to_key):
    embedding_matrix[i] = w2v_model.wv[word]
print("Word2Vec training completed.")


Training Word2Vec model...
Word2Vec training completed.


# PHASE 7: LSTM Model Training

In [8]:
print("Preparing LSTM model...")
max_len = 100
vocab_size = len(w2v_model.wv) + 1
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train.toarray(), maxlen=max_len)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test.toarray(), maxlen=max_len)

model = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
    LSTM(100, return_sequences=True),
    LSTM(50),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print("Training LSTM model...")
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))


Preparing LSTM model...




Training LSTM model...
Epoch 1/5
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 125ms/step - accuracy: 0.6096 - loss: 0.6735 - val_accuracy: 0.6094 - val_loss: 0.6716
Epoch 2/5
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 127ms/step - accuracy: 0.6129 - loss: 0.6697 - val_accuracy: 0.6094 - val_loss: 0.6703
Epoch 3/5
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 125ms/step - accuracy: 0.6092 - loss: 0.6698 - val_accuracy: 0.6094 - val_loss: 0.6698
Epoch 4/5
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 127ms/step - accuracy: 0.6104 - loss: 0.6691 - val_accuracy: 0.6094 - val_loss: 0.6692
Epoch 5/5
[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 128ms/step - accuracy: 0.6084 - loss: 0.6699 - val_accuracy: 0.6094 - val_loss: 0.6693


<keras.src.callbacks.history.History at 0x7c61974715d0>

# PHASE 8: Evaluation of LSTM

In [9]:
y_pred_lstm = (model.predict(X_test_pad) > 0.5).astype("int32")
evaluate_model(y_test, y_pred_lstm, "LSTM")
print("Project Completed.")

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step

LSTM Performance:
Accuracy: 0.6093833780160858
Precision: 0.37134810140229574
Recall: 0.6093833780160858
F1 Score: 0.4614787333768326
Project Completed.


  _warn_prf(average, modifier, msg_start, len(result))
