In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os
import json

# List the GPUs TensorFlow can see. An empty list means the rest of the
# notebook will run on CPU.
gpus = tf.config.list_physical_devices('GPU')
print(f"GPU available: {gpus}")

# Ask TensorFlow to allocate GPU memory on demand instead of grabbing it all
# up front. Iterating over the list (rather than indexing gpus[0]) keeps this
# cell from raising IndexError on machines without a GPU, and applies the same
# setting to every visible device.
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be set before the GPUs are initialised; this is
    # hit when the cell is re-run in a kernel that has already used the GPU.
    print(f"Could not enable GPU memory growth: {err}")

In [None]:
# Read the raw email dataset from a CSV located in the notebook's working
# directory; later cells shuffle, relabel and split this frame.
file_path = "Phishing_Email.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
#columns are ID, email_text, email_type
# Shuffle reproducibly, then encode the label column as 0 (safe) / 1 (phishing).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['email_type'] = df['email_type'].map({'Safe Email': 0, 'Phishing Email': 1})

# Keep at most the first 72000 shuffled rows. The .copy() makes the result an
# independent frame so the column assignment below does not write into a
# slice of the shuffled frame.
df = df.iloc[:72000].copy()

# The dataset has a lot of null email bodies. Replace them with empty strings
# BEFORE splitting so every split inherits clean text; calling astype(str) on
# its own would turn each NaN into the literal string "nan".
df['email_text'] = df['email_text'].fillna("").astype(str)

# Split into train (67%) and a temporary hold-out (33%), then halve the
# hold-out into validation and test sets.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['email_text'], df['email_type'], test_size=0.33, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Sanity check: preview the training texts and confirm they are Python str.
print(train_texts.head())
print(type(train_texts.iloc[0]))

In [None]:
# Tokenizer / padding settings shared by the tokenizer and the model.
max_vocab_size = 20000
max_length = 500

# Build the vocabulary from the training texts only; "<OOV>" is the
# placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert each split's texts to integer sequences.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Pad or truncate every sequence at its end so all rows have max_length ids.
train_padded, val_padded, test_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding='post', truncating='post')
    for split_sequences in (train_sequences, val_sequences, test_sequences)
)

In [None]:
# Convert the label containers to plain numpy arrays for Keras.
train_labels, val_labels, test_labels = (
    np.array(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

# Stacked bidirectional-LSTM binary classifier, assembled layer by layer.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_length))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True)))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(64)))
# Dense head; the two dropout layers are there to reduce overfitting.
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.3))
# Single sigmoid unit: output is a score between 0 and 1.
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Build explicitly with the padded sequence length so summary() can report
# layer output shapes and parameter counts before training starts.
model.build(input_shape=(None, max_length))

model.summary()

In [None]:
# Training hyperparameters. They are passed to fit() below, so editing them
# here actually changes the training run.
epochs = 10
batch_size = 32

history = model.fit(
    train_padded,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_padded, val_labels),
    verbose=1,
)

model.save("phishing_model.h5")#save model
print("Model saved as phishing_model.h5")

# Save the fitted tokenizer alongside the model so the same word-to-id
# mapping can be restored later.
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

# Reload the model from disk, e.g. to keep training from the saved file.
# From here on `model` is the reloaded copy, so the evaluation below runs
# against what was actually written to phishing_model.h5.
if os.path.exists("phishing_model.h5"):
    model = keras.models.load_model("phishing_model.h5")
    print("Model loaded for further training.")

#evaluate on test
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=1)
print(f"Test Accuracy: {test_acc * 100:.2f}%")