In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import joblib
from imblearn.over_sampling import SMOTE

In [3]:
# Load dataset
df = pd.read_csv("email_phishing_data.csv")
df

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
524841,782,327,301,2,2,2,52,1,0
524842,36,30,11,0,0,0,4,0,1
524843,61,46,11,0,0,0,3,0,0
524844,213,136,89,0,0,0,18,0,0


In [4]:
# Split into features and target
X = df.drop("label", axis=1)
y = df["label"]

In [5]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Balance the data
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)

In [7]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [8]:
# Train ANN
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), activation='relu', max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)



In [9]:
# Evaluate
from sklearn.metrics import classification_report, confusion_matrix

y_pred = mlp.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

Confusion Matrix:
 [[76432 27167]
 [21431 82129]]
Classification Report:
               precision    recall  f1-score   support

           0     0.7810    0.7378    0.7588    103599
           1     0.7514    0.7931    0.7717    103560

    accuracy                         0.7654    207159
   macro avg     0.7662    0.7654    0.7652    207159
weighted avg     0.7662    0.7654    0.7652    207159



In [10]:
# Save model
joblib.dump(mlp, "phishing_ann_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [14]:
pip install numpy pandas scikit-learn joblib beautifulsoup4 pyenchant


Note: you may need to restart the kernel to use updated packages.


In [15]:
import re
import numpy as np
import joblib
import enchant
from bs4 import BeautifulSoup

In [18]:

# Load trained model and scaler
mlp = joblib.load("phishing_ann_model.pkl")
scaler = joblib.load("scaler.pkl")

# Define keywords and stopwords
STOPWORDS = set([
    "the", "is", "in", "and", "to", "a", "of", "for", "on", "at", "with", "as", "by", "this", "that"
])
PHISH_KEYWORDS = ["urgent", "immediately", "verify", "click now", "login", "confirm", "password",
    "refund", "account", "suspended", "security", "update", "warning", "reset", "billing",
    "locked", "alert", "access", "support", "bonus", "gift card", "act now", "overdue"]

#STEP 1: Take raw email content from user
print("üìß Paste the full email content (subject + body):")
print("üëâ When you're done, press ENTER twice.\n")

email_text = ""
while True:
    line = input()
    if line == "":
        break
    email_text += line + "\n"

# STEP 2: Feature Extraction
email_text = BeautifulSoup(email_text, "html.parser").get_text()

# Extract words
words = re.findall(r'\b\w+\b', email_text.lower())
num_words = len(words)
num_unique_words = len(set(words))
num_stopwords = sum(1 for w in words if w in STOPWORDS)

# Link and email features
num_links = len(re.findall(r'https?://\S+', email_text))
num_unique_domains = len(set(re.findall(r'https?://([^/\s]+)', email_text)))
num_email_addresses = len(re.findall(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', email_text))

# Keyword features
num_urgent_keywords = sum(email_text.lower().count(k) for k in PHISH_KEYWORDS)

# Spelling errors
dictionary = enchant.Dict("en_US")
num_spelling_errors = sum(1 for w in words if not dictionary.check(w))

# Combine all features
features = [
    num_words,
    num_unique_words,
    num_stopwords,
    num_links,
    num_unique_domains,
    num_email_addresses,
    num_spelling_errors,
    num_urgent_keywords
]

# STEP 3: Predict with the trained ANN model
X_input_scaled = scaler.transform([features])
proba = mlp.predict_proba(X_input_scaled)[0][1]
pred = mlp.predict(X_input_scaled)[0]

#STEP 4: Output Result
print("\nüîç Feature Vector:", features)
print(f"üìä Probability of phishing: {proba:.2f}")
print("‚ö†Ô∏è PHISHING EMAIL!" if pred == 1 else "‚úÖ SAFE EMAIL")


üìß Paste the full email content (subject + body):
üëâ When you're done, press ENTER twice.



 From: support@bankoftrust.com To: customer@example.com Subject: Monthly Statement Available  Dear Valued Customer,  Your monthly bank statement for March 2025 is now available.  You can view or download your statement securely by logging into your online banking account.  Click here to log in  For your security, please do not share your password or personal information via email. If you have any concerns, contact our support team at support@bankoftrust.com.  Thank you for choosing Bank of Trust.  Sincerely, Bank of Trust Customer Service Website: www.bankoftrust.com
 



üîç Feature Vector: [89, 62, 11, 0, 0, 3, 4, 6]
üìä Probability of phishing: 0.12
‚úÖ SAFE EMAIL


