## Script to populate suspicious keywords

In [None]:
# Run this script to extract commonly used phishing words from CSV or HTML-encoded email datasets.
# Make sure the extracted datasets are inside the root folder of this script.

## Imports/Load/Extraction


In [None]:
import os
import html
import re
from pathlib import Path

def clean_html(raw_html: str) -> str:
    """Convert an HTML fragment to plain text.

    Entities (``&nbsp;``, ``&amp;``, ...) are decoded first, every ``<...>``
    tag is then replaced by a space, and finally runs of whitespace are
    collapsed to a single space and trimmed from both ends.
    """
    decoded = html.unescape(raw_html)
    untagged = re.sub(r'<[^>]+>', ' ', decoded)
    return re.sub(r'\s+', ' ', untagged).strip()

def extract_body_from_raw_email(raw_content: str) -> str:
    """Return the cleaned body text of a raw email message.

    CRLF line endings are normalised to LF. Everything after the first blank
    line is treated as the body; if the message has no blank line, the whole
    message is used. The result is passed through ``clean_html``.
    """
    normalized = raw_content.replace('\r\n', '\n')
    _headers, separator, remainder = normalized.partition('\n\n')
    body = remainder if separator else normalized
    return clean_html(body)

def load_and_extract_words(folder_path: str) -> list:
    """Collect lowercase word tokens from every email file in a folder.

    Args:
        folder_path: Directory whose immediate files are raw email messages.
            Sub-directories are not descended into.

    Returns:
        A flat list of tokens (runs of two or more ``a-z`` letters), with
        duplicates kept so callers can count frequencies. Files are visited
        in sorted path order so the token order is reproducible across runs
        and platforms.

    Raises:
        OSError: If the folder itself cannot be listed (for example, it does
            not exist).
    """
    word_pattern = re.compile(r'\b[a-z]{2,}\b')
    words = []
    # iterdir() yields entries in arbitrary OS order; sorting keeps the
    # result (and any frequency tie-breaking done on it) deterministic.
    for file_path in sorted(Path(folder_path).iterdir()):
        if not file_path.is_file():
            continue
        try:
            # errors='ignore' drops undecodable bytes instead of failing
            raw = file_path.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            # Best effort: one unreadable file must not abort the whole
            # load, but it should not disappear silently either.
            print(f"Skipping unreadable file {file_path}: {exc}")
            continue
        body = extract_body_from_raw_email(raw)
        words.extend(word_pattern.findall(body.lower()))
    return words

In [None]:
import pandas as pd
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import names

required_packages = [
    "vader_lexicon",  # for sentiment analysis (VADER)
    "stopwords",      # for stopword removal
    "punkt",          # for tokenization
    "names"           # for removing person names
]

# nltk.data.find() resolves resources as "<category>/<name>" relative to the
# nltk_data directories. Probing with the bare download id never matches, so
# every run would fall through to nltk.download(). Map each id to the path
# it is installed under instead.
nltk_resource_paths = {
    "vader_lexicon": "sentiment/vader_lexicon.zip",
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "names": "corpora/names",
}

for pkg in required_packages:
    try:
        nltk.data.find(nltk_resource_paths[pkg])  # already installed: nothing to do
    except LookupError:
        nltk.download(pkg, quiet=True)  # download once

# VADER analyzer instance shared by the scoring code
sia = SentimentIntensityAnalyzer()

# Email greetings and sign-offs to ignore on top of NLTK's English stop words
# (pronouns, articles, etc.)
custom_stopwords = {'dear', 'regards', 'hi', 'hello', 'thanks'}
stop_words = set(stopwords.words('english')) | custom_stopwords

# Lowercased person names; excluded because they do not add to risk
person_names = {name.lower() for name in names.words()}

# Load the labelled CSV from the current working directory
base_dir = os.getcwd()
csv_path = os.path.normpath(os.path.join(base_dir, "Phishing_validation_emails.csv"))
df = pd.read_csv(csv_path)
df['is_phishing'] = df['Email Type'].eq('Phishing Email')

# Load Kaggle EML folders; each corpus lives at <name>/<name> under base_dir
ham = []
for corpus in ("easy_ham", "hard_ham"):
    corpus_dir = os.path.normpath(os.path.join(base_dir, corpus, corpus))
    ham += load_and_extract_words(corpus_dir)
spam_dir = os.path.normpath(os.path.join(base_dir, "spam_2", "spam_2"))
spam = load_and_extract_words(spam_dir)
print(f"Ham words extracted: {len(ham)}")
print(f"Spam words extracted: {len(spam)}")


## Data Cleaning

In [None]:
# Keep words that are among spam's 500 most frequent tokens but not among
# ham's 500 most frequent, ignoring anything shorter than three letters.
ham_common = {word for word, _count in Counter(ham).most_common(500)}
spam_common = Counter(spam).most_common(500)
suspicious_keywords = [
    word for word, _count in spam_common
    if len(word) >= 3 and word not in ham_common
]

print("\nTop 20 suspicious phishing keywords:")
for rank, keyword in enumerate(suspicious_keywords[:20], start=1):
    print(f"{rank:2}. {keyword}")

In [None]:
# EML Pre-processing
def clean_keyword_list(keywords):
    """Normalise candidate keywords and drop the ones that carry no signal.

    Non-string entries are skipped. Every remaining entry is lowercased and
    stripped, then kept only if it is purely alphabetic and appears in
    neither ``stop_words`` nor ``person_names``. Input order and duplicates
    are preserved.
    """
    normalized = (w.lower().strip() for w in keywords if isinstance(w, str))
    return [
        word for word in normalized
        if word.isalpha() and word not in stop_words and word not in person_names
    ]


suspicious_keywords = clean_keyword_list(suspicious_keywords)

# Tokenize text using NLTK, keeping only alphabetic tokens that are neither
# stop words nor person names
def clean_and_tokenize(text):
    """Return the filtered, lowercased tokens of *text*.

    Anything that is not a ``str`` (for example a missing CSV cell) yields an
    empty list.
    """
    if not isinstance(text, str):
        return []
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words or token in person_names:
            continue
        kept.append(token)
    return kept

In [41]:
# CSV Extraction: route each email's tokens into the phishing or safe pool
phishing_words = []
safe_words = []

for email_text, flagged in zip(df['Email Text'], df['is_phishing']):
    bucket = phishing_words if flagged else safe_words
    bucket.extend(clean_and_tokenize(email_text))

print(phishing_words)
print(suspicious_keywords)
# NOTE: suspicious_keywords are deliberately not concatenated into
# phishing_words here; only the CSV-derived counts are reported.
print(f"Phishing words: {len(phishing_words)}")
print(f"Safe words: {len(safe_words)}")

['congratulations', 'gift', 'card', 'click', 'claim', 'prize', 'new', 'secure', 'message', 'bank', 'click', 'read', 'package', 'delivery', 'pending', 'please', 'provide', 'personal', 'information', 'confirm', 'delivery', 'package', 'delivery', 'pending', 'please', 'provide', 'personal', 'information', 'confirm', 'delivery', 'alert', 'unusual', 'login', 'attempt', 'detected', 'verify', 'account', 'clicking', 'subscription', 'expire', 'renew', 'continue', 'enjoying', 'services', 'subscription', 'expire', 'renew', 'continue', 'enjoying', 'services', 'package', 'delivery', 'pending', 'please', 'provide', 'personal', 'information', 'confirm', 'delivery', 'important', 'update', 'email', 'account', 'settings', 'avoid', 'service', 'interruption', 'new', 'secure', 'message', 'bank', 'click', 'read', 'alert', 'unusual', 'login', 'attempt', 'detected', 'verify', 'account', 'clicking', 'account', 'compromised', 'click', 'link', 'reset', 'password', 'immediately', 'account', 'compromised', 'click',

## Score Analysis

In [46]:
# Per-word occurrence counts, used to score words by how often they appear
# in phishing emails versus safe ones
phish_freq, safe_freq = (Counter(tokens) for tokens in (phishing_words, safe_words))

def score_word(word, phish_freq, safe_freq, analyzer=None):
    """Score how strongly *word* indicates phishing.

    The score is the phishing-to-safe frequency imbalance, scaled up for
    words with an emotional charge:

        phish_count / (safe_count + 1) * (1 + abs(compound sentiment))

    Args:
        word: Token to score.
        phish_freq: Mapping of word -> occurrences in phishing emails.
        safe_freq: Mapping of word -> occurrences in safe emails.
        analyzer: Object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` entry. Defaults to the
            module-level ``sia`` analyzer.

    Returns:
        The score as a float; ``0.0`` when the word never occurs in
        ``phish_freq``.
    """
    # .get() on both mappings so plain dicts work as well as Counters
    # (indexing a plain dict with an unseen word would raise KeyError).
    phish_count = phish_freq.get(word, 0)
    safe_count = safe_freq.get(word, 0)
    imbalance = phish_count / (safe_count + 1)  # +1 avoids divide-by-zero
    scorer = sia if analyzer is None else analyzer
    # Sentiment extremeness (words with emotional charge get higher weight)
    sentiment_extreme = abs(scorer.polarity_scores(word)['compound'])
    return imbalance * (1 + sentiment_extreme)

# Scoring of words from both csv and eml
# Score every CSV phishing word exactly once; the same results feed both the
# keyword table and the score range used for the EML keywords below.
csv_scores = {}
for word in phish_freq:
    print(word)
    score = score_word(word, phish_freq, safe_freq)
    print(score)
    csv_scores[word] = score

# Only words with a positive score are kept as keywords
word_scores_dict = {word: score for word, score in csv_scores.items() if score > 0}

# Set base score and proportional decrement
n = len(suspicious_keywords)
temp_scores = list(csv_scores.values())
highest = max(temp_scores, default=1)
lowest = min(temp_scores, default=0)
max_score = highest * 0.9   # near top of your score range
# Near the lower edge, but never above max_score: when the CSV scores are
# close together, lowest + 10% of highest would exceed 90% of highest and the
# ramp below would rise instead of fall.
min_score = min(lowest + highest * 0.1, max_score)

for idx, word in enumerate(suspicious_keywords):
    # linear decrease from max_score to min_score
    auto_score = max_score - ((max_score - min_score) * (idx / max(n - 1, 1)))
    # keep the higher score if already exists
    word_scores_dict[word] = max(word_scores_dict.get(word, auto_score), auto_score)

word_scores = sorted(word_scores_dict.items(), key=lambda x: x[1], reverse=True)

congratulations
163.1388
gift
146.92079999999999
card
102.0
click
305.0
claim
102.0
prize
154.08120000000002
new
103.0
secure
1.2214159292035398
message
103.0
bank
103.0
read
103.0
package
93.0
delivery
186.0
pending
93.0
please
0.6591
provide
1.0108695652173914
personal
93.0
information
203.0
confirm
93.0
alert
123.12
unusual
188.0
login
95.0
attempt
95.0
detected
95.0
verify
95.0
account
377.0
clicking
95.0
subscription
1.1428571428571428
expire
112.0
renew
112.0
continue
222.0
enjoying
170.9904
services
112.0
important
107.0047
update
2.1630434782608696
email
89.0
settings
89.0
avoid
248.832
service
199.0
interruption
121.1468
compromised
100.0
link
100.0
reset
100.0
password
100.0
immediately
100.0
noticed
93.0
activity
93.0
log
93.0
review
0.9702970297029703
recent
93.0
transactions
93.0
payment
110.0
declined
110.0
billing
110.0
using
110.0
invoice
103.0
attached
0.4951923076923077
pay
113.5781
promptly
103.0
penalties
103.0


## Overwrite sus_keywords.txt

In [47]:
# DO NOT run this unless you are overwriting keyword detector data
# Output format: one "<word>\t<score>" line per keyword, score to 4 decimals
with open("../Website/config/sus_keywords.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{word}\t{score:.4f}\n" for word, score in word_scores)

print(f"\n✅ Saved {len(word_scores)} keywords to sus_keywords")


✅ Saved 221 keywords to sus_keywords
