
# Enhanced Dynamic DOM Analysis for Phishing Detection

This notebook extracts dynamic DOM-based features from a dataset of URLs and trains a machine learning model to detect phishing websites.


In [2]:

import pandas as pd

df = pd.read_csv('./URL-improved dataset.csv')
df.head()


Unnamed: 0,url,type
0,https://www.google.com,legitimate
1,https://www.youtube.com,legitimate
2,https://www.facebook.com,legitimate
3,https://www.baidu.com,legitimate
4,https://www.wikipedia.org,legitimate


In [8]:
features_df = pd.read_csv('./dom_features.csv')
features_df.head()


Unnamed: 0,url,type,forms,inputs,iframes,scripts,images,buttons,domDepth,maxChildren,titleLength,onmouseoverEvents,externalResourceRatio,inlineStyles,phishingKeywordHits,usesHTTPS,hasEval
0,https://www.imdb.com/name/nm0642993/,legitimate,0,0,0,0,0,0,5,2,13,0,0.0,0,0,True,0
1,https://www.videosurf.com/samantha-mcleod-38476,legitimate,0,0,0,0,0,0,5,3,13,0,0.0,0,0,True,0
2,https://www.scottvestal.com/studio.htm,legitimate,0,1,0,2,0,3,11,105,13,0,0.0,0,3,True,0
3,https://www.learningpracticalturkish.com/turki...,legitimate,0,0,1,4,0,1,9,9,32,0,0.2,1,0,True,0
4,https://www.aepriverops.com/careers/,legitimate,0,1,0,2,0,3,11,95,13,0,0.0,0,2,True,0


In [9]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_shuffled = features_df.sample(frac=1, random_state=42).reset_index(drop=True)
X = df_shuffled.drop(columns=['url', 'type'])
y = df_shuffled['type'].map({'legitimate': 0, 'phishing': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [12]:

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Total: {len(y_test)} Test Accuracy: {accuracy:.4f}")


Total: 121 Test Accuracy: 0.9669


In [13]:
# Add expected/predicted labels
comparison_df = X_test.copy()
comparison_df['expected'] = y_test.values
comparison_df['predicted'] = y_pred

# Add URLs
comparison_df['url'] = features_df.loc[comparison_df.index, 'url'].values

# Convert back to string labels for readability
label_map = {0: 'legitimate', 1: 'phishing'}
comparison_df['expected'] = comparison_df['expected'].map(label_map)
comparison_df['predicted'] = comparison_df['predicted'].map(label_map)

# Show correct predictions
correct = comparison_df[comparison_df['expected'] == comparison_df['predicted']]
print("✅ Correct Predictions (Sample):")
print(correct[['url', 'predicted', 'expected']].head(5))

# Show incorrect predictions
incorrect = comparison_df[comparison_df['expected'] != comparison_df['predicted']]
print("\n❌ Incorrect Predictions (Sample):")
print(incorrect[['url', 'predicted', 'expected']].head(5))


✅ Correct Predictions (Sample):
                                                   url   predicted    expected
575  https://www.marketwire.com/press-release/on-se...  legitimate  legitimate
317                      https://www.twitter.com/andol  legitimate  legitimate
86   https://www.pipl.com/directory/people/Faith/Ch...  legitimate  legitimate
419                       https://www.butchers.net.au/  legitimate  legitimate
392  https://www.bankspower.com/news/show/28-banks-...    phishing    phishing

❌ Incorrect Predictions (Sample):
                                                   url   predicted    expected
371     https://www.mid-centralconf.org/roster/8/9.php    phishing  legitimate
332         https://www.meetyourmusician.blogspot.com/  legitimate    phishing
445  https://www.albertleatribune.com/2011/11/20/go...  legitimate    phishing
154  https://www.local.com/business/details/kansas-...  legitimate    phishing


In [14]:
import json

def tree_to_dict(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != -2 else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node):
        if tree_.feature[node] != -2:  # not a leaf
            return {
                "feature": feature_name[node],
                "threshold": tree_.threshold[node],
                "left": recurse(tree_.children_left[node]),
                "right": recurse(tree_.children_right[node]),
            }
        else:
            return {"value": tree_.value[node].tolist()}

    return recurse(0)

# Export all trees
forest_json = [
    tree_to_dict(estimator, X_train.columns)
    for estimator in clf.estimators_
]

# Save to JSON file
with open("random_forest_model.json", "w") as f:
    json.dump(forest_json, f)