
# Enhanced Dynamic DOM Analysis for Phishing Detection

This notebook extracts dynamic DOM-based features from a dataset of URLs and trains a machine learning model to detect phishing websites.


In [3]:
import pandas as pd

features_df = pd.read_csv('./dom_features.csv')
features_df.head()


Unnamed: 0,url,type,forms,inputs,iframes,scripts,images,buttons,domDepth,maxChildren,titleLength,onmouseoverEvents,externalResourceRatio,inlineStyles,phishingKeywordHits,usesHTTPS,hasEval
0,https://www.imdb.com/name/nm0315192/bio,legitimate,0,0,0,0,0,0,5,2,13,0,0.0,0,0,True,0
1,https://www.evri.com/organization/23rd-infantr...,legitimate,0,8,2,35,3,26,22,48,46,0,0.133333,43,4,True,0
2,https://www.music.yahoo.com/eric-lapointe/,legitimate,1,5,0,5,6,3,17,8,38,0,0.769231,0,0,True,0
3,https://www.youtube.com/watch?v=joUMCgZl7sw,legitimate,1,2,2,56,5,49,27,309,7,0,0.195122,166,1,True,0
4,https://www.ecapcity.com/new-era-fitted-hats/m...,legitimate,4,19,2,55,7,7,20,108,32,0,0.378947,15,1,True,0


In [4]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_shuffled = features_df.sample(frac=1, random_state=42).reset_index(drop=True)
X = df_shuffled.drop(columns=['url', 'type'])
y = df_shuffled['type'].map({'legitimate': 0, 'phishing': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [5]:

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Total: {len(y_test)} Test Accuracy: {accuracy:.4f}")


Total: 495 Test Accuracy: 0.9838


In [6]:
# Add expected/predicted labels
comparison_df = X_test.copy()
comparison_df['expected'] = y_test.values
comparison_df['predicted'] = y_pred

# Add URLs
comparison_df['url'] = features_df.loc[comparison_df.index, 'url'].values

# Convert back to string labels for readability
label_map = {0: 'legitimate', 1: 'phishing'}
comparison_df['expected'] = comparison_df['expected'].map(label_map)
comparison_df['predicted'] = comparison_df['predicted'].map(label_map)

# Show correct predictions
correct = comparison_df[comparison_df['expected'] == comparison_df['predicted']]
print("✅ Correct Predictions (Sample):")
print(correct[['url', 'predicted', 'expected']].head(5))

# Show incorrect predictions
incorrect = comparison_df[comparison_df['expected'] != comparison_df['predicted']]
print("\n❌ Incorrect Predictions (Sample):")
print(incorrect[['url', 'predicted', 'expected']].head(5))


✅ Correct Predictions (Sample):
                                                     url   predicted  \
36470  https://www.claudemoorejeweler.com/info_index....    phishing   
14628                 https://www.pioneertechnology.com/  legitimate   
28558        https://www.flixster.com/actor/john-hancock  legitimate   
10528               https://www.zizaru.com.hypestat.com/  legitimate   
17513  https://www.linkedin.com/pub/jim-gleeson/11/49...  legitimate   

         expected  
36470    phishing  
14628  legitimate  
28558  legitimate  
10528  legitimate  
17513  legitimate  

❌ Incorrect Predictions (Sample):
                                                     url   predicted  expected
16024        https://www.youtube.com/watch?v=t9bU-0-Kvio  legitimate  phishing
4041   https://www.familysearch.org/Eng/Search/ancest...  legitimate  phishing
30220  https://www.washingtonlife.com/directories/pho...  legitimate  phishing
47663                 http://coalimpex.com/web/re_02.php  legiti

In [8]:
import json

def tree_to_dict(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != -2 else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node):
        if tree_.feature[node] != -2:  # not a leaf
            return {
                "feature": feature_name[node],
                "threshold": tree_.threshold[node],
                "left": recurse(tree_.children_left[node]),
                "right": recurse(tree_.children_right[node]),
            }
        else:
            return {"value": tree_.value[node].tolist()}

    return recurse(0)

# Export all trees
forest_json = [
    tree_to_dict(estimator, X_train.columns)
    for estimator in clf.estimators_
]

# Save to JSON file
with open("random_forest_model.json", "w") as f:
    json.dump(forest_json, f)

In [7]:
openphish_df = pd.read_csv('./openphish_dom_features.csv')

openphish_df = openphish_df.reset_index(drop=True)
X = openphish_df.drop(columns=['url', 'type'])
y = openphish_df['type'].map({'legitimate': 0, 'phishing': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=42)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Total: {len(y_test)} Test Accuracy: {accuracy:.4f}")

Total: 3 Test Accuracy: 0.6667
