In [1]:
# Cell 1

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load PhiUSIIL (sample for speed)
df_phi = pd.read_csv("../data/raw/PhiUSIIL_Phishing_URL_Dataset.csv")
df_phi = df_phi.sample(60000, random_state=42)

# Load UCI
df_uci = pd.read_csv("../data/raw/uci-ml-phishing-dataset.csv")


In [2]:
# Cell 2

phi_features = [
    "URLLength",
    "DomainLength",
    "NoOfSubDomain",
    "NoOfDegitsInURL",
    "NoOfOtherSpecialCharsInURL",
    "IsHTTPS",
    "IsDomainIP"
]

X_phi = df_phi[phi_features]
y_phi = df_phi["label"]


In [3]:
print(df_uci.columns.tolist())


['id', 'having_IP_Address', 'URL_Length', 'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page', 'Statistical_report', 'Result']


In [4]:
# Cell 3

phi_subset = df_phi[[
    "URLLength",
    "IsDomainIP",
    "NoOfSubDomain",
    "IsHTTPS"
]].copy()

y_phi = df_phi["label"]

phi_subset.head()


Unnamed: 0,URLLength,IsDomainIP,NoOfSubDomain,IsHTTPS
136221,24,0,2,1
56609,59,0,0,1
46393,385,0,3,1
129746,19,0,0,0
131464,35,0,2,1


In [7]:
# Cell 4

uci_subset = df_uci[[
    "URL_Length",
    "having_IP_Address",
    "having_Sub_Domain",
    "SSLfinal_State"
]].copy()

y_uci = df_uci["Result"]

# Convert -1/1 to 0/1 for label
y_uci = y_uci.replace(-1, 0)

uci_subset.head()


Unnamed: 0,URL_Length,having_IP_Address,having_Sub_Domain,SSLfinal_State
0,1,-1,-1,-1
1,1,1,0,1
2,0,1,-1,-1
3,0,1,-1,-1
4,0,1,1,1


In [9]:
# Cell 5 (Fixed)

# Handle missing values first
phi_subset["URLLength"] = phi_subset["URLLength"].fillna(0)

# Use wide bins to cover all values
phi_subset["URL_Length"] = pd.cut(
    phi_subset["URLLength"],
    bins=[-1, 54, 75, np.inf],
    labels=[-1, 0, 1]
)

# Convert to integer safely
phi_subset["URL_Length"] = phi_subset["URL_Length"].astype(int)

# Convert NoOfSubDomain to binary
phi_subset["having_Sub_Domain"] = (phi_subset["NoOfSubDomain"] > 1).astype(int)

# Map IP & HTTPS directly
phi_subset["having_IP_Address"] = phi_subset["IsDomainIP"].astype(int)
phi_subset["SSLfinal_State"] = phi_subset["IsHTTPS"].astype(int)

# Final aligned set
phi_aligned = phi_subset[[
    "URL_Length",
    "having_IP_Address",
    "having_Sub_Domain",
    "SSLfinal_State"
]]

phi_aligned.head()


Unnamed: 0,URL_Length,having_IP_Address,having_Sub_Domain,SSLfinal_State
136221,-1,0,1,1
56609,0,0,0,1
46393,1,0,1,1
129746,-1,0,0,0
131464,-1,0,1,1


In [10]:
# Cell 6

from sklearn.ensemble import RandomForestClassifier

rf_cross = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf_cross.fit(phi_aligned, y_phi)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Cell 7

print("Train on Phi → Test on UCI")

print("Accuracy:",
      accuracy_score(y_uci, rf_cross.predict(uci_subset)))


Train on Phi → Test on UCI
Accuracy: 0.6546359113523292


In [12]:
# Train on UCI → Test on Phi

rf_reverse = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf_reverse.fit(uci_subset, y_uci)

print("Train on UCI → Test on Phi")
print("Accuracy:",
      accuracy_score(y_phi, rf_reverse.predict(phi_aligned)))


Train on UCI → Test on Phi
Accuracy: 0.8047833333333333


In [13]:
from sklearn.metrics import classification_report

print("Phi → UCI Report")
print(classification_report(y_uci, rf_cross.predict(uci_subset)))

print("UCI → Phi Report")
print(classification_report(y_phi, rf_reverse.predict(phi_aligned)))


Phi → UCI Report
              precision    recall  f1-score   support

           0       0.57      0.90      0.70      4898
           1       0.85      0.46      0.60      6157

    accuracy                           0.65     11055
   macro avg       0.71      0.68      0.65     11055
weighted avg       0.73      0.65      0.64     11055

UCI → Phi Report
              precision    recall  f1-score   support

           0       1.00      0.54      0.70     25600
           1       0.75      1.00      0.85     34400

    accuracy                           0.80     60000
   macro avg       0.87      0.77      0.78     60000
weighted avg       0.85      0.80      0.79     60000

