In [1]:
# Cell 1

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


In [2]:
# Cell 2

df = pd.read_csv("../data/raw/PhiUSIIL_Phishing_URL_Dataset.csv")


In [3]:
# Cell 3

drop_cols = ["URL", "Domain", "Title", "FILENAME", "TLD"]

df = df.drop(columns=[col for col in drop_cols if col in df.columns])

df.head()


Unnamed: 0,URLLength,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,31,24,0,100.0,1.0,0.522907,0.061933,3,1,0,...,0,0,1,34,20,28,119,0,124,1
1,23,16,0,100.0,0.666667,0.03265,0.050207,2,1,0,...,0,0,1,50,9,8,39,0,217,1
2,29,22,0,100.0,0.866667,0.028555,0.064129,2,2,0,...,0,0,1,10,2,7,42,2,5,1
3,26,19,0,100.0,1.0,0.522907,0.057606,3,1,0,...,1,1,1,3,27,15,22,1,31,1
4,33,26,0,100.0,1.0,0.079963,0.059441,3,1,0,...,1,0,1,244,15,34,72,1,85,1


In [4]:
# Cell 4

X = df.drop("label", axis=1)
y = df["label"]

print("Feature shape:", X.shape)


Feature shape: (235795, 50)


In [5]:
# Cell 5

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [7]:
# Cell 6

# Logistic (scaled)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=20,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_train, y_train)

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=7,
    eval_metric="logloss",
    random_state=42
)
xgb_model.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [9]:
# Cell 7

print("=== Clean Accuracy (Full Feature Model) ===")

print("Logistic:", accuracy_score(y_test, log_model.predict(X_test_scaled)))
print("Random Forest:", accuracy_score(y_test, rf_model.predict(X_test)))
print("XGBoost:", accuracy_score(y_test, xgb_model.predict(X_test)))


=== Clean Accuracy (Full Feature Model) ===
Logistic: 0.9998727708390763
Random Forest: 1.0
XGBoost: 1.0


In [10]:
print(X.columns.tolist())


['URLLength', 'DomainLength', 'IsDomainIP', 'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']


In [22]:
X_test_noise = X_test.copy()

noise_mask = np.random.rand(*X_test.shape) < 0.1
X_test_noise = X_test_noise.mask(noise_mask, 0)
print("LOG Model under noise:",
      accuracy_score(y_test, log_model.predict(X_test_noise)))
print("Random Forest under noise:",
      accuracy_score(y_test, rf_model.predict(X_test_noise)))
print("XG Model under noise:",
      accuracy_score(y_test, xgb_model.predict(X_test_noise)))



LOG Model under noise: 0.42990733476112725
Random Forest under noise: 0.925995038062724
XG Model under noise: 0.9194003265548464


In [12]:
import pandas as pd

importances = rf_model.feature_importances_

importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": importances
}).sort_values(by="importance", ascending=False)

importance_df.head(15)


Unnamed: 0,feature,importance
3,URLSimilarityIndex,0.20057
49,NoOfExternalRef,0.149453
22,LineOfCode,0.120953
47,NoOfSelfRef,0.102237
44,NoOfImage,0.096037
46,NoOfJS,0.070779
45,NoOfCSS,0.040249
36,HasSocialNet,0.035276
32,HasDescription,0.02567
43,HasCopyrightInfo,0.02466


In [20]:
# Remove top 3 features

top_features = ["URLSimilarityIndex", "NoOfExternalRef", "LineOfCode"]

X_reduced = X.drop(columns=top_features)

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reduced, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

rf_reduced = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=3,
    random_state=42
)

rf_reduced.fit(X_train_r, y_train_r)

print("LOG Model Clean Accuracy:",
      accuracy_score(y_test, log_model.predict(X_test_attack)))
print("Reduced Model Clean Accuracy:",
      accuracy_score(y_test_r, rf_reduced.predict(X_test_r)))
print("XG Model Clean Accuracy:",
      accuracy_score(y_test, xgb_model.predict(X_test_attack)))



LOG Model Clean Accuracy: 0.5452405691384465
Reduced Model Clean Accuracy: 0.9997667465383065
XG Model Clean Accuracy: 0.9999575902796921


In [21]:
   # Simulate attacker improving page quality

X_test_attack = X_test.copy()

attack_features = [
    "NoOfImage",
    "NoOfJS",
    "NoOfCSS",
    "HasSocialNet",
    "HasDescription",
    "HasCopyrightInfo"
]

for feature in attack_features:
    if feature in X_test_attack.columns:
        X_test_attack[feature] = X_test_attack[feature].max()
print("LOG Accuracy under structural mimic attack:",
      accuracy_score(y_test, log_model.predict(X_test_attack)))
print("RF Accuracy under structural mimic attack:",
      accuracy_score(y_test, rf_model.predict(X_test_attack)))
print("XG Accuracy under structural mimic attack:",
      accuracy_score(y_test, xgb_model.predict(X_test_attack)))



LOG Accuracy under structural mimic attack: 0.5452405691384465
RF Accuracy under structural mimic attack: 0.9996607222375369
XG Accuracy under structural mimic attack: 0.9999575902796921


In [23]:
# Remove top 10 features

top10 = importance_df.head(10)["feature"].tolist()

X_reduced10 = X.drop(columns=top10)

X_train_r10, X_test_r10, y_train_r10, y_test_r10 = train_test_split(
    X_reduced10, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

rf_reduced10 = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=3,
    random_state=42
)

rf_reduced10.fit(X_train_r10, y_train_r10)

print("Reduced Top-10 Model Accuracy:",
      accuracy_score(y_test_r10, rf_reduced10.predict(X_test_r10)))


Reduced Top-10 Model Accuracy: 0.998791322971225
