In [None]:
import joblib

# Load the model back
loaded_model = joblib.load("/content/url_phishing_lgbm_model.pkl")

print("Model loaded successfully")

Model loaded successfully


In [None]:
import numpy as np

# Example: one URL represented by extracted features
# [URLLength, NoOfDots, NoOfHyphens, NoOfDigits, IsHTTPS]
sample_features = np.array([[28, 1, 0, 4, 1]])

# Predict probability
prob = loaded_model.predict_proba(sample_features)[0][1]

# Apply threshold
prediction = 1 if prob >= FINAL_THRESHOLD else 0

print("Phishing probability:", prob)
print("Prediction (1 = phishing, 0 = benign):", prediction)


Phishing probability: 0.9999780985480546
Prediction (1 = phishing, 0 = benign): 1




In [None]:
# ============================================================
# 1. Imports
# ============================================================
import pandas as pd
import numpy as np
import joblib
import re

from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

# ============================================================
# 2. Load saved model
# ============================================================
model = joblib.load("url_phishing_lgbm_model.pkl")
FINAL_THRESHOLD = 0.25

print("Model loaded successfully")
print("Using threshold:", FINAL_THRESHOLD)

# ============================================================
# 3. Load PhiUSIIL dataset
# ============================================================
phi_path = "/content/PhiUSIIL_Phishing_URL_Dataset.csv"  # update path if needed
df = pd.read_csv(phi_path, encoding="latin1")

print("Initial PhiUSIIL shape:", df.shape)

# ============================================================
# 4. Keep ONLY URL and label
# ============================================================
df = df[["URL", "label"]].dropna()

df["label"] = df["label"].astype(int)

print("After keeping URL + label:", df.shape)
print("Label distribution:")
print(df["label"].value_counts())

# ============================================================
# 5. URL feature extraction (MUST match training logic)
# ============================================================
def extract_url_features(url):
    url = str(url)
    return {
        "URLLength": len(url),
        "NoOfDots": url.count("."),
        "NoOfHyphens": url.count("-"),
        "NoOfDigits": sum(char.isdigit() for char in url),
        "IsHTTPS": 1 if url.lower().startswith("https") else 0
    }

features_df = df["URL"].apply(extract_url_features).apply(pd.Series)

# ============================================================
# 6. Prepare test matrix
# ============================================================
X_test_ext = features_df[
    ["URLLength", "NoOfDots", "NoOfHyphens", "NoOfDigits", "IsHTTPS"]
]

y_test_ext = df["label"]

print("External test feature shape:", X_test_ext.shape)

# ============================================================
# 7. Predict probabilities
# ============================================================
y_probs = model.predict_proba(X_test_ext)[:, 1]

# Apply threshold
y_pred = (y_probs >= FINAL_THRESHOLD).astype(int)

# ============================================================
# 8. Evaluation
# ============================================================
recall = recall_score(y_test_ext, y_pred)
precision = precision_score(y_test_ext, y_pred)
f1 = f1_score(y_test_ext, y_pred)
cm = confusion_matrix(y_test_ext, y_pred)

print("\n=== External Validation Results (PhiUSIIL, URL-only) ===")
print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)

print("\nConfusion Matrix:")
print(cm)


Model loaded successfully
Using threshold: 0.25
Initial PhiUSIIL shape: (235795, 56)
After keeping URL + label: (235795, 2)
Label distribution:
label
1    134850
0    100945
Name: count, dtype: int64
External test feature shape: (235795, 5)

=== External Validation Results (PhiUSIIL, URL-only) ===
Recall: 1.0
Precision: 0.6015550767501305
F1-score: 0.7512137240647431

Confusion Matrix:
[[ 11626  89319]
 [     0 134850]]
