In [None]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import lightgbm as lgb
import joblib
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns
import json
import sys
from scipy.sparse import csr_matrix # Added for explicit sparse matrix handling

# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn after this point
plt.style.use('ggplot')

In [None]:
# =======================
# 1️⃣ Load Datasets
# =======================
# Only malicious_phish.csv is loaded; the code below reads its 'url' and 'type' columns.
try:
    phish_urls = pd.read_csv("./Datasets/malicious_phish.csv")
except FileNotFoundError as err:
    # Re-raise instead of calling sys.exit(): inside a notebook kernel a normal
    # exception gives a readable traceback and stops "Run All" at this cell.
    raise FileNotFoundError(
        "❌ Error: 'malicious_phish.csv' not found. Please ensure it's in the 'Datasets' directory."
    ) from err

print("✅ Dataset Loaded Successfully!")
print(f"malicious_phish.csv → {phish_urls.shape}")

# Binary (Phishing/Benign) task: keep only those two labels and drop every other
# type present in the file (e.g. 'defacement', 'malware').
# A freshly created categorical has no unused categories, so no pruning call is
# needed before the filter.
# NOTE(review): the row filter does not shrink the categorical dtype, so the
# dropped labels remain as empty categories. Prune them only if downstream
# label encoding does not go through Series.apply on this column.
phish_urls['type'] = phish_urls['type'].astype('category')
phish_urls = phish_urls[phish_urls['type'].isin(['benign', 'phishing'])].copy()

# =======================
# 2️⃣ Feature Extraction Function
# =======================
def extract_features(url):
    """Extract a fixed, ordered set of handcrafted numeric features from a URL.

    The input is converted to ``str`` and stripped. An empty value is replaced
    by the placeholder ``"http://invalid-url"``. If the text does not *start*
    with a ``<scheme>://`` prefix, ``"http://"`` is prepended so that
    ``urlparse`` can identify the host. All counts are computed on this
    normalised string.

    Args:
        url: The URL to analyse (any object; it is passed through ``str()``).

    Returns:
        dict: Feature name -> int, always with the same keys in the same
        order: url_length, num_dots, num_hyphens, num_digits, num_params,
        has_https, has_ip, num_subdirs, has_at_symbol, subdomain_count,
        contains_login, contains_verify, contains_secure.
    """
    url = str(url).strip()
    if not url:
        url = "http://invalid-url"
    # Only a *leading* "<scheme>://" counts as a scheme. Checking for "://"
    # anywhere would skip normalisation for scheme-less URLs that embed another
    # URL (e.g. "a.b.com/r?u=http://x"), leaving urlparse with an empty netloc.
    if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url):
        url = "http://" + url
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced IPv6 brackets);
        # fall back to the placeholder so host-based features are still defined.
        parsed = urlparse("http://invalid-url")

    # Lower-case once; used for every case-insensitive check below.
    url_lower = url.lower()

    return {
        'url_length': len(url),
        'num_dots': url.count('.'),
        'num_hyphens': url.count('-'),
        'num_digits': sum(ch.isdigit() for ch in url),
        'num_params': url.count('=') + url.count('?') + url.count('&'),
        # Schemes are case-insensitive, so "HTTPS://..." must count as https.
        'has_https': 1 if url_lower.startswith('https://') else 0,
        # Host written as a dotted-quad IPv4 literal right after an optional http(s) scheme.
        'has_ip': 1 if re.match(r'^(?:http[s]?://)?\d{1,3}(?:\.\d{1,3}){3}', url, re.IGNORECASE) else 0,
        # Counts every '/', including the two that belong to "://".
        'num_subdirs': url.count('/'),
        'has_at_symbol': 1 if '@' in url else 0,
        # Dots in the network location beyond the one separating domain and TLD.
        'subdomain_count': max(0, parsed.netloc.count('.') - 1),
        'contains_login': 1 if 'login' in url_lower else 0,
        'contains_verify': 1 if 'verify' in url_lower else 0,
        'contains_secure': 1 if 'secure' in url_lower else 0,
    }

# Apply feature extraction
phish_urls['url'] = phish_urls['url'].astype(str)
# Build the frame in one shot from the list of per-URL dicts. The previous
# `.apply(pd.Series)` created one Series object per row, which is extremely slow
# on a dataset of this size. The original row index is kept for alignment.
feature_data = pd.DataFrame(
    phish_urls['url'].map(extract_features).tolist(),
    index=phish_urls.index,
)

# Encode types: 0 → benign, 1 → phishing
# Compare on the string form so the result is a plain int64 NumPy array
# regardless of whether 'type' is stored as object or categorical.
y_final = (phish_urls['type'].astype(str) != 'benign').to_numpy().astype(np.int64)
X_numeric = feature_data

# Get the EXACT list of feature names (no need for legacy alignment)
numeric_feature_names = list(X_numeric.columns)

# Save feature names (and their order) so prediction code can rebuild the same columns
with open("numeric_feature_names.json", "w") as f:
    json.dump(numeric_feature_names, f)

In [None]:
# =======================
# 3️⃣ Scaling + TF-IDF
# =======================

# Decide the train/test membership FIRST (by row position) so that the TF-IDF
# vocabulary/IDF weights and the scaler statistics are learned from training
# rows only. Fitting them on the full dataset leaks test-set information into
# the features and inflates the evaluation metrics.
y_all = np.asarray(y_final)
row_positions = np.arange(len(y_all))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_all
)

urls_all = phish_urls['url']
numeric_filled = X_numeric.fillna(0)

# TF-IDF on URLs (character 3- to 5-grams), fitted on the training URLs only
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=3000)
vectorizer.fit(urls_all.iloc[train_idx])
tfidf_features = vectorizer.transform(urls_all)

# Scaling on numeric features, statistics taken from the training rows only
scaler = StandardScaler()
scaler.fit(numeric_filled.iloc[train_idx])
X_scaled = scaler.transform(numeric_filled)

# Final feature matrix (TF-IDF + Scaled Handcrafted Features)
X_final = hstack([tfidf_features, X_scaled]).tocsr()  # CSR supports the row slicing below

print("\nDataset ready for training.")
print(f"Final Feature Matrix Shape: {X_final.shape}")

# =======================
# 4️⃣ Train-Test Split + Model Training
# =======================
# Materialise the split chosen above (same seed, test size and stratification as before).
X_train, X_test = X_final[train_idx], X_final[test_idx]
y_train, y_test = y_all[train_idx], y_all[test_idx]

# Class-imbalance factor, computed on the training labels:
# ratio = Count(Benign) / Count(Phishing)
ratio = np.count_nonzero(y_train == 0) / np.count_nonzero(y_train == 1)

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'n_estimators': 400,
    'random_state': 42,
    # scale_pos_weight multiplies the weight of the POSITIVE (phishing) class.
    # A value above the plain imbalance ratio makes missed phishing URLs more
    # costly, i.e. it favours recall.
    # NOTE(review): the previous comment said this "penalizes False Positives",
    # which is the opposite effect. The value is unchanged; confirm the intent.
    'scale_pos_weight': ratio * 1.5
}

print("\n🚀 Training LightGBM model...")
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_train, y_train)

In [None]:
# =======================
# 5️⃣ Evaluation
# =======================
# Positive-class probabilities feed ROC AUC; hard labels feed every other metric.
y_prob = lgb_model.predict_proba(X_test)[:, 1]
y_pred = lgb_model.predict(X_test)

print("\n📊 Model Evaluation:")
# Labels are pre-padded so the printed colons line up.
label_based_metrics = (
    ("Accuracy  ", accuracy_score),
    ("Precision ", precision_score),
    ("Recall    ", recall_score),
    ("F1 Score  ", f1_score),
)
for metric_label, metric_fn in label_based_metrics:
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.4f}")
print(f"ROC AUC   : {roc_auc_score(y_test, y_prob):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Plot the Confusion Matrix (rows = true label, columns = predicted label)
class_names = ['Legitimate', 'Phishing']
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

# =======================
# 6️⃣ Save Model + Vectorizer + Scaler
# =======================
# Persist the three fitted objects that are required to score a new URL.
artifacts = (
    (lgb_model, "phishing_lightgbm_model_simple.joblib"),
    (vectorizer, "tfidf_vectorizer_simple.joblib"),
    (scaler, "scaler_simple.joblib"),
)
for fitted_obj, target_path in artifacts:
    joblib.dump(fitted_obj, target_path)
print("\n✅ Model, Vectorizer, and Scaler saved successfully!")

In [None]:
# =======================
# 7️⃣ Prediction Function
# =======================
# Reload the handcrafted-feature column names from the JSON file so prediction
# uses exactly the stored names and order.
with open("numeric_feature_names.json", "r") as names_file:
    numeric_feature_names = json.loads(names_file.read())

# Minimum phishing probability required before a URL is labelled "Phishing";
# set high on purpose to keep False Positives low.
PHISHING_THRESHOLD = 0.95

def predict_url(url, threshold=None):
    """Score a single URL with the trained model, print the verdict and return it.

    Relies on the notebook-level objects ``extract_features``, ``scaler``,
    ``vectorizer``, ``lgb_model``, ``numeric_feature_names`` and
    ``PHISHING_THRESHOLD``.

    Args:
        url: The URL string to classify.
        threshold: Optional probability cut-off at or above which the URL is
            labelled phishing. Defaults to ``PHISHING_THRESHOLD`` when omitted.

    Returns:
        tuple: ``(pred, prob)`` where ``pred`` is 1 (phishing) or 0
        (legitimate) and ``prob`` is the model's phishing probability as a float.
    """
    cutoff = PHISHING_THRESHOLD if threshold is None else threshold

    # 1. Extract handcrafted features and align them to the training columns:
    #    reindex adds any missing column as 0, drops extras and fixes the order,
    #    which the scaler requires.
    feats_df = pd.DataFrame([extract_features(url)]).reindex(
        columns=numeric_feature_names, fill_value=0
    )

    # 2. Scale the numeric part and vectorize the raw URL text
    feats_scaled = scaler.transform(feats_df.fillna(0))
    tfidf_vec = vectorizer.transform([url]).tocsr()

    # 3. Combine in the same order used for training (TF-IDF first, then numeric)
    x = hstack([tfidf_vec, feats_scaled]).tocsr()

    # 4. Probability of the positive (phishing) class, then the custom threshold
    prob = float(lgb_model.predict_proba(x)[0, 1])
    pred = 1 if prob >= cutoff else 0

    print(f"\n🔗 URL: {url}")
    print(f"Prediction: {'Phishing' if pred == 1 else 'Legitimate'} (Probability: {prob:.4f}, Threshold: {cutoff})")
    print(f"Confidence: {prob:.2%}")

    return pred, prob

# =======================
# 8️⃣ User Input Loop
# =======================
# Interactive prompt: keeps asking for URLs until the user types 'exit'.
# NOTE: this cell blocks on input(), so it needs an interactive session.
print("\n--- Starting Interactive URL Checker ---")
while True:
    try:
        user_url = input("\n\n---\nEnter a URL to check (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel/terminal: leave the loop cleanly
        # instead of surfacing a traceback.
        print("\n👋 Exiting URL checker.")
        break
    if user_url.lower() == 'exit':
        print("👋 Exiting URL checker.")
        break
    if not user_url:
        print("⚠️ Please enter a valid URL.")
        continue
    try:
        predict_url(user_url)
    except Exception as e:
        # Best effort by design: report the failure and keep the prompt alive.
        print(f"❌ An error occurred during prediction: {e}")