In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
# =========================================================
# RANDOM FOREST CLASSIFIER (COMPATIBLE WITH 45-FEATURE DATASET)
# =========================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# =========================================================
# PATH CONFIGURATION
# =========================================================
DATA_PATH = "/content/drive/MyDrive/Webshield Dataset/TrancoCombinedDTs/More Branded Phishing Synthetic Too /More Branded Phishing Tranco Original.csv"
RESULTS_DIR = "/content/drive/MyDrive/Webshield Dataset/RF Results 640k+50k+35k More Branded Phishing Ones"

os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"✅ Results will be saved in: {RESULTS_DIR}")

# =========================================================
# LOAD DATA
# =========================================================
data = pd.read_csv(DATA_PATH)
print(f"\nDataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print(f"Class distribution:\n{data['type'].value_counts()}")

# =========================================================
# HANDLE MISSING VALUES
# =========================================================
num_features = data.select_dtypes(include=[np.number]).columns.tolist()
for col in ['type', 'url']:
    if col in num_features:
        num_features.remove(col)
data[num_features] = data[num_features].fillna(data[num_features].median())

# =========================================================
# SPLIT FEATURES AND LABELS
# =========================================================
X = data.drop(['url', 'type'], axis=1)
y = data['type']

feature_names = list(X.columns)
print("\n✓ Features prepared (no scaling needed for tree-based models)")

# =========================================================
# DATA SPLIT
# =========================================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nData split - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# =========================================================
# HANDLE CLASS IMBALANCE
# =========================================================
print("\nComputing class weights (Random Forest)...")
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
print(f"Class weights: {class_weights_dict}")

# =========================================================
# TRAIN RANDOM FOREST MODEL
# =========================================================
print("\n" + "="*60)
print("🌲 Training Random Forest Model")
print("="*60)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
print("✓ Model training completed")

# =========================================================
# VALIDATION PERFORMANCE
# =========================================================
y_val_pred = rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# =========================================================
# TEST PERFORMANCE
# =========================================================
print("\n" + "="*60)
print("📊 MODEL PERFORMANCE (TEST SET)")
print("="*60)

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)
np.save(os.path.join(RESULTS_DIR, "rf_probabilities.npy"), y_proba)

test_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Save classification report
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "rf_classification_report.csv"))
print("\nDetailed per-class report saved.")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm).to_csv(os.path.join(RESULTS_DIR, "rf_confusion_matrix.csv"))
print("Confusion matrix saved.")

# =========================================================
# FEATURE IMPORTANCE
# =========================================================
print("\n" + "="*60)
print("📈 TOP FEATURES CONTRIBUTING TO DECISIONS")
print("="*60)

feature_importances = pd.Series(
    rf.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print(feature_importances.head(15))
feature_importances.to_csv(os.path.join(RESULTS_DIR, "rf_feature_importance.csv"))
print("Feature importances saved.")

# =========================================================
# ERROR ANALYSIS
# =========================================================
misclassified_idx = np.where(y_pred != y_test)[0]
error_pct = len(misclassified_idx) / len(y_test) * 100
print(f"\nMisclassified samples: {len(misclassified_idx)} ({error_pct:.2f}%)")

# =========================================================
# SAVE MODEL
# =========================================================
print("\n" + "="*60)
print("💾 SAVING MODEL ARTIFACTS")
print("="*60)

MODEL_VERSION = "1.3.0"
MODEL_PATH = os.path.join(RESULTS_DIR, f"rf_url_classifier_v{MODEL_VERSION}.pkl")

metadata = {
    'version': MODEL_VERSION,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'test_accuracy': float(test_accuracy),
    'precision_weighted': float(precision),
    'recall_weighted': float(recall),
    'f1_weighted': float(f1),
    'feature_count': X.shape[1],
    'training_samples': len(X_train),
    'class_distribution': dict(pd.Series(y_train).value_counts()),
    'hyperparameters': rf.get_params(),
    'notes': 'Tree-based model (no scaling required)'
}

model_artifact = {
    'model': rf,
    'metadata': metadata,
    'feature_names': feature_names
}

joblib.dump(model_artifact, MODEL_PATH)
print(f"✓ Model saved to: {MODEL_PATH}")

# =========================================================
# FINAL SUMMARY
# =========================================================
summary_path = os.path.join(RESULTS_DIR, "rf_summary.txt")
with open(summary_path, "w") as f:
    f.write(f"Random Forest Model Summary (v{MODEL_VERSION})\n")
    f.write("="*60 + "\n")
    f.write(f"Accuracy: {test_accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write(f"Validation Accuracy: {val_accuracy:.4f}\n")
    f.write(f"Total features: {len(feature_names)}\n")
    f.write(f"Saved model path: {MODEL_PATH}\n")

print(f"\n✅ All results and artifacts saved in: {RESULTS_DIR}")
print("Training completed successfully.")


✅ Results will be saved in: /content/drive/MyDrive/Webshield Dataset/RF Results 640k+50k+35k More Branded Phishing Ones

Dataset shape: (716118, 68)
Columns: ['action_word', 'brand_impersonation', 'brand_not_in_domain', 'brand_not_in_main_domain', 'brand_with_hyphen', 'consecutive_consonants', 'digit_letter_ratio', 'domain_bigram_diversity', 'domain_digit_ratio', 'domain_entropy', 'domain_has_digits', 'domain_is_dictionary_word', 'domain_length', 'domain_trigram_diversity', 'domain_url_ratio', 'domain_vowel_ratio', 'has_brand_name', 'has_character_substitution', 'has_ip', 'has_multiple_subdomains', 'has_port', 'host_entropy', 'is_country_tld', 'is_high_trust_tld', 'is_mixed_case', 'is_shortening_service', 'is_suspicious_tld', 'is_typosquatting', 'longest_token_length', 'multiple_brands_in_domain', 'num_digits', 'num_dots', 'num_encoded_chars', 'num_fragments', 'num_hyphens', 'num_letters', 'num_path_segments', 'num_query_params', 'num_repeated_chars', 'num_special_chars', 'num_subdomai

In [None]:
# LIGHT GBM WITH SCALING

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib
import warnings
warnings.filterwarnings('ignore')

# =========================================================
# LOAD DATA
# =========================================================
data = pd.read_csv('/content/drive/MyDrive/Webshield Dataset/urls_features_combined.csv')

print(f"Dataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print(f"Class distribution:\n{data['type'].value_counts()}")

# =========================================================
# HANDLE MISSING VALUES
# =========================================================
num_features = data.select_dtypes(include=[np.number]).columns.tolist()
for col in ['type', 'url']:
    if col in num_features:
        num_features.remove(col)
data[num_features] = data[num_features].fillna(data[num_features].median())

# =========================================================
# SPLIT FEATURES AND LABEL
# =========================================================
X = data.drop(['url', 'type'], axis=1)
y = data['type']

# SCALE FEATURES
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, 'scaler.pkl')
print("✓ Scaler saved successfully")

# =========================================================
# DATA SPLIT
# =========================================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nData split - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# =========================================================
# HANDLE IMBALANCE (USE CLASS WEIGHTS)
# =========================================================
print("\nHandling imbalance using class weights (LightGBM).")
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
print(f"\nClass weights: {class_weights_dict}")

# =========================================================
# TRAINING
# =========================================================
print("\n" + "="*50)
print("Training LightGBM Model...")
print("="*50)

lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_train, y_train,
         eval_set=[(X_val, y_val)],
         eval_metric='multi_logloss')


print("✓ Model training completed")

# =========================================================
# VALIDATION METRICS
# =========================================================
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# =========================================================
# TEST METRICS
# =========================================================
print("\n" + "="*50)
print("MODEL PERFORMANCE METRICS (TEST SET)")
print("="*50)

y_pred = lgbm.predict(X_test)
y_proba = lgbm.predict_proba(X_test)

# Save probabilities
np.save('lgbm_probabilities.npy', y_proba)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Precision, Recall, F1 (weighted)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nPrecision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"F1 Score (weighted):  {f1:.4f}")

# Classification report
print("\nDetailed per-class report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Interpret performance
print("\n--- INTERPRETATION GUIDE ---")
print("• Accuracy measures overall correctness.")
print("• Precision shows how many predicted malicious URLs were truly malicious.")
print("• Recall shows how many real malicious URLs were correctly found.")
print("• F1 balances precision and recall (good general metric).")
print("Ideal targets: Accuracy >0.93, F1 >0.90, per-class recall >0.85")

# =========================================================
# ERROR ANALYSIS
# =========================================================
print("\n" + "="*50)
print("ERROR ANALYSIS")
print("="*50)

misclassified_idx = np.where(y_pred != y_test)[0]
error_pct = len(misclassified_idx) / len(y_test) * 100
print(f"\nTotal misclassified samples: {len(misclassified_idx)} ({error_pct:.2f}%)")

# =========================================================
# FEATURE IMPORTANCE
# =========================================================
print("\n" + "="*50)
print("TOP FEATURES CONTRIBUTING TO DECISIONS")
print("="*50)

feature_importances = pd.Series(lgbm.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop 15 Most Important Features:")
print(feature_importances.head(15))

# =========================================================
# SAVE MODEL ARTIFACTS
# =========================================================
print("\n" + "="*50)
print("SAVING MODEL ARTIFACTS")
print("="*50)

MODEL_VERSION = "1.3.0"
metadata = {
    'version': MODEL_VERSION,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'test_accuracy': float(test_accuracy),
    'precision_weighted': float(precision),
    'recall_weighted': float(recall),
    'f1_weighted': float(f1),
    'feature_count': X.shape[1],
    'training_samples': len(X_train),
    'class_distribution': dict(pd.Series(y_train).value_counts()),
    'hyperparameters': lgbm.get_params()
}

model_artifact = {
    'model': lgbm,
    'scaler': scaler,
    'metadata': metadata,
    'feature_names': list(X.columns)
}

joblib.dump(model_artifact, f'lgbm_url_classifier_v{MODEL_VERSION}.pkl')
print(f"✓ Model saved as 'lgbm_url_classifier_v{MODEL_VERSION}.pkl'")

print("\n" + "="*50)
print("TRAINING COMPLETED SUCCESSFULLY")
print("="*50)

# FINAL QUICK SUMMARY
print("\nQuick summary:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
print("Check the classification report for class-wise scores to confirm consistency.")


Dataset shape: (651191, 31)
Columns: ['url', 'type', 'url_length', 'num_dots', 'num_hyphens', 'num_underscores', 'num_digits', 'num_letters', 'num_special_chars', 'has_ip', 'num_subdomains', 'domain_length', 'host_entropy', 'path_length', 'num_path_segments', 'num_query_params', 'query_length', 'num_encoded_chars', 'num_fragments', 'suspicious_word', 'sensitive_word', 'action_word', 'is_shortening_service', 'is_mixed_case', 'url_entropy', 'path_entropy', 'domain_entropy', 'num_repeated_chars', 'longest_token_length', 'suspicious_prefix_suffix', 'num_suspicious_symbols']
Class distribution:
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64
✓ Scaler saved successfully

Data split - Train: 520952, Val: 65119, Test: 65120

Handling imbalance using class weights (LightGBM).

Class weights: {'benign': np.float64(0.3802769196629312), 'defacement': np.float64(1.6877859133026631), 'malware': np.float64(5.006073185731857), 'phishin

In [None]:
# LIGHT GBM WITHOUT SCALING

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib
import warnings
warnings.filterwarnings('ignore')

# =========================================================
# LOAD DATA
# =========================================================
data = pd.read_csv('/content/drive/MyDrive/Webshield Dataset/urls_features_combined.csv')

print(f"Dataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print(f"Class distribution:\n{data['type'].value_counts()}")

# =========================================================
# HANDLE MISSING VALUES
# =========================================================
num_features = data.select_dtypes(include=[np.number]).columns.tolist()
for col in ['type', 'url']:
    if col in num_features:
        num_features.remove(col)
data[num_features] = data[num_features].fillna(data[num_features].median())

# =========================================================
# SPLIT FEATURES AND LABEL
# =========================================================
X = data.drop(['url', 'type'], axis=1)
y = data['type']

# NO SCALING NEEDED - Tree-based models don't require it
feature_names = list(X.columns)
print("✓ Features prepared (no scaling needed for tree models)")

# =========================================================
# DATA SPLIT
# =========================================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nData split - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# =========================================================
# HANDLE IMBALANCE (USE CLASS WEIGHTS)
# =========================================================
print("\nHandling imbalance using class weights (LightGBM).")
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
print(f"\nClass weights: {class_weights_dict}")

# =========================================================
# TRAINING
# =========================================================
print("\n" + "="*50)
print("Training LightGBM Model...")
print("="*50)

lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,  # Added: no depth limit (let num_leaves control complexity)
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1,
    verbose=-1  # Added: suppress training output
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss'
)

print("✓ Model training completed")

# =========================================================
# VALIDATION METRICS
# =========================================================
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# =========================================================
# TEST METRICS
# =========================================================
print("\n" + "="*50)
print("MODEL PERFORMANCE METRICS (TEST SET)")
print("="*50)

y_pred = lgbm.predict(X_test)
y_proba = lgbm.predict_proba(X_test)

# Save probabilities
np.save('lgbm_probabilities.npy', y_proba)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Precision, Recall, F1 (weighted)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nPrecision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"F1 Score (weighted):  {f1:.4f}")

# Classification report
print("\nDetailed per-class report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Interpret performance
print("\n--- INTERPRETATION GUIDE ---")
print("• Accuracy measures overall correctness.")
print("• Precision shows how many predicted malicious URLs were truly malicious.")
print("• Recall shows how many real malicious URLs were correctly found.")
print("• F1 balances precision and recall (good general metric).")
print("Ideal targets: Accuracy >0.93, F1 >0.90, per-class recall >0.85")

# =========================================================
# ERROR ANALYSIS
# =========================================================
print("\n" + "="*50)
print("ERROR ANALYSIS")
print("="*50)

misclassified_idx = np.where(y_pred != y_test)[0]
error_pct = len(misclassified_idx) / len(y_test) * 100
print(f"\nTotal misclassified samples: {len(misclassified_idx)} ({error_pct:.2f}%)")

# =========================================================
# FEATURE IMPORTANCE
# =========================================================
print("\n" + "="*50)
print("TOP FEATURES CONTRIBUTING TO DECISIONS")
print("="*50)

feature_importances = pd.Series(
    lgbm.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importances.head(15))

# Save feature importance
feature_importances.to_csv('lgbm_feature_importance.csv')

# =========================================================
# SAVE MODEL ARTIFACTS
# =========================================================
print("\n" + "="*50)
print("SAVING MODEL ARTIFACTS")
print("="*50)

MODEL_VERSION = "1.3.0"
metadata = {
    'version': MODEL_VERSION,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'test_accuracy': float(test_accuracy),
    'precision_weighted': float(precision),
    'recall_weighted': float(recall),
    'f1_weighted': float(f1),
    'feature_count': X.shape[1],
    'training_samples': len(X_train),
    'class_distribution': dict(pd.Series(y_train).value_counts()),
    'hyperparameters': lgbm.get_params(),
    'notes': 'No feature scaling applied (tree-based model)'
}

model_artifact = {
    'model': lgbm,
    'metadata': metadata,
    'feature_names': feature_names
}

joblib.dump(model_artifact, f'lgbm_url_classifier_v{MODEL_VERSION}.pkl')
print(f"✓ Model saved as 'lgbm_url_classifier_v{MODEL_VERSION}.pkl'")

print("\n" + "="*50)
print("TRAINING COMPLETED SUCCESSFULLY")
print("="*50)

# FINAL QUICK SUMMARY
print("\nQuick summary:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
print("Check the classification report for class-wise scores to confirm consistency.")

Dataset shape: (651191, 31)
Columns: ['url', 'type', 'url_length', 'num_dots', 'num_hyphens', 'num_underscores', 'num_digits', 'num_letters', 'num_special_chars', 'has_ip', 'num_subdomains', 'domain_length', 'host_entropy', 'path_length', 'num_path_segments', 'num_query_params', 'query_length', 'num_encoded_chars', 'num_fragments', 'suspicious_word', 'sensitive_word', 'action_word', 'is_shortening_service', 'is_mixed_case', 'url_entropy', 'path_entropy', 'domain_entropy', 'num_repeated_chars', 'longest_token_length', 'suspicious_prefix_suffix', 'num_suspicious_symbols']
Class distribution:
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64
✓ Features prepared (no scaling needed for tree models)

Data split - Train: 520952, Val: 65119, Test: 65120

Handling imbalance using class weights (LightGBM).

Class weights: {'benign': np.float64(0.3802769196629312), 'defacement': np.float64(1.6877859133026631), 'malware': np.float64(

In [None]:
import joblib
artifact = joblib.load('/content/lgbm_url_classifier_v1.3.0.pkl')
print(artifact['feature_names'])


['url_length', 'num_dots', 'num_hyphens', 'num_underscores', 'num_digits', 'num_letters', 'num_special_chars', 'has_ip', 'num_subdomains', 'domain_length', 'host_entropy', 'path_length', 'num_path_segments', 'num_query_params', 'query_length', 'num_encoded_chars', 'num_fragments', 'suspicious_word', 'sensitive_word', 'action_word', 'is_shortening_service', 'is_mixed_case', 'url_entropy', 'path_entropy', 'domain_entropy', 'num_repeated_chars', 'longest_token_length', 'suspicious_prefix_suffix', 'num_suspicious_symbols']


In [9]:
# PREDICT CUSTOM URL WITH CUSTOM MODEL

import re, math, joblib, pandas as pd
from urllib.parse import urlparse, unquote

# ---------- Helper ----------
def entropy(s):
    if not s: return 0
    p = [s.count(c)/len(s) for c in set(s)]
    return -sum(pi*math.log2(pi) for pi in p)

# ---------- Extract 29 numeric features ----------
def extract_features(url):
    if not re.match(r'^[a-zA-Z]+://', url):
        url = 'http://' + url
    p = urlparse(url)
    host, path, q = p.netloc.lower(), p.path, p.query
    full = unquote(url)

    return pd.DataFrame([{
        'url_length': len(full),
        'num_dots': full.count('.'),
        'num_hyphens': full.count('-'),
        'num_underscores': full.count('_'),
        'num_digits': sum(c.isdigit() for c in full),
        'num_letters': sum(c.isalpha() for c in full),
        'num_special_chars': len(re.findall(r'[^A-Za-z0-9]', full)),
        'has_ip': 1 if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', host) else 0,
        'num_subdomains': max(len(host.split('.')) - 2, 0),
        'domain_length': len(host),
        'host_entropy': entropy(host),
        'path_length': len(path),
        'num_path_segments': path.count('/'),
        'num_query_params': q.count('='),
        'query_length': len(q),
        'num_encoded_chars': full.count('%') + full.count('+'),
        'num_fragments': 1 if p.fragment else 0,
        'suspicious_word': int(any(w in full for w in ['login','secure','update','bank'])),
        'sensitive_word': int(any(w in full for w in ['password','credit','card'])),
        'action_word': int(any(w in full for w in ['click','submit','run'])),
        'is_shortening_service': int(any(s in host for s in ['bit.ly','t.co','goo.gl'])),
        'is_mixed_case': int(any(c.isupper() for c in full) and any(c.islower() for c in full)),
        'url_entropy': entropy(full),
        'path_entropy': entropy(path),
        'domain_entropy': entropy(host),
        'num_repeated_chars': max(len(list(g)) for _,g in re.findall(r'((\w)\2*)', full)) if full else 0,
        'longest_token_length': max((len(t) for t in re.split(r'[^A-Za-z0-9]', full) if t), default=0),
        'suspicious_prefix_suffix': int(host.startswith(('login','secure')) or host.endswith(('login','secure'))),
        'num_suspicious_symbols': sum(c in '!@#$%^&*()=+{}[]|\\:;"<>,?~`' for c in full)
    }])

# ---------- Predict ----------
artifact = joblib.load('/content/drive/MyDrive/Webshield Dataset/LIGHTGBM 640k+40k+35K more branded phishing /lgbm_url_classifier_v1.3.0.pkl')
model = artifact['model']
features = artifact['feature_names']

url = "www.github.com"
X = extract_features(url)[features]

pred = model.predict(X)[0]
proba = model.predict_proba(X)[0]

print("URL:", url)
print("Predicted:", pred)
print("Probabilities:", dict(zip(model.classes_, proba)))


KeyError: "['brand_impersonation', 'brand_not_in_domain', 'brand_not_in_main_domain', 'brand_with_hyphen', 'consecutive_consonants', 'digit_letter_ratio', 'domain_bigram_diversity', 'domain_digit_ratio', 'domain_has_digits', 'domain_is_dictionary_word', 'domain_trigram_diversity', 'domain_url_ratio', 'domain_vowel_ratio', 'has_brand_name', 'has_character_substitution', 'has_multiple_subdomains', 'has_port', 'is_country_tld', 'is_high_trust_tld', 'is_suspicious_tld', 'is_typosquatting', 'multiple_brands_in_domain', 'num_suspicious_words', 'path_has_suspicious_ext', 'path_url_ratio', 'punycode_domain', 'query_has_redirect', 'query_url_ratio', 'special_char_ratio', 'subdomain_count_dot', 'subdomain_length', 'suspicious_tld_brand_combo', 'tld_length', 'tld_trust_category', 'typosquatting_similarity', 'uppercase_ratio', 'uses_https'] not in index"

In [2]:
# CURRENT USED LIGHTGBM

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    f1_score, precision_score, recall_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# =========================================================
# PATH CONFIGURATION
# =========================================================
DATA_PATH = "/content/drive/MyDrive/Webshield Dataset/TrancoCombinedDTs/More Branded Phishing Synthetic Too /More Branded Phishing Tranco Original.csv"
RESULTS_DIR = "/content/drive/MyDrive/Webshield Dataset/LIGHTGBM 640k+40k+35K more branded phishing "

os.makedirs(RESULTS_DIR, exist_ok=True)
print(f"✅ Results will be saved in: {RESULTS_DIR}")

# =========================================================
# LOAD DATA
# =========================================================
data = pd.read_csv(DATA_PATH)

print(f"\nDataset shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
print(f"Class distribution:\n{data['type'].value_counts()}")

# =========================================================
# HANDLE MISSING VALUES
# =========================================================
num_features = data.select_dtypes(include=[np.number]).columns.tolist()
for col in ['type', 'url']:
    if col in num_features:
        num_features.remove(col)
data[num_features] = data[num_features].fillna(data[num_features].median())

# =========================================================
# SPLIT FEATURES AND LABEL
# =========================================================
X = data.drop(['url', 'type'], axis=1)
y = data['type']

feature_names = list(X.columns)
print("\n✓ Features prepared (no scaling needed for tree-based models)")

# =========================================================
# TRAIN / VALIDATION / TEST SPLIT
# =========================================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"\nData split - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# =========================================================
# HANDLE CLASS IMBALANCE
# =========================================================
print("\nComputing class weights (LightGBM)...")
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
print(f"Class weights: {class_weights_dict}")

# =========================================================
# TRAIN LIGHTGBM MODEL
# =========================================================
print("\n" + "="*60)
print("🚀 Training LightGBM Model")
print("="*60)

lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weights_dict,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss'
)

print("✓ Model training completed")

# =========================================================
# VALIDATION PERFORMANCE
# =========================================================
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# =========================================================
# TEST PERFORMANCE
# =========================================================
print("\n" + "="*60)
print("📊 MODEL PERFORMANCE (TEST SET)")
print("="*60)

y_pred = lgbm.predict(X_test)
y_proba = lgbm.predict_proba(X_test)
np.save(os.path.join(RESULTS_DIR, "lgbm_probabilities.npy"), y_proba)

test_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Save classification report
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv(os.path.join(RESULTS_DIR, "lgbm_classification_report.csv"))
print("\nDetailed per-class report saved.")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm).to_csv(os.path.join(RESULTS_DIR, "lgbm_confusion_matrix.csv"))
print("Confusion matrix saved.")

# =========================================================
# FEATURE IMPORTANCE
# =========================================================
print("\n" + "="*60)
print("📈 TOP FEATURES CONTRIBUTING TO DECISIONS")
print("="*60)

feature_importances = pd.Series(
    lgbm.feature_importances_,
    index=feature_names
).sort_values(ascending=False)

print(feature_importances.head(15))
feature_importances.to_csv(os.path.join(RESULTS_DIR, "lgbm_feature_importance.csv"))
print("Feature importances saved.")

# =========================================================
# ERROR ANALYSIS
# =========================================================
misclassified_idx = np.where(y_pred != y_test)[0]
error_pct = len(misclassified_idx) / len(y_test) * 100
print(f"\nMisclassified samples: {len(misclassified_idx)} ({error_pct:.2f}%)")

# =========================================================
# SAVE MODEL
# =========================================================
print("\n" + "="*60)
print("💾 SAVING MODEL ARTIFACTS")
print("="*60)

MODEL_VERSION = "1.3.0"
MODEL_PATH = os.path.join(RESULTS_DIR, f"lgbm_url_classifier_v{MODEL_VERSION}.pkl")

metadata = {
    'version': MODEL_VERSION,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'test_accuracy': float(test_accuracy),
    'precision_weighted': float(precision),
    'recall_weighted': float(recall),
    'f1_weighted': float(f1),
    'feature_count': X.shape[1],
    'training_samples': len(X_train),
    'class_distribution': dict(pd.Series(y_train).value_counts()),
    'hyperparameters': lgbm.get_params(),
    'notes': 'Tree-based model (no feature scaling required)'
}

model_artifact = {
    'model': lgbm,
    'metadata': metadata,
    'feature_names': feature_names
}

joblib.dump(model_artifact, MODEL_PATH)
print(f"✓ Model saved to: {MODEL_PATH}")

# =========================================================
# FINAL SUMMARY
# =========================================================
summary_path = os.path.join(RESULTS_DIR, "lgbm_summary.txt")
with open(summary_path, "w") as f:
    f.write(f"LightGBM Model Summary (v{MODEL_VERSION})\n")
    f.write("="*60 + "\n")
    f.write(f"Accuracy: {test_accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write(f"Validation Accuracy: {val_accuracy:.4f}\n")
    f.write(f"Total features: {len(feature_names)}\n")
    f.write(f"Saved model path: {MODEL_PATH}\n")

print(f"\n✅ All results and artifacts saved in: {RESULTS_DIR}")
print("Training completed successfully.")


✅ Results will be saved in: /content/drive/MyDrive/Webshield Dataset/LIGHTGBM 640k+40k+35K more branded phishing 

Dataset shape: (716118, 68)
Columns: ['action_word', 'brand_impersonation', 'brand_not_in_domain', 'brand_not_in_main_domain', 'brand_with_hyphen', 'consecutive_consonants', 'digit_letter_ratio', 'domain_bigram_diversity', 'domain_digit_ratio', 'domain_entropy', 'domain_has_digits', 'domain_is_dictionary_word', 'domain_length', 'domain_trigram_diversity', 'domain_url_ratio', 'domain_vowel_ratio', 'has_brand_name', 'has_character_substitution', 'has_ip', 'has_multiple_subdomains', 'has_port', 'host_entropy', 'is_country_tld', 'is_high_trust_tld', 'is_mixed_case', 'is_shortening_service', 'is_suspicious_tld', 'is_typosquatting', 'longest_token_length', 'multiple_brands_in_domain', 'num_digits', 'num_dots', 'num_encoded_chars', 'num_fragments', 'num_hyphens', 'num_letters', 'num_path_segments', 'num_query_params', 'num_repeated_chars', 'num_special_chars', 'num_subdomains', '

In [13]:
# PREDICTOIN ON BENIGN

import re, math, joblib, pandas as pd
from urllib.parse import urlparse, unquote
from tldextract import extract
from collections import Counter

# ---------- Helper ----------
def entropy(s):
    if not s or not isinstance(s, str):
        return 0
    s = ''.join(c for c in s if 32 <= ord(c) <= 126)
    if not s:
        return 0
    p = [s.count(c) / len(s) for c in set(s)]
    return -sum(pi * math.log2(pi) for pi in p if pi > 0)

def vowel_consonant_ratio(s):
    vowels = sum(1 for c in s.lower() if c in 'aeiou')
    consonants = sum(1 for c in s.lower() if c.isalpha() and c not in 'aeiou')
    return vowels / consonants if consonants > 0 else 0

def count_ngrams(s, n=2):
    if len(s) < n: return 0
    ngrams = [s[i:i+n] for i in range(len(s)-n+1)]
    return len(set(ngrams))

def get_tld_category(tld):
    high_trust = ['com','org','net','edu','gov','mil','in','co.in','ac.in','gov.in']
    suspicious = ['tk','ml','ga','cf','gq','pw','cc','top','xyz','club','work','buzz','loan']
    if tld in high_trust: return 2
    if tld in suspicious: return 0
    return 1

def longest_repeated_char(s):
    if not s: return 0
    max_count = count = 1
    for i in range(1, len(s)):
        if s[i] == s[i-1]:
            count += 1
            max_count = max(max_count, count)
        else:
            count = 1
    return max_count


# ---------- Feature Extraction ----------
def extract_features(url):
    if not re.match(r'^[a-zA-Z]+://', url):
        url = 'http://' + url

    p = urlparse(url)
    ext = extract(url)
    domain, subdomain, suffix = ext.domain or '', ext.subdomain or '', ext.suffix or ''
    host, path, q = p.netloc.lower(), p.path, p.query
    full = unquote(url)

    features = {
        'url_length': len(full),
        'num_dots': full.count('.'),
        'num_hyphens': full.count('-'),
        'num_underscores': full.count('_'),
        'num_digits': sum(c.isdigit() for c in full),
        'num_letters': sum(c.isalpha() for c in full),
        'num_special_chars': sum(full.count(c) for c in ['@','?','=','%','&','!','+','$']),
        'has_ip': int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', host))),
        'num_subdomains': len(subdomain.split('.')) if subdomain else 0,
        'has_multiple_subdomains': int(len(subdomain.split('.')) >= 3),
        'domain_length': len(domain),
        'host_entropy': entropy(domain),
        'domain_entropy': entropy(domain),
        'domain_has_digits': int(any(c.isdigit() for c in domain)),
        'domain_digit_ratio': sum(c.isdigit() for c in domain) / len(domain) if len(domain) > 0 else 0,
        'domain_vowel_ratio': vowel_consonant_ratio(domain),
        'domain_bigram_diversity': count_ngrams(domain, 2) / len(domain) if len(domain) >= 2 else 0,
        'domain_trigram_diversity': count_ngrams(domain, 3) / len(domain) if len(domain) >= 3 else 0,
        'suspicious_prefix_suffix': int('-' in domain or domain.startswith('www-') or domain.startswith('m-')),
        'num_suspicious_symbols': sum(domain.count(c) for c in ['@', '!', '*']),
        'subdomain_length': len(subdomain),
        'domain_is_dictionary_word': int(domain.lower() in ['google','facebook','amazon','apple','microsoft','wikipedia','paypal','youtube','twitter','linkedin','instagram','flipkart','zomato','swiggy','nykaa','icici','hdfc','sbi','axis']),
        'tld_length': len(suffix),
        'tld_trust_category': get_tld_category(suffix.lower()),
        'is_suspicious_tld': int(suffix.lower() in ['tk','ml','ga','cf','gq','pw','cc','top','xyz','club','work','buzz','loan']),
        'is_high_trust_tld': int(suffix.lower() in ['com','org','net','edu','gov','mil','in','co.in','ac.in','gov.in']),
        'is_country_tld': int(len(suffix) == 2 and suffix.isalpha()),
        'path_length': len(path),
        'num_path_segments': len([p for p in path.split('/') if p]),
        'num_query_params': len(q.split('&')) if q else 0,
        'query_length': len(q),
        'num_encoded_chars': full.count('%'),
        'num_fragments': full.count('#'),
        'path_entropy': entropy(path),
        'path_has_suspicious_ext': int(any(ext in path.lower() for ext in ['.exe','.zip','.apk','.scr','.bat','.cmd'])),
        'query_has_redirect': int(any(word in q.lower() for word in ['redirect','url=','next=','continue=','return='])),
        'path_url_ratio': len(path)/len(full) if len(full)>0 else 0,
        'suspicious_word': int(any(w in full.lower() for w in ['login','secure','update','account','verify','confirm','click','bank','paypal','signin','password','urgent','suspended','locked','expire','reward','prize','winner','claim','free','wallet','kyc','blocked','reactivate'])),
        'num_suspicious_words': sum(1 for w in ['login','secure','update','account','verify','confirm','click','bank','paypal','signin','password','urgent','suspended','locked','expire','reward','prize','winner','claim','free','wallet','kyc','blocked','reactivate'] if w in full.lower()),
        'sensitive_word': int(any(w in full.lower() for w in ['bank','paypal','account','password','credit','card','wallet','upi'])),
        'action_word': int(any(w in full.lower() for w in ['click','verify','confirm','update','download','install'])),
        'has_brand_name': int(any(b in full.lower() for b in ['google','facebook','amazon','microsoft','apple','paypal','netflix','instagram','twitter','linkedin','youtube','yahoo','ebay','icici','hdfc','sbi','axis','swiggy','zomato'])),
        'brand_not_in_domain': int(any(b in full.lower() for b in ['google','facebook','amazon','apple','paypal','youtube']) and not any(b in domain.lower() for b in ['google','facebook','amazon','apple','paypal','youtube'])),
        'is_shortening_service': int(any(s in full for s in ['bit.ly','tinyurl','goo.gl','t.co','ow.ly','is.gd','buff.ly'])),
        'is_mixed_case': int(any(c.isupper() for c in full) and any(c.islower() for c in full)),
        'num_repeated_chars': longest_repeated_char(full),
        'longest_token_length': max((len(t) for t in re.split(r'[./?=&_-]', full) if t), default=0),
        'digit_letter_ratio': sum(c.isdigit() for c in full) / sum(c.isalpha() for c in full) if sum(c.isalpha() for c in full) > 0 else 0,
        'special_char_ratio': sum(1 for c in full if not c.isalnum()) / len(full) if len(full) > 0 else 0,
        'uppercase_ratio': sum(1 for c in full if c.isupper()) / len(full) if len(full) > 0 else 0,
        'consecutive_consonants': max((len(m.group()) for m in re.finditer(r'[bcdfghjklmnpqrstvwxyz]+', full.lower())), default=0),
        'url_entropy': entropy(full),
        'has_port': int(':' in host and not host.startswith('[')),
        'uses_https': int(p.scheme == 'https'),
        'punycode_domain': int('xn--' in domain),
        'subdomain_count_dot': subdomain.count('.') if subdomain else 0,
        'domain_url_ratio': len(domain)/len(full) if len(full)>0 else 0,
        'query_url_ratio': len(q)/len(full) if len(full)>0 else 0
    }

    return pd.DataFrame([features])


# ---------- Load Model ----------
artifact = joblib.load('/content/drive/MyDrive/Webshield Dataset/RF Results 640k+50k+35k More Branded Phishing Ones/rf_url_classifier_v1.3.0.pkl')
model = artifact['model']
features = artifact['feature_names']

urls = [
    "www.google.tk..https.com",
    "www.facebook.com",
    "www.youtube.com",
    "www.twitter.com",
    "www.instagram.com",
    "www.wikipedia.org",
    "www.amazon.com",
    "www.netflix.com",
    "www.linkedin.com",
    "https://www.google.com",
    "https://www.facebook129.232.23.com",
    "https://www.amazon.com",
    "https://www.microsoft.com",
    "https://www.apple.com",
    "https://www.github.com",
    "https://www.stackoverflow.com",
    "https://www.reddit.com",
    "https://www.flipkart.com",
    "https://www.paytm.com",
    "https://www.icicibank.com",
    "https://www.hdfcbank.com",
    "https://www.swiggy.com",
    "https://www.zomato.com",
    "https://www.google-login.tk",
    "https://www.paypal-secure.ml",
    "https://www.amazon-verify.ga",
    "https://www.facebook-recovery.cf",
    "https://secure-netflix-account.xyz",
    "https://www.apple-id-locked.top",
    "https://www.paypal.com.verify-account.com",
    "https://www.amazon.com-login.net",
    "https://secure-google.com",
    "https://www.facebook-help.com",
    "https://www.goog1e.com",
    "https://www.faceb00k.com",
    "https://www.microoft.com",
    "https://www.arnazon.com",
    "https://login-google.com",
    "https://accounts-google-secure.test",
    "https://google-secure-login.test",
    "https://paypal-secure-login.test",
    "https://secure-paypal-update.test",
    "https://signin-amazon.test",
    "https://amazon-secure-update.test",
    "https://appleid-recovery.test",
    "https://appleid-security.test",
    "https://netflix-support.test",
    "https://facebook-account-secure.test",
    "https://linkedin-security.test",
    "https://github-login.test",
    "https://microsoft-account.verify.test",
    "https://support-google.com.scam",
    "https://google.payments.verify.test",
    "https://paypai.com",
    "https://amzon-payments.com",
    "https://face-book-login.org",
    "http://update-paypal.info",
    "http://secure-paypal-login.info",
    "https://verify-paytm.secure.test",
    "https://icici-bank-login.test",
    "https://hdfc-bank-verify.test",
    "https://swiggy-support-login.test",
    "https://zomato-account-verify.test",
    "https://accounts.google.security-alert.test",
    "https://apple-support-login.test",
    "https://microsoft-update-account.test",
    "https://paypal-account-recovery.test",
    "https://amazon-billing-alert.test",
    "https://netflix-payment-issue.test",
    "https://facebook-verify-now.test",
    "https://linkedin-verify-account.test",
    "https://github-2fa-setup.test",
    "https://stackoverflow-login.test",
    "https://reddit-security-alert.test",
    "https://flipkart-payment-verify.test",
    "https://paytm-verify-now.test",
    "https://secure-icicibank-login.test",
    "https://hdfc-verify-account.test",
    "https://delivery-swiggy.verify.test",
    "https://zomato-verify-payment.test",
    "https://www-google-login.example",
    "https://paypal-confirm.example",
    "https://amazon-secure.example",
    "https://facebook-restore.example",
    "https://netflix-verify.example",
    "https://apple-account.example",
    "https://google-support.example",
    "https://paypal-support.example",
    "https://amazon-support.example",
    "https://facebook-support.example",
    "https://netflix-support.example",
    "https://phish-google.test",
    "https://phish-paypal.test",
    "https://phish-amazon.test",
    "https://secure-login-google.co",
    "https://secure-login-paypal.co",
    "https://login-amazon-secure.co",
    "http://verify-google-login.org",
    "http://verify-paypal-login.org",
    "http://verify-amazon-login.org",
]


# ---------- Run Predictions ----------
results = []
for url in urls:
    X = extract_features(url)
    for col in features:
        if col not in X.columns:
            X[col] = 0
    X = X[features]
    pred = model.predict(X)[0]
    proba = model.predict_proba(X)[0]
    results.append({'URL': url, 'Predicted': pred, 'Probabilities': dict(zip(model.classes_, proba))})

df_results = pd.DataFrame(results)
print(df_results)
df_results.to_csv("famous_url_predictions.csv", index=False)


                               URL Predicted  \
0         www.google.tk..https.com  phishing   
1                 www.facebook.com  phishing   
2                  www.youtube.com  phishing   
3                  www.twitter.com  phishing   
4                www.instagram.com  phishing   
..                             ...       ...   
95  https://secure-login-paypal.co  phishing   
96  https://login-amazon-secure.co  phishing   
97  http://verify-google-login.org  phishing   
98  http://verify-paypal-login.org  phishing   
99  http://verify-amazon-login.org  phishing   

                                        Probabilities  
0   {'benign': 0.23361583424647478, 'defacement': ...  
1   {'benign': 0.18621176977234996, 'defacement': ...  
2   {'benign': 0.1789960719938273, 'defacement': 0...  
3   {'benign': 0.19526379504649935, 'defacement': ...  
4   {'benign': 0.18468890165460167, 'defacement': ...  
..                                                ...  
95  {'benign': 0.09370786574118

In [5]:
!pip install tldextract


Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-3.0.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-3.0.1-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-3.0.1 tldextract-5.3.0


In [None]:
from IPython.display import display
for i in range(0, len(df_results), 50):
    display(df_results.iloc[i:i+50])


Unnamed: 0,URL,Predicted,Probabilities
0,www.google.com,phishing,"{'benign': 0.030185431073480238, 'defacement':..."
1,www.facebook.com,phishing,"{'benign': 0.0005442962039898519, 'defacement'..."
2,www.youtube.com,phishing,"{'benign': 0.03275617603324646, 'defacement': ..."
3,www.twitter.com,phishing,"{'benign': 0.03805893547989424, 'defacement': ..."
4,www.instagram.com,phishing,"{'benign': 0.0009180806652737231, 'defacement'..."
5,www.wikipedia.org,phishing,"{'benign': 0.000673696197627628, 'defacement':..."
6,www.reddit.com,phishing,"{'benign': 0.03309617778003734, 'defacement': ..."
7,www.amazon.com,phishing,"{'benign': 0.022444612302584172, 'defacement':..."
8,www.netflix.com,phishing,"{'benign': 0.05171961030327802, 'defacement': ..."
9,www.linkedin.com,phishing,"{'benign': 0.0010022433352467484, 'defacement'..."


Unnamed: 0,URL,Predicted,Probabilities
50,www.ft.com,phishing,"{'benign': 0.021397945169677645, 'defacement':..."
51,www.nationalgeographic.com,phishing,"{'benign': 0.00012468282013351225, 'defacement..."
52,www.researchgate.net,phishing,"{'benign': 0.0006666064569650584, 'defacement'..."
53,www.acm.org,phishing,"{'benign': 0.003961787268266451, 'defacement':..."
54,www.sciencedirect.com,phishing,"{'benign': 0.0004117697439476663, 'defacement'..."
55,www.arxiv.org,phishing,"{'benign': 0.0007793887596098537, 'defacement'..."
56,www.samsung.com,phishing,"{'benign': 0.022554089013883415, 'defacement':..."
57,www.intel.com,phishing,"{'benign': 0.0012189438901446014, 'defacement'..."
58,www.dell.com,phishing,"{'benign': 0.003948581330013873, 'defacement':..."
59,www.hp.com,phishing,"{'benign': 0.01535204834454217, 'defacement': ..."


Unnamed: 0,URL,Predicted,Probabilities
100,www.php.net,phishing,"{'benign': 0.009861047471100497, 'defacement':..."
101,www.mysql.com,phishing,"{'benign': 0.0007793887596098537, 'defacement'..."
102,www.postgresql.org,phishing,"{'benign': 0.0008867337940460282, 'defacement'..."
103,www.mongodb.com,phishing,"{'benign': 0.026155735137941494, 'defacement':..."
104,www.cloudflare.com,phishing,"{'benign': 0.0008842840625185488, 'defacement'..."
105,www.verisign.com,phishing,"{'benign': 0.0007767303014170193, 'defacement'..."
106,www.digicert.com,phishing,"{'benign': 0.0014959233172086386, 'defacement'..."
107,www.aws.amazon.com,phishing,"{'benign': 0.0004761493974926992, 'defacement'..."
108,www.azure.microsoft.com,phishing,"{'benign': 0.00018401524580659575, 'defacement..."
109,www.cloud.google.com,phishing,"{'benign': 0.00023172299794682009, 'defacement..."


Unnamed: 0,URL,Predicted,Probabilities
150,www.zomato.com,phishing,"{'benign': 0.030598226390323353, 'defacement':..."
151,www.ubereats.com,phishing,"{'benign': 0.0009704850755191934, 'defacement'..."
152,www.dominos.com,phishing,"{'benign': 0.026155735137941494, 'defacement':..."
153,www.pizzahut.com,phishing,"{'benign': 0.0018374301022528767, 'defacement'..."
154,www.mcdonalds.com,phishing,"{'benign': 0.0006689886936755397, 'defacement'..."
155,www.kfc.com,phishing,"{'benign': 0.008448519270339747, 'defacement':..."
156,www.burgerking.com,phishing,"{'benign': 0.00039676797633791114, 'defacement..."
157,www.starbucks.com,phishing,"{'benign': 0.0009180806652737231, 'defacement'..."
158,www.pepsi.com,phishing,"{'benign': 0.0014083709108840107, 'defacement'..."
159,www.coca-cola.com,phishing,"{'benign': 0.0018815013174856728, 'defacement'..."


In [10]:
import os
os.makedirs('/content/drive/MyDrive/Webshield Dataset/Prediction Results', exist_ok=True)


In [11]:
# =========================================================
# PREDICTION SCRIPT FOR DIFFERENT DATASET (WITH FEATURE EXTRACTION)
# =========================================================

import re, math, joblib, pandas as pd
from urllib.parse import urlparse, unquote
from tldextract import extract
from collections import Counter

# ---------- Helper Functions ----------
def entropy(s):
    if not s or not isinstance(s, str): return 0
    s = ''.join(c for c in s if 32 <= ord(c) <= 126)
    if not s: return 0
    p = [s.count(c) / len(s) for c in set(s)]
    return -sum(pi * math.log2(pi) for pi in p if pi > 0)

def vowel_consonant_ratio(s):
    vowels = sum(1 for c in s.lower() if c in 'aeiou')
    consonants = sum(1 for c in s.lower() if c.isalpha() and c not in 'aeiou')
    return vowels / consonants if consonants > 0 else 0

def count_ngrams(s, n=2):
    if len(s) < n: return 0
    ngrams = [s[i:i+n] for i in range(len(s)-n+1)]
    return len(set(ngrams))

def get_tld_category(tld):
    high_trust = ['com','org','net','edu','gov','mil','in','co.in','ac.in','gov.in']
    suspicious = ['tk','ml','ga','cf','gq','pw','cc','top','xyz','club','work','buzz','loan']
    if tld in high_trust: return 2
    if tld in suspicious: return 0
    return 1

def longest_repeated_char(s):
    if not s: return 0
    max_count = count = 1
    for i in range(1, len(s)):
        if s[i] == s[i-1]:
            count += 1
            max_count = max(max_count, count)
        else:
            count = 1
    return max_count


# ---------- Feature Extraction ----------
def extract_features(url):
    if not isinstance(url, str) or not url.strip():
        return pd.Series(dtype=float)
    if not re.match(r'^[a-zA-Z]+://', url):
        url = 'http://' + url

    p = urlparse(url)
    ext = extract(url)
    domain, subdomain, suffix = ext.domain or '', ext.subdomain or '', ext.suffix or ''
    host, path, q = p.netloc.lower(), p.path, p.query
    full = unquote(url)

    f = {
        'url_length': len(full),
        'num_dots': full.count('.'),
        'num_hyphens': full.count('-'),
        'num_underscores': full.count('_'),
        'num_digits': sum(c.isdigit() for c in full),
        'num_letters': sum(c.isalpha() for c in full),
        'num_special_chars': sum(full.count(c) for c in ['@','?','=','%','&','!','+','$']),
        'has_ip': int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', host))),
        'num_subdomains': len(subdomain.split('.')) if subdomain else 0,
        'has_multiple_subdomains': int(len(subdomain.split('.')) >= 3),
        'domain_length': len(domain),
        'host_entropy': entropy(domain),
        'domain_entropy': entropy(domain),
        'domain_has_digits': int(any(c.isdigit() for c in domain)),
        'domain_digit_ratio': sum(c.isdigit() for c in domain) / len(domain) if len(domain) > 0 else 0,
        'domain_vowel_ratio': vowel_consonant_ratio(domain),
        'domain_bigram_diversity': count_ngrams(domain, 2) / len(domain) if len(domain) >= 2 else 0,
        'domain_trigram_diversity': count_ngrams(domain, 3) / len(domain) if len(domain) >= 3 else 0,
        'suspicious_prefix_suffix': int('-' in domain or domain.startswith('www-') or domain.startswith('m-')),
        'num_suspicious_symbols': sum(domain.count(c) for c in ['@', '!', '*']),
        'subdomain_length': len(subdomain),
        'domain_is_dictionary_word': int(domain.lower() in ['google','facebook','amazon','apple','microsoft','paypal','youtube','twitter','linkedin','instagram','flipkart','zomato','swiggy','nykaa','icici','hdfc','sbi','axis']),
        'tld_length': len(suffix),
        'tld_trust_category': get_tld_category(suffix.lower()),
        'is_suspicious_tld': int(suffix.lower() in ['tk','ml','ga','cf','gq','pw','cc','top','xyz','club','work','buzz','loan']),
        'is_high_trust_tld': int(suffix.lower() in ['com','org','net','edu','gov','mil','in','co.in','ac.in','gov.in']),
        'is_country_tld': int(len(suffix) == 2 and suffix.isalpha()),
        'path_length': len(path),
        'num_path_segments': len([p for p in path.split('/') if p]),
        'num_query_params': len(q.split('&')) if q else 0,
        'query_length': len(q),
        'num_encoded_chars': full.count('%'),
        'num_fragments': full.count('#'),
        'path_entropy': entropy(path),
        'path_has_suspicious_ext': int(any(ext in path.lower() for ext in ['.exe','.zip','.apk','.scr','.bat','.cmd'])),
        'query_has_redirect': int(any(word in q.lower() for word in ['redirect','url=','next=','continue=','return='])),
        'path_url_ratio': len(path)/len(full) if len(full)>0 else 0,
        'suspicious_word': int(any(w in full.lower() for w in ['login','secure','update','account','verify','confirm','click','bank','paypal','signin','password','urgent','suspended','locked','expire','reward','prize','winner','claim','free','wallet','kyc','blocked','reactivate'])),
        'num_suspicious_words': sum(1 for w in ['login','secure','update','account','verify','confirm','click','bank','paypal','signin','password','urgent','suspended','locked','expire','reward','prize','winner','claim','free','wallet','kyc','blocked','reactivate'] if w in full.lower()),
        'sensitive_word': int(any(w in full.lower() for w in ['bank','paypal','account','password','credit','card','wallet','upi'])),
        'action_word': int(any(w in full.lower() for w in ['click','verify','confirm','update','download','install'])),
        'has_brand_name': int(any(b in full.lower() for b in ['google','facebook','amazon','microsoft','apple','paypal','netflix','instagram','twitter','linkedin','youtube','yahoo','ebay','icici','hdfc','sbi','axis','swiggy','zomato'])),
        'brand_not_in_domain': int(any(b in full.lower() for b in ['google','facebook','amazon','apple','paypal','youtube']) and not any(b in domain.lower() for b in ['google','facebook','amazon','apple','paypal','youtube'])),
        'is_shortening_service': int(any(s in full for s in ['bit.ly','tinyurl','goo.gl','t.co','ow.ly','is.gd','buff.ly'])),
        'is_mixed_case': int(any(c.isupper() for c in full) and any(c.islower() for c in full)),
        'num_repeated_chars': longest_repeated_char(full),
        'longest_token_length': max((len(t) for t in re.split(r'[./?=&_-]', full) if t), default=0),
        'digit_letter_ratio': sum(c.isdigit() for c in full) / sum(c.isalpha() for c in full) if sum(c.isalpha() for c in full) > 0 else 0,
        'special_char_ratio': sum(1 for c in full if not c.isalnum()) / len(full) if len(full) > 0 else 0,
        'uppercase_ratio': sum(1 for c in full if c.isupper()) / len(full) if len(full) > 0 else 0,
        'consecutive_consonants': max((len(m.group()) for m in re.finditer(r'[bcdfghjklmnpqrstvwxyz]+', full.lower())), default=0),
        'url_entropy': entropy(full),
        'has_port': int(':' in host and not host.startswith('[')),
        'uses_https': int(p.scheme == 'https'),
        'punycode_domain': int('xn--' in domain),
        'subdomain_count_dot': subdomain.count('.') if subdomain else 0,
        'domain_url_ratio': len(domain)/len(full) if len(full)>0 else 0,
        'query_url_ratio': len(q)/len(full) if len(full)>0 else 0
    }
    return pd.Series(f)

# ---------- Load Model ----------
artifact = joblib.load('/content/drive/MyDrive/Webshield Dataset/LIGHTBGM Results/lgbm_url_classifier_v1.3.0.pkl')
model = artifact['model']
features = artifact['feature_names']

# ---------- Load Dataset ----------
input_file = '/content/drive/MyDrive/Webshield Dataset/Dataset 2 test train/archive (1)/test_dataset.csv'  # must contain 'url' and optionally 'label'
df = pd.read_csv(input_file).head(10000)
print(f"Loaded {len(df)} URLs (first 10,000 only)")

# normalize column names
df.columns = [c.lower().strip() for c in df.columns]
if 'type' in df.columns:
    df.rename(columns={'type': 'label'}, inplace=True)
if 'label' not in df.columns:
    df['label'] = None

# ---------- Feature Extraction ----------
print("Extracting features...")
feature_data = df['url'].apply(extract_features)
for col in features:
    if col not in feature_data.columns:
        feature_data[col] = 0
feature_data = feature_data[features]

# ---------- Run Predictions ----------
print("Running predictions...")
df['res'] = model.predict(feature_data)

# ---------- Save Output ----------
output_df = df[['url', 'label', 'res']]
output_file = '/content/drive/MyDrive/Webshield Dataset/Prediction Results/predicted_urls_10000.csv'
output_df.to_csv(output_file, index=False)
print(f"✅ Saved predictions to: {output_file}")
print(output_df.head(10))


Loaded 10000 URLs (first 10,000 only)
Extracting features...
Running predictions...
✅ Saved predictions to: /content/drive/MyDrive/Webshield Dataset/Prediction Results/predicted_urls_10000.csv
                                 url  label         res
0                    spa-security.de      0    phishing
1                       mallander.de      0    phishing
2                       zzndb.com.cn      0    phishing
3                    enviroseal.com/      0    phishing
4      hunt1ngtonbank.3utilities.com      1    phishing
5              augenblickstudios.com      0    phishing
6  semidiceviprima.ilcannocchiale.it      0    phishing
7          nrgeology.blogspot.com.au      0    phishing
8  eng.uwo.ca/research/fluidization/      0    phishing
9              geocities.com/crinoo/      0  defacement


In [None]:
# EXTRA CHEKCING AND CONFIRMATION

import pandas as pd
import joblib

# Load model
model = joblib.load('lgbm_url_classifier_v1.3.0.pkl')

# Comprehensive test cases


# Extract features and predict
from feature_utils import extract_features_enhanced

results = []
for category, urls in test_cases.items():
    print(f"\n{'='*70}")
    print(f"{category}")
    print(f"{'='*70}")

    for url in urls:
        features = extract_features_enhanced(url)
        features_df = pd.DataFrame([features])

        prediction = model.predict(features_df)[0]
        probabilities = model.predict_proba(features_df)[0]
        confidence = max(probabilities) * 100

        status = "✅" if (category.startswith('✅') and prediction == 'benign') or \
                        (category.startswith('❌') and prediction != 'benign') else "❌"

        print(f"{status} {url:60s} → {prediction:12s} ({confidence:.1f}%)")

        results.append({
            'category': category,
            'url': url,
            'prediction': prediction,
            'confidence': confidence,
            'correct': status == "✅"
        })

# Summary
results_df = pd.DataFrame(results)
accuracy = results_df['correct'].sum() / len(results_df) * 100
print(f"\n{'='*70}")
print(f"📊 OVERALL TEST ACCURACY: {accuracy:.1f}%")
print(f"{'='*70}")
print(f"Correct: {results_df['correct'].sum()}/{len(results_df)}")
