In [7]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix
import joblib
import time
import string

df = pd.read_csv("malicious_phish.csv")

print(f"üìä Dataset shape: {df.shape}")
print(f"\nüè∑Ô∏è Class distribution:")
print(df['type'].value_counts())
print(f"\n{df['type'].value_counts(normalize=True) * 100}")

# Remove duplicates
print(f"\nüîç Duplicate URLs: {df['url'].duplicated().sum()}")
df = df.drop_duplicates(subset=['url'])
print(f"‚úÖ After removing duplicates: {len(df)} rows")

print(f"\nüìä Sample URLs:")
print(df.head(10))

üìä Dataset shape: (666192, 2)

üè∑Ô∏è Class distribution:
type
benign        435103
phishing       99111
defacement     97457
malware        34520
type               1
Name: count, dtype: int64

type
benign        65.311952
phishing      14.877243
defacement    14.628966
malware        5.181689
type           0.000150
Name: proportion, dtype: float64

üîç Duplicate URLs: 16283
‚úÖ After removing duplicates: 649909 rows

üìä Sample URLs:
                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
5  http://buzzfil.net/m/show-art/ils-etaient-loin...      benign
6      espn.go.com/nba/player/_/id/3457/brandon-rush      benign
7     yourbittorrent.com/?q=anthon

In [8]:
# ===============================
# üìå ENHANCED FEATURE ENGINEERING
# ===============================

def extract_advanced_features(url):
    """Extract comprehensive features from URL with additional complexity"""
    url = str(url)
    
    # Parse URL components
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        path = parsed.path
    except:
        domain = ""
        path = ""
    
    features = {
        # Length features
        "url_length": len(url),
        "domain_length": len(domain),
        "path_length": len(path),
        
        # Character composition
        "num_digits": sum(c.isdigit() for c in url),
        "num_letters": sum(c.isalpha() for c in url),
        "num_specials": sum(c in ['@','-','?','=','%','/','&','#','.'] for c in url),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        "letter_ratio": sum(c.isalpha() for c in url) / len(url) if len(url) > 0 else 0,
        
        # Protocol & security
        "has_https": int("https" in url.lower()),
        "has_http": int("http://" in url.lower()),
        
        # Suspicious keywords
        "has_login": int(any(word in url.lower() for word in ["login", "signin", "account"])),
        "has_secure": int("secure" in url.lower()),
        "has_update": int("update" in url.lower()),
        "has_banking": int(any(word in url.lower() for word in ["bank", "paypal", "payment"])),
        "has_verify": int("verify" in url.lower() or "confirm" in url.lower()),
        
        # Structure features
        "num_dots": url.count('.'),
        "num_hyphens": url.count('-'),
        "num_underscores": url.count('_'),
        "num_slashes": url.count('/'),
        "num_questions": url.count('?'),
        "num_equals": url.count('='),
        "num_ats": url.count('@'),
        "num_ampersands": url.count('&'),
        
        # Domain features
        "num_subdomains": domain.count('.'),
        "has_ip": int(bool(re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))),
        
        # Entropy (randomness measure)
        "entropy": -sum((url.count(c)/len(url))*np.log2(url.count(c)/len(url)) 
                       for c in set(url)) if len(url) > 0 else 0,
        
        # Suspicious patterns
        "has_double_slash": int('//' in url[8:]),
        "has_port": int(':' in domain),
        "abnormal_tld": int(url.endswith(('.tk', '.ml', '.ga', '.cf', '.gq'))),
        
        # NEW: Additional complexity features
        "uppercase_count": sum(c.isupper() for c in url),
        "consecutive_digits": max([len(x) for x in re.findall(r'\d+', url)] or [0]),
        "consecutive_letters": max([len(x) for x in re.findall(r'[a-zA-Z]+', url)] or [0]),
        "special_char_ratio": sum(c in string.punctuation for c in url) / len(url) if len(url) > 0 else 0,
    }
    
    return features

print("\nüìä Extracting enhanced features...")
feature_df = pd.DataFrame([extract_advanced_features(u) for u in df["url"]])
print(f"‚úÖ Feature matrix shape: {feature_df.shape}")


üìä Extracting enhanced features...
‚úÖ Feature matrix shape: (649909, 33)


In [9]:
# ===============================
# üìå ADVANCED TF-IDF VECTORIZATION
# ===============================

print("\nüìù Creating advanced TF-IDF representation...")

# Character-level n-grams
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2,5),
    max_features=3000,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

X_tfidf = tfidf.fit_transform(df["url"])
print(f"‚úÖ TF-IDF matrix shape: {X_tfidf.shape}")

# Combine features
X_numeric = csr_matrix(feature_df.values)
X_all = hstack([X_tfidf, X_numeric], format='csr')
print(f"‚úÖ Combined feature matrix shape: {X_all.shape}")

# Encode labels
le = LabelEncoder()
y = le.fit_transform(df["type"])
print(f"\nüè∑Ô∏è Classes mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")


üìù Creating advanced TF-IDF representation...
‚úÖ TF-IDF matrix shape: (649909, 3000)
‚úÖ Combined feature matrix shape: (649909, 3033)

üè∑Ô∏è Classes mapping: {'benign': np.int64(0), 'defacement': np.int64(1), 'malware': np.int64(2), 'phishing': np.int64(3), 'type': np.int64(4)}


In [10]:
import xgboost as xgb_lib

print("\nüîÑ Performing full XGBoost cross-validation (5-fold)...")

dtrain = xgb_lib.DMatrix(X_all, label=y)

params = {
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "eval_metric": ["mlogloss", "merror"],
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

cv_results = xgb_lib.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    early_stopping_rounds=30,
    metrics=["mlogloss", "merror"],
    verbose_eval=25,
    seed=42,
    shuffle=True,
    as_pandas=True
)

best_round = len(cv_results)
print(f"\n‚úÖ Best CV round: {best_round}")
print(cv_results.tail(5))


üîÑ Performing full XGBoost cross-validation (5-fold)...




[0]	train-mlogloss:1.40741+0.00007	train-merror:0.06167+0.00040	test-mlogloss:1.40754+0.00019	test-merror:0.06224+0.00084
[25]	train-mlogloss:0.24516+0.00032	train-merror:0.04563+0.00014	test-mlogloss:0.24680+0.00141	test-merror:0.04621+0.00074
[50]	train-mlogloss:0.12880+0.00056	train-merror:0.03551+0.00029	test-mlogloss:0.13183+0.00123	test-merror:0.03638+0.00027
[75]	train-mlogloss:0.10156+0.00049	train-merror:0.03004+0.00014	test-mlogloss:0.10569+0.00133	test-merror:0.03136+0.00050
[100]	train-mlogloss:0.08836+0.00040	train-merror:0.02654+0.00017	test-mlogloss:0.09362+0.00127	test-merror:0.02824+0.00048
[125]	train-mlogloss:0.08008+0.00035	train-merror:0.02411+0.00020	test-mlogloss:0.08625+0.00119	test-merror:0.02607+0.00051
[150]	train-mlogloss:0.07407+0.00036	train-merror:0.02232+0.00021	test-mlogloss:0.08123+0.00111	test-merror:0.02468+0.00043
[175]	train-mlogloss:0.06935+0.00035	train-merror:0.02090+0.00023	test-mlogloss:0.07745+0.00107	test-merror:0.02351+0.00049
[200]	train-m

In [14]:
# ===============================
# üìå DATA SPLITTING
# ===============================

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\nüìÇ Training set: {X_train.shape}")
print(f"üìÇ Test set: {X_test.shape}")
print(f"\nüìä Training set class distribution:")
for i, class_name in enumerate(le.classes_):
    count = (y_train == i).sum()
    print(f"  {class_name}: {count} ({count/len(y_train)*100:.1f}%)")


üìÇ Training set: (519926, 3033)
üìÇ Test set: (129982, 3033)

üìä Training set class distribution:
  benign: 345077 (66.4%)
  defacement: 76409 (14.7%)
  malware: 20514 (3.9%)
  phishing: 77926 (15.0%)
  type: 0 (0.0%)


In [15]:
# ===============================
# üìå ENHANCED XGBOOST MODEL
# ===============================

print("\nüöÄ Training ENHANCED XGBoost classifier...")
print("="*60)

# Enhanced parameters for better learning
xgb_final = XGBClassifier(
    n_estimators=best_round,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=len(le.classes_),
    eval_metric=['mlogloss', 'merror'],
    random_state=42,
    n_jobs=-1
)

start_time = time.time()

# Train the model
xgb_final.fit(X_all, y)

training_time = time.time() - start_time

print(f"\n‚úÖ Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")


üöÄ Training ENHANCED XGBoost classifier...

‚úÖ Training completed in 1777.94 seconds (29.63 minutes)


In [17]:
# ===============================
# üìå COMPREHENSIVE MODEL SUMMARY (No CV)
# ===============================

# Use the final trained model
y_pred = xgb_final.predict(X_test)
y_pred_proba = xgb_final.predict_proba(X_test)

print("\n" + "="*60)
print("üìä ENHANCED MODEL SUMMARY")
print("="*60)

# ------------------------------
# Overall performance
# ------------------------------
acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print(f"\n‚úÖ Overall Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print(f"‚úÖ F1-Score (Macro): {f1_macro:.4f}")
print(f"‚úÖ F1-Score (Weighted): {f1_weighted:.4f}")

# ------------------------------
# Classification report
# ------------------------------
unique_labels = sorted(list(set(y_test) | set(y_pred)))
target_names = le.inverse_transform(unique_labels)

print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names))

# ------------------------------
# Confusion matrix
# ------------------------------
print("\nüî¢ Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=unique_labels)
print(cm)

# ------------------------------
# Per-class metrics (accurate + stable)
# ------------------------------
print("\nüìà Per-Class Metrics:")
for i, class_label in enumerate(unique_labels):
    class_name = le.inverse_transform([class_label])[0]
    class_acc = cm[i, i] / cm[i].sum() if cm[i].sum() > 0 else 0
    class_f1 = f1_score((y_test == class_label), (y_pred == class_label))
    print(f"  {class_name:15s}: Accuracy={class_acc:.4f} ({class_acc*100:.2f}%), F1={class_f1:.4f}")

# ------------------------------
# Feature importance
# ------------------------------
print("\nüîç Top 20 Most Important Features:")
feature_names = list(feature_df.columns)
feature_importance = xgb_final.feature_importances_

# Slice last N elements for numeric features
feature_importance_numeric = feature_importance[-len(feature_names):]

top_features = sorted(
    zip(feature_names, feature_importance_numeric),
    key=lambda x: x[1],
    reverse=True
)[:20]

for feat, imp in top_features:
    print(f"  {feat:30s}: {imp:.6f}")

# ------------------------------
# Save final enhanced model
# ------------------------------
print("\nüíæ Saving enhanced model and preprocessors...")
joblib.dump(xgb_final, "url_detector_ENHANCED.pkl")
joblib.dump(tfidf, "url_tfidf_ENHANCED.pkl")
joblib.dump(le, "url_label_ENHANCED.pkl")

print("‚úÖ Enhanced model saved successfully!")
print("\nModel files:")
print("  - url_detector_ENHANCED.pkl")
print("  - url_tfidf_ENHANCED.pkl")
print("  - url_label_ENHANCED.pkl")



üìä ENHANCED MODEL SUMMARY

‚úÖ Overall Accuracy: 0.9922 (99.22%)
‚úÖ F1-Score (Macro): 0.9888
‚úÖ F1-Score (Weighted): 0.9921

üìã Classification Report:
              precision    recall  f1-score   support

      benign       0.99      1.00      0.99     86270
  defacement       1.00      1.00      1.00     19102
     malware       1.00      0.97      0.99      5129
    phishing       0.98      0.97      0.97     19481

    accuracy                           0.99    129982
   macro avg       0.99      0.98      0.99    129982
weighted avg       0.99      0.99      0.99    129982


üî¢ Confusion Matrix:
[[86055     0     0   215]
 [    0 19101     0     1]
 [    4     1  4997   127]
 [  655    14     0 18812]]

üìà Per-Class Metrics:
  benign         : Accuracy=0.9975 (99.75%), F1=0.9949
  defacement     : Accuracy=0.9999 (99.99%), F1=0.9996
  malware        : Accuracy=0.9743 (97.43%), F1=0.9870
  phishing       : Accuracy=0.9657 (96.57%), F1=0.9738

üîç Top 20 Most Important F