In [35]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from scipy.sparse import hstack, csr_matrix
import joblib
import time

In [36]:
df = pd.read_csv("malicious_phish.csv")  # Your combined CSV with original + synthetic data

print(f"üìä Dataset shape: {df.shape}")
print(f"\nüè∑Ô∏è Class distribution:")
print(df['type'].value_counts())
print(f"\n{df['type'].value_counts(normalize=True) * 100}")

# Remove duplicates
print(f"\nüîç Duplicate URLs: {df['url'].duplicated().sum()}")
df = df.drop_duplicates(subset=['url'])
print(f"‚úÖ After removing duplicates: {len(df)} rows")

print(f"\nüìä Sample URLs:")
print(df.head(10))

üìä Dataset shape: (666192, 2)

üè∑Ô∏è Class distribution:
type
benign        435103
phishing       99111
defacement     97457
malware        34520
type               1
Name: count, dtype: int64

type
benign        65.311952
phishing      14.877243
defacement    14.628966
malware        5.181689
type           0.000150
Name: proportion, dtype: float64

üîç Duplicate URLs: 16283
‚úÖ After removing duplicates: 649909 rows

üìä Sample URLs:
                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
5  http://buzzfil.net/m/show-art/ils-etaient-loin...      benign
6      espn.go.com/nba/player/_/id/3457/brandon-rush      benign
7     yourbittorrent.com/?q=anthon

In [40]:
print("üîç Checking class distribution...")
print(df['type'].value_counts())

# Remove classes with too few samples
min_samples = 10  # Need at least 10 samples per class
class_counts = df['type'].value_counts()
valid_classes = class_counts[class_counts >= min_samples].index

print(f"\n‚ö†Ô∏è Classes with < {min_samples} samples will be removed:")
print(class_counts[class_counts < min_samples])

# Filter dataset
df_clean = df[df['type'].isin(valid_classes)].copy()

print(f"\n‚úÖ Original dataset: {len(df)} rows")
print(f"‚úÖ Cleaned dataset: {len(df_clean)} rows")
print(f"\nüè∑Ô∏è Final class distribution:")
print(df_clean['type'].value_counts())

# Update df
df = df_clean

# %%
# ===============================
# üìå STEP 2: FEATURE ENGINEERING (RE-RUN)
# ===============================

def extract_advanced_features(url):
    """Extract comprehensive features from URL"""
    url = str(url)
    
    # Parse URL components
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        path = parsed.path
    except:
        domain = ""
        path = ""
    
    features = {
        # Length features
        "url_length": len(url),
        "domain_length": len(domain),
        "path_length": len(path),
        
        # Character composition
        "num_digits": sum(c.isdigit() for c in url),
        "num_letters": sum(c.isalpha() for c in url),
        "num_specials": sum(c in ['@','-','?','=','%','/','&','#','.'] for c in url),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        
        # Protocol & security
        "has_https": int("https" in url.lower()),
        "has_http": int("http://" in url.lower()),
        
        # Suspicious keywords
        "has_login": int(any(word in url.lower() for word in ["login", "signin", "account"])),
        "has_secure": int("secure" in url.lower()),
        "has_update": int("update" in url.lower()),
        "has_banking": int(any(word in url.lower() for word in ["bank", "paypal", "payment"])),
        "has_verify": int("verify" in url.lower() or "confirm" in url.lower()),
        
        # Structure features
        "num_dots": url.count('.'),
        "num_hyphens": url.count('-'),
        "num_underscores": url.count('_'),
        "num_slashes": url.count('/'),
        "num_questions": url.count('?'),
        "num_equals": url.count('='),
        "num_ats": url.count('@'),
        "num_ampersands": url.count('&'),
        
        # Domain features
        "num_subdomains": domain.count('.'),
        "has_ip": int(bool(re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url))),
        
        # Entropy (randomness measure)
        "entropy": -sum((url.count(c)/len(url))*np.log2(url.count(c)/len(url)) 
                       for c in set(url)) if len(url) > 0 else 0,
        
        # Suspicious patterns
        "has_double_slash": int('//' in url[8:]),  # after http://
        "has_port": int(':' in domain),
        "abnormal_tld": int(url.endswith(('.tk', '.ml', '.ga', '.cf', '.gq'))),
    }
    
    return features

print("\nüìä Extracting features from cleaned dataset...")
feature_df = pd.DataFrame([extract_advanced_features(u) for u in df["url"]])
print(f"‚úÖ Feature matrix shape: {feature_df.shape}")

üîç Checking class distribution...
type
benign        431347
phishing       97407
defacement     95511
malware        25643
type               1
Name: count, dtype: int64

‚ö†Ô∏è Classes with < 10 samples will be removed:
type
type    1
Name: count, dtype: int64

‚úÖ Original dataset: 649909 rows
‚úÖ Cleaned dataset: 649908 rows

üè∑Ô∏è Final class distribution:
type
benign        431347
phishing       97407
defacement     95511
malware        25643
Name: count, dtype: int64

üìä Extracting features from cleaned dataset...
‚úÖ Feature matrix shape: (649908, 28)


In [41]:
# %%
# ===============================
# üìå STEP 3: TF-IDF VECTORIZATION
# ===============================

print("\nüìù Vectorizing URLs with TF-IDF...")
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3,4),
    max_features=2000,
    min_df=3,
    max_df=0.95
)

X_tfidf = tfidf.fit_transform(df["url"])
print(f"‚úÖ TF-IDF matrix shape: {X_tfidf.shape}")

# Combine features (keep sparse format)
X_numeric = csr_matrix(feature_df.values)
X_all = hstack([X_tfidf, X_numeric], format='csr')
print(f"‚úÖ Combined feature matrix shape: {X_all.shape}")

# Encode labels
le = LabelEncoder()
y = le.fit_transform(df["type"])
print(f"\nüè∑Ô∏è Classes mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")



üìù Vectorizing URLs with TF-IDF...
‚úÖ TF-IDF matrix shape: (649908, 2000)
‚úÖ Combined feature matrix shape: (649908, 2028)

üè∑Ô∏è Classes mapping: {'benign': np.int64(0), 'defacement': np.int64(1), 'malware': np.int64(2), 'phishing': np.int64(3)}


In [42]:
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nüìÇ Training set: {X_train.shape}")
print(f"üìÇ Test set: {X_test.shape}")



üìÇ Training set: (519926, 2028)
üìÇ Test set: (129982, 2028)


In [43]:
print("\nüöÄ Training XGBoost classifier...")
print("="*60)

xgb = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)

start_time = time.time()
xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=50
)
training_time = time.time() - start_time

print(f"\n‚úÖ Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")



üöÄ Training XGBoost classifier...
[0]	validation_0-mlogloss:1.19586
[50]	validation_0-mlogloss:0.37059
[100]	validation_0-mlogloss:0.35438
[149]	validation_0-mlogloss:0.35685

‚úÖ Training completed in 209.05 seconds (3.48 minutes)


In [44]:
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

print("\n" + "="*60)
print("üìä MODEL EVALUATION RESULTS")
print("="*60)

# Overall accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\n‚úÖ Overall Accuracy: {acc:.4f} ({acc*100:.2f}%)")

# Detailed classification report
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix
print("\nüî¢ Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class accuracy
print("\nüìà Per-Class Accuracy:")
for i, class_name in enumerate(le.classes_):
    class_acc = cm[i, i] / cm[i].sum() if cm[i].sum() > 0 else 0
    print(f"  {class_name:15s}: {class_acc:.4f} ({class_acc*100:.2f}%)")

# Cross-validation score
print("\nüîÑ Cross-Validation Score (5-fold):")
cv_scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring='f1_macro')
print(f"  Mean F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")



üìä MODEL EVALUATION RESULTS

‚úÖ Overall Accuracy: 0.9751 (97.51%)

üìã Classification Report:
              precision    recall  f1-score   support

      benign       0.98      0.99      0.99     86270
  defacement       0.98      0.99      0.99     19102
     malware       0.99      0.91      0.95      5129
    phishing       0.95      0.89      0.92     19481

    accuracy                           0.98    129982
   macro avg       0.98      0.95      0.96    129982
weighted avg       0.97      0.98      0.97    129982


üî¢ Confusion Matrix:
[[85719     7     6   538]
 [   48 18991     2    61]
 [   89    66  4682   292]
 [ 1851   251    23 17356]]

üìà Per-Class Accuracy:
  benign         : 0.9936 (99.36%)
  defacement     : 0.9942 (99.42%)
  malware        : 0.9128 (91.28%)
  phishing       : 0.8909 (89.09%)

üîÑ Cross-Validation Score (5-fold):
  Mean F1-Score: 0.9609 (+/- 0.0007)


In [None]:
print("\nüîç Top 20 Most Important Features:")
feature_names = list(feature_df.columns)
feature_importance = xgb.feature_importances_[-len(feature_names):]

top_features = sorted(zip(feature_names, feature_importance), 
                     key=lambda x: x[1], reverse=True)[:20]
for feat, imp in top_features:
    print(f"  {feat:25s}: {imp:.4f}")

In [45]:
print("\nüíæ Saving model and preprocessors...")
joblib.dump(xgb, "url_detector_model_final.pkl")
joblib.dump(tfidf, "url_tfidf_vectorizer_final.pkl")
joblib.dump(le, "url_label_encoder_final.pkl")
print("‚úÖ Model saved successfully!")
print("\nFiles saved:")
print("  - url_detector_model_final.pkl")
print("  - url_tfidf_vectorizer_final.pkl")
print("  - url_label_encoder_final.pkl")



üíæ Saving model and preprocessors...
‚úÖ Model saved successfully!

Files saved:
  - url_detector_model_final.pkl
  - url_tfidf_vectorizer_final.pkl
  - url_label_encoder_final.pkl


In [46]:
def predict_url(url):
    """Predict if a URL is malicious"""
    # Normalize URL (remove protocol and www)
    normalized_url = url.replace('https://', '').replace('http://', '').replace('www.', '')
    if normalized_url.endswith('/') and normalized_url.count('/') == 1:
        normalized_url = normalized_url[:-1]
    
    # Extract features from ORIGINAL URL
    features = extract_advanced_features(url)
    feature_vec = pd.DataFrame([features])
    
    # TF-IDF on NORMALIZED URL
    tfidf_vec = tfidf.transform([normalized_url])
    
    # Combine
    X_numeric = csr_matrix(feature_vec.values)
    X = hstack([tfidf_vec, X_numeric], format='csr')
    
    # Predict
    pred = xgb.predict(X)[0]
    proba = xgb.predict_proba(X)[0]
    
    result = {
        "url": url,
        "normalized": normalized_url,
        "prediction": le.inverse_transform([pred])[0],
        "confidence": float(max(proba)),
        "probabilities": {
            class_name: float(prob) 
            for class_name, prob in zip(le.classes_, proba)
        }
    }
    return result

In [47]:
test_urls = [
    "https://www.google.com/",
    "https://www.facebook.com/",
    "https://chatgpt.com/",
    "https://www.amazon.com/",
    "http://paypal-verify.tk/login",
    "http://192.168.1.1/malware.exe",
    "http://apple-secure.ml/verify",
    "https://www.github.com/",
]

print("\n" + "="*70)
print("üß™ TESTING PREDICTIONS")
print("="*70)

for url in test_urls:
    result = predict_url(url)
    
    # Emoji
    if result['prediction'] == 'benign':
        emoji = "‚úÖ"
        status = "SAFE"
    elif result['prediction'] == 'phishing':
        emoji = "‚ö†Ô∏è"
        status = "PHISHING"
    elif result['prediction'] == 'malware':
        emoji = "üö®"
        status = "MALWARE"
    else:
        emoji = "‚ö°"
        status = "DEFACED"
    
    print(f"\n{emoji} {url}")
    print(f"   Normalized: {result['normalized']}")
    print(f"   Prediction: {status} ({result['confidence']:.1%})")
    
    # Show probabilities
    for class_name, prob in result['probabilities'].items():
        if prob > 0.01:
            bar = "‚ñà" * int(prob * 30)
            print(f"     {class_name:12s}: {prob:5.1%} {bar}")

print("\n" + "="*70)



üß™ TESTING PREDICTIONS

‚ö†Ô∏è https://www.google.com/
   Normalized: google.com
   Prediction: PHISHING (94.6%)
     benign      :  2.2% 
     malware     :  3.0% 
     phishing    : 94.6% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

‚ö†Ô∏è https://www.facebook.com/
   Normalized: facebook.com
   Prediction: PHISHING (94.1%)
     benign      :  2.5% 
     malware     :  2.9% 
     phishing    : 94.1% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

‚úÖ https://chatgpt.com/
   Normalized: chatgpt.com
   Prediction: SAFE (49.8%)
     benign      : 49.8% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
     malware     :  1.1% 
     phishing    : 48.7% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

‚ö†Ô∏è https://www.amazon.com/
   Normalized: amazon.com
   Prediction: PHISHING (91.6%)
     benign      :  4.8% ‚ñà
     malware     :  2.9% 
     phishing    : 91.6% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

In [50]:
new_url = "dropbox.com"

result = predict_url(new_url)

print("\n" + "="*60)
print(f"üîç URL: {result['url']}")
print(f"üéØ Prediction: {result['prediction'].upper()}")
print(f"üìä Confidence: {result['confidence']:.2%}")
print("\nüìà Probabilities:")
for class_name, prob in result['probabilities'].items():
    bar = "‚ñà" * int(prob * 50)
    print(f"  {class_name:12s}: {prob:6.2%} {bar}")
print("="*60)

# Safety status
if result['prediction'] == 'benign':
    print("‚úÖ SAFE - URL appears legitimate")
elif result['prediction'] == 'phishing':
    print("‚ö†Ô∏è PHISHING - Do NOT enter credentials!")
elif result['prediction'] == 'malware':
    print("üö® MALWARE - Do NOT visit this URL!")
else:
    print("‚ö° DEFACED - Website may be compromised")


üîç URL: dropbox.com
üéØ Prediction: PHISHING
üìä Confidence: 93.58%

üìà Probabilities:
  benign      :  1.34% 
  defacement  :  0.00% 
  malware     :  5.08% ‚ñà‚ñà
  phishing    : 93.58% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
‚ö†Ô∏è PHISHING - Do NOT enter credentials!


In [52]:
# %%
# üîç INVESTIGATE WHAT'S IN YOUR DATA
# ===============================

print("="*60)
print("CHECKING YOUR DATASET")
print("="*60)

# Check if synthetic data was actually added
print(f"\nTotal rows: {len(df)}")
print(f"\nClass distribution:")
print(df['type'].value_counts())

# Check if google.com exists as benign
print("\n" + "="*60)
print("CHECKING FOR 'google.com' IN DATASET")
print("="*60)

google_exact = df[df['url'] == 'google.com']
print(f"\nExact match 'google.com': {len(google_exact)} rows")
if len(google_exact) > 0:
    print(google_exact[['url', 'type']].head())

google_contains = df[df['url'].str.contains('google', case=False, na=False)]
print(f"\nContains 'google': {len(google_contains)} rows")
print(f"Distribution:")
print(google_contains['type'].value_counts())
print("\nSample google URLs:")
print(google_contains['url'].head(20).tolist())

# Check format of URLs
print("\n" + "="*60)
print("CHECKING URL FORMAT")
print("="*60)
print("\nSample benign URLs:")
print(df[df['type'] == 'benign']['url'].head(20).tolist())

print("\nSample phishing URLs:")
print(df[df['type'] == 'phishing']['url'].head(10).tolist())

# Check if synthetic URLs have protocols
has_protocol = df['url'].str.contains('http://', case=False, na=False) | df['url'].str.contains('https://', case=False, na=False)
print(f"\n‚ö†Ô∏è URLs with protocols (http:// or https://): {has_protocol.sum()}")

CHECKING YOUR DATASET

Total rows: 649908

Class distribution:
type
benign        431347
phishing       97407
defacement     95511
malware        25643
Name: count, dtype: int64

CHECKING FOR 'google.com' IN DATASET

Exact match 'google.com': 1 rows
               url    type
651244  google.com  benign

Contains 'google': 5231 rows
Distribution:
type
benign        3408
phishing      1526
malware        254
defacement      43
Name: count, dtype: int64

Sample google URLs:
['https://docs.google.com/spreadsheet/viewform?formkey=dGg2Z1lCUHlSdjllTVNRUW50TFIzSkE6MQ', 'http://drive-google-com.fanalav.com/6a7ec96d6a4b8b887e9f9ace81b40a99/', 'sites.google.com/a/woodplanning.com/www/', 'google.com/hostednews/afp/article/ALeqM5iK8qoGy6KCQ835kZ1ps-VBbCEmqg?docId=CNG.70c74f0238858f49a6b97a2c9ed0618b.71', 'groups.google.com/group/alt.conspiracy.jfk/browse_thread/thread/885ffad05b486021', 'http://thenextweb.com/google/2014/10/01/google-announces-10-price-cut-compute-engine-instances-google-drive-pass

In [53]:
# %%
# üîß CLEAN ALL URLs - REMOVE PROTOCOLS
# ===============================

print("üîß Cleaning all URLs in dataset...")

def clean_url(url):
    """Remove protocols and www from URLs"""
    url = str(url).strip()
    url = url.replace('https://', '').replace('http://', '').replace('www.', '')
    # Remove trailing slash only if no path
    if url.endswith('/') and url.count('/') == 1:
        url = url[:-1]
    return url

# Clean all URLs
df['url'] = df['url'].apply(clean_url)

print("‚úÖ URLs cleaned!")

# Check results
print(f"\nüîç URLs with protocols remaining: {df['url'].str.contains('http', case=False, na=False).sum()}")

# Check google.com again
google_exact = df[df['url'] == 'google.com']
print(f"\nExact 'google.com' matches: {len(google_exact)}")
if len(google_exact) > 0:
    print(google_exact['type'].value_counts())

# Check other domains
print("\nChecking major domains:")
for domain in ['facebook.com', 'amazon.com', 'youtube.com', 'github.com']:
    count = len(df[df['url'] == domain])
    if count > 0:
        types = df[df['url'] == domain]['type'].value_counts()
        print(f"  {domain}: {count} rows - {types.to_dict()}")

# Remove duplicates after cleaning
print(f"\nüîç Duplicates after cleaning: {df['url'].duplicated().sum()}")
df = df.drop_duplicates(subset=['url'], keep='first')
print(f"‚úÖ After deduplication: {len(df)} rows")

# Save
df.to_csv("malicious_phish_cleaned.csv", index=False)
print("\nüíæ Saved to: malicious_phish_cleaned.csv")

üîß Cleaning all URLs in dataset...
‚úÖ URLs cleaned!

üîç URLs with protocols remaining: 3510

Exact 'google.com' matches: 2
type
benign    2
Name: count, dtype: int64

Checking major domains:
  facebook.com: 2 rows - {'benign': 2}
  amazon.com: 2 rows - {'benign': 2}
  youtube.com: 1 rows - {'benign': 1}
  github.com: 1 rows - {'benign': 1}

üîç Duplicates after cleaning: 6686
‚úÖ After deduplication: 643222 rows

üíæ Saved to: malicious_phish_cleaned.csv


In [54]:
# %%
# üîÑ NOW RETRAIN THE MODEL WITH CLEANED DATA
# ===============================

print("\nüìä Extracting features from CLEANED dataset...")
feature_df = pd.DataFrame([extract_advanced_features(u) for u in df["url"]])
print(f"‚úÖ Feature matrix shape: {feature_df.shape}")

print("\nüìù Vectorizing URLs with TF-IDF...")
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3,4),
    max_features=2000,
    min_df=3,
    max_df=0.95
)

X_tfidf = tfidf.fit_transform(df["url"])
X_numeric = csr_matrix(feature_df.values)
X_all = hstack([X_tfidf, X_numeric], format='csr')

le = LabelEncoder()
y = le.fit_transform(df["type"])

print(f"‚úÖ Combined shape: {X_all.shape}")
print(f"üè∑Ô∏è Classes: {dict(zip(le.classes_, le.transform(le.classes_)))}")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y, test_size=0.2, random_state=42, stratify=y
)

# Train
print("\nüöÄ Training model on CLEANED data...")
xgb = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)

xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

# Evaluate
y_pred = xgb.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\n‚úÖ Accuracy: {acc:.4f} ({acc*100:.2f}%)")
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Save
joblib.dump(xgb, "url_detector_FINAL.pkl")
joblib.dump(tfidf, "url_tfidf_FINAL.pkl")
joblib.dump(le, "url_label_FINAL.pkl")
print("\nüíæ Model saved!")


üìä Extracting features from CLEANED dataset...
‚úÖ Feature matrix shape: (643222, 28)

üìù Vectorizing URLs with TF-IDF...
‚úÖ Combined shape: (643222, 2028)
üè∑Ô∏è Classes: {'benign': np.int64(0), 'defacement': np.int64(1), 'malware': np.int64(2), 'phishing': np.int64(3)}

üöÄ Training model on CLEANED data...
[0]	validation_0-mlogloss:1.25555
[50]	validation_0-mlogloss:0.70911
[100]	validation_0-mlogloss:0.69038
[149]	validation_0-mlogloss:0.68129

‚úÖ Accuracy: 0.8874 (88.74%)

üìã Classification Report:
              precision    recall  f1-score   support

      benign       0.88      0.97      0.92     85200
  defacement       0.96      0.88      0.92     19102
     malware       0.99      0.90      0.94      5118
    phishing       0.83      0.50      0.63     19225

    accuracy                           0.89    128645
   macro avg       0.91      0.82      0.85    128645
weighted avg       0.89      0.89      0.88    128645


üíæ Model saved!


In [57]:
# %%
# üß™ TEST THE FIXED MODEL
# ===============================

def predict_url_fixed(url):
    """Predict with cleaned URL"""
    normalized = clean_url(url)   # ‚úÖ use same cleaning function
    
    features = extract_advanced_features(normalized)  # ‚úÖ use cleaned url here
    feature_vec = pd.DataFrame([features])
    
    tfidf_vec = tfidf.transform([normalized])  # ‚úÖ same cleaned url
    X_numeric = csr_matrix(feature_vec.values)
    X = hstack([tfidf_vec, X_numeric], format='csr')
    
    pred = xgb.predict(X)[0]
    proba = xgb.predict_proba(X)[0]
    
    return {
        'prediction': le.inverse_transform([pred])[0],
        'confidence': float(max(proba)),
        'probabilities': dict(zip(le.classes_, proba))
    }


# Test
test_urls = [
    "https://www.google.com/",
    "https://www.facebook.com/",
    "https://www.amazon.com/",
    "http://paypal-verify.tk/login",
    "http://192.168.1.1/malware.exe",
]

print("\n" + "="*70)
print("üß™ TESTING FIXED MODEL")
print("="*70)

for url in test_urls:
    result = predict_url_fixed(url)
    emoji = "‚úÖ" if result['prediction'] == 'benign' else "‚ö†Ô∏è" if result['prediction'] == 'phishing' else "üö®"
    print(f"\n{emoji} {url}")
    print(f"   {result['prediction'].upper()} ({result['confidence']:.1%})")


üß™ TESTING FIXED MODEL

‚úÖ https://www.google.com/
   BENIGN (55.9%)

‚úÖ https://www.facebook.com/
   BENIGN (74.5%)

‚úÖ https://www.amazon.com/
   BENIGN (74.0%)

‚ö†Ô∏è http://paypal-verify.tk/login
   PHISHING (90.8%)

üö® http://192.168.1.1/malware.exe
   MALWARE (99.8%)
