# Malicious URL Classifier - Cyber Carnival 2026
**Dataset:** malicious_phish.csv  
**Classes:** benign, phishing, defacement, malware  
**Balancing:** SMOTE (Synthetic Minority Oversampling Technique)

In [4]:
# ── IMPORTS ──────────────────────────────────
import pandas as pd
import numpy as np
import re
import time
import joblib
import warnings
warnings.filterwarnings('ignore')

from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, f1_score)
from imblearn.over_sampling import SMOTE

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

print('All imports successful!')
print('NOTE: If imblearn missing, run: pip install imbalanced-learn')

All imports successful!
NOTE: If imblearn missing, run: pip install imbalanced-learn


In [3]:
# ── SECTION 1: LOAD DATA ─────────────────────
# Print a banner, then read the labelled URL dataset into `df`.
banner_rule = '=' * 60
print(banner_rule)
print('MALICIOUS URL CLASSIFIER - Cyber Carnival 2026')
print(banner_rule)

df = pd.read_csv('malicious_phish.csv')
print(f'\n[1] Dataset loaded: {len(df):,} rows')

# Row count per label in the raw data, before any resampling.
print('\nClass Distribution (Before SMOTE):')
class_counts = df['type'].value_counts()
print(class_counts)

MALICIOUS URL CLASSIFIER - Cyber Carnival 2026

[1] Dataset loaded: 651,191 rows

Class Distribution (Before SMOTE):
type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [4]:
# ── SECTION 2: FEATURE ENGINEERING ──────────
print('\n[2] Extracting features...')

def extract_features(url):
    """Convert one URL into a flat dict of 24 lexical features.

    The input is coerced with ``str()``, so non-string values (e.g. ``None``
    or NaN) are featurised from their string representation rather than
    rejected. If the URL does not start with ``'http'``, ``'http://'`` is
    prepended for parsing only; every count and ratio is computed on the
    original string.

    Parameters
    ----------
    url : object
        The URL to featurise.

    Returns
    -------
    dict
        Feature name -> int/float. Key order is fixed, so a DataFrame built
        from these dicts always has the same column order.

    Notes
    -----
    * When ``urlparse`` rejects the URL (``ValueError``), the hostname is
      treated as empty and the whole URL is treated as the path.
    * ``num_subdomains`` is ``labels - 2`` and is therefore -1 for a
      single-label host such as ``localhost``.
    * ``has_hex_encoding`` only checks for the presence of ``'%'``.
    """
    url = str(url)
    try:
        parsed = urlparse(url if url.startswith('http') else 'http://' + url)
        hostname = parsed.hostname or ''
        path = parsed.path or ''
    except ValueError:
        # Only swallow parse failures. A bare `except:` would also trap
        # KeyboardInterrupt/SystemExit while looping over a large dataset.
        hostname = ''
        path = url

    # Values reused by several features are computed once.
    safe_len = max(len(url), 1)          # avoids division by zero on ''
    num_digits = sum(c.isdigit() for c in url)
    num_letters = sum(c.isalpha() for c in url)
    num_special = sum(not c.isalnum() for c in url)
    num_at = url.count('@')
    num_percent = url.count('%')
    url_lower = url.lower()
    suspicious_words = (
        'login', 'secure', 'account', 'update', 'bank', 'verify',
        'confirm', 'paypal', 'signin', 'ebay', 'admin', 'password',
    )

    features = {
        'url_length':          len(url),
        'hostname_length':     len(hostname),
        'path_length':         len(path),
        'num_dots':            url.count('.'),
        'num_hyphens':         url.count('-'),
        'num_underscores':     url.count('_'),
        'num_slashes':         url.count('/'),
        'num_at':              num_at,
        'num_question':        url.count('?'),
        'num_equals':          url.count('='),
        'num_ampersand':       url.count('&'),
        'num_percent':         num_percent,
        'num_digits':          num_digits,
        # Dotted-quad pattern anywhere in the URL (octet range is not validated).
        'has_ip':              1 if re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url) else 0,
        'has_https':           1 if url.startswith('https') else 0,
        'has_www':             1 if 'www.' in url else 0,
        'has_at_sign':         1 if num_at else 0,
        # Skip the first 7 characters (the length of 'http://') before looking for '//'.
        'has_double_slash':    1 if '//' in url[7:] else 0,
        'has_hex_encoding':    1 if num_percent else 0,
        'num_subdomains':      len(hostname.split('.')) - 2 if hostname else 0,
        'has_suspicious_word': 1 if any(w in url_lower for w in suspicious_words) else 0,
        'digit_ratio':         num_digits / safe_len,
        'letter_ratio':        num_letters / safe_len,
        'special_ratio':       num_special / safe_len,
    }
    return features

# Featurise every URL (one dict per row) and stack the dicts into the
# feature matrix X; the wall-clock time covers both steps.
t0 = time.time()
features_list = list(map(extract_features, df['url']))
X = pd.DataFrame(features_list)
elapsed = time.time() - t0
print(f'   Feature extraction done in {elapsed:.1f}s')
print(f'   Feature matrix: {X.shape}')


[2] Extracting features...
   Feature extraction done in 47.7s
   Feature matrix: (651191, 24)


In [5]:
# ── SECTION 3: ENCODE LABELS ─────────────────
# Fit the encoder on the string labels in df['type'], then convert the same
# column to integer codes; `le` is kept so codes can be mapped back to names.
le = LabelEncoder()
le.fit(df['type'])
y = le.transform(df['type'])

class_names = list(le.classes_)
print(f'[3] Classes: {class_names}')
print(f'    Encoded : {list(range(len(class_names)))}')

[3] Classes: ['benign', 'defacement', 'malware', 'phishing']
    Encoded : [0, 1, 2, 3]


In [6]:
# ── SECTION 4: TRAIN/TEST SPLIT ──────────────
# The hold-out set is carved off first, so the SMOTE step that follows only
# ever sees training rows and the test rows stay original data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f'[4] Train size: {len(X_train):,} | Test size: {len(X_test):,}')

# Per-class row counts of the training split, labelled with the class names.
print('\nTrain class distribution (Before SMOTE):')
unique, counts = np.unique(y_train, return_counts=True)
for cls, cnt in dict(zip(le.classes_, counts)).items():
    print(f'   {cls:<12} : {cnt:,}')

[4] Train size: 520,952 | Test size: 130,239

Train class distribution (Before SMOTE):
   benign       : 342,482
   defacement   : 77,165
   malware      : 26,016
   phishing     : 75,289


In [7]:
# ── SECTION 5: APPLY SMOTE ───────────────────
# Oversample the training split only; X_test / y_test are not touched here.
print('\n[5] Applying SMOTE to balance training data...')

smote = SMOTE(k_neighbors=5, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

n_before = len(X_train)
n_after = len(X_train_sm)
print(f'   Before SMOTE: {n_before:,} samples')
print(f'   After  SMOTE: {n_after:,} samples')

# Per-class row counts after resampling, labelled with the class names.
print('\nTrain class distribution (After SMOTE):')
unique, counts = np.unique(y_train_sm, return_counts=True)
for cls, cnt in dict(zip(le.classes_, counts)).items():
    print(f'   {cls:<12} : {cnt:,}')


[5] Applying SMOTE to balance training data...
   Before SMOTE: 520,952 samples
   After  SMOTE: 1,369,928 samples

Train class distribution (After SMOTE):
   benign       : 342,482
   defacement   : 342,482
   malware      : 342,482
   phishing     : 342,482


In [8]:
# ── SECTION 6: TRAIN MODELS ──────────────────
# Registry of candidate models keyed by display name; each one is fitted on
# the SMOTE-balanced training data and scored on the untouched test split.
models = {}
models['Random Forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)

results = {}
print('[6] Training models on SMOTE-balanced data...\n')

for name in models:
    model = models[name]
    print(f'  Training {name}...')

    # Time only the fit; prediction and scoring are excluded.
    t0 = time.time()
    model.fit(X_train_sm, y_train_sm)
    train_time = time.time() - t0

    # Score against the original (non-resampled) test data.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    results[name] = dict(
        model=model,
        y_pred=y_pred,
        accuracy=acc,
        f1=f1,
        time=train_time,
    )
    print(f'    Accuracy: {acc:.4f} | F1: {f1:.4f} | Time: {train_time:.1f}s')

[6] Training models on SMOTE-balanced data...

  Training Random Forest...
    Accuracy: 0.9453 | F1: 0.9462 | Time: 123.9s


In [None]:
# ── SECTION 7: SAVE BEST MODEL AS PKL ────────
# Select the entry with the highest weighted F1 (first one wins on ties),
# then persist the fitted model together with the label encoder.
best_name, best = max(results.items(), key=lambda item: item[1]['f1'])

artifacts = (
    (best['model'], 'best_model.pkl'),
    (le, 'label_encoder.pkl'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print(f'[7] Best model : {best_name}')
print(f'    Accuracy   : {best["accuracy"]:.4f}')
print(f'    F1 Score   : {best["f1"]:.4f}')
print('    Saved      : best_model.pkl')
print('    Saved      : label_encoder.pkl')

[7] Best model : Random Forest
    Accuracy   : 0.9453
    F1 Score   : 0.9462
    Saved      : best_model.pkl
    Saved      : label_encoder.pkl


In [10]:
# ── SECTION 8: DETAILED CLASSIFICATION REPORT ─
# Per-class precision / recall / F1 for the selected model, computed from the
# predictions stored in `best` against the original test labels.
print(f'[8] Best Model: {best_name} (F1={best["f1"]:.4f})')
print('\nClassification Report (evaluated on original test data):')
report_text = classification_report(
    y_test,
    best['y_pred'],
    target_names=le.classes_,
)
print(report_text)

[8] Best Model: Random Forest (F1=0.9462)

Classification Report (evaluated on original test data):
              precision    recall  f1-score   support

      benign       0.98      0.95      0.97     85621
  defacement       0.93      0.97      0.95     19292
     malware       0.97      0.93      0.95      6504
    phishing       0.81      0.90      0.85     18822

    accuracy                           0.95    130239
   macro avg       0.92      0.94      0.93    130239
weighted avg       0.95      0.95      0.95    130239



In [11]:
# ── SECTION 9: VISUALIZATIONS ────────────────
# Builds one 2x3 figure: class balance before/after SMOTE, model scores,
# normalised confusion matrix, per-class F1 and feature importances.
print('[9] Generating visualizations...')

fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Malicious URL Classifier (SMOTE Balanced) - Cyber Carnival 2026',
             fontsize=15, fontweight='bold')

# One colour per class, applied in le.classes_ order in every per-class panel.
colors = ['#2ecc71', '#e74c3c', '#f39c12', '#9b59b6']

# 9a. Class distribution BEFORE SMOTE
ax = axes[0, 0]
# value_counts() orders by frequency; reindex to le.classes_ so the bar order
# and colour of each class match panels 9b and 9e.
counts_orig = df['type'].value_counts().reindex(le.classes_)
bars = ax.bar(counts_orig.index, counts_orig.values, color=colors, edgecolor='black', linewidth=0.5)
ax.set_title('Class Distribution - Before SMOTE', fontweight='bold')
ax.set_xlabel('URL Type')
ax.set_ylabel('Count')
for bar, val in zip(bars, counts_orig.values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 500,
            f'{val:,}', ha='center', va='bottom', fontsize=9)

# 9b. Class distribution AFTER SMOTE
ax = axes[0, 1]
unique_sm, counts_sm = np.unique(y_train_sm, return_counts=True)
bars = ax.bar(le.classes_, counts_sm, color=colors, edgecolor='black', linewidth=0.5)
ax.set_title('Class Distribution - After SMOTE (Train)', fontweight='bold')
ax.set_xlabel('URL Type')
ax.set_ylabel('Count')
for bar, val in zip(bars, counts_sm):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 500,
            f'{val:,}', ha='center', va='bottom', fontsize=9)

# 9c. Model comparison (accuracy and weighted F1 side by side per model)
ax = axes[0, 2]
model_names = list(results.keys())
accs = [results[m]['accuracy'] for m in model_names]
f1s  = [results[m]['f1'] for m in model_names]
x = np.arange(len(model_names))
w = 0.35
ax.bar(x - w/2, accs, w, label='Accuracy', color='#3498db', edgecolor='black', linewidth=0.5)
ax.bar(x + w/2, f1s,  w, label='F1 Score', color='#e74c3c', edgecolor='black', linewidth=0.5)
ax.set_title('Model Performance Comparison', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels([m.replace(' ', '\n') for m in model_names])
ax.set_ylim(0.7, 1.0)
ax.legend()
ax.set_ylabel('Score')

# 9d. Confusion matrix, row-normalised so each row shows where that actual
# class's test samples were predicted.
ax = axes[1, 0]
cm = confusion_matrix(y_test, best['y_pred'])
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title(f'Confusion Matrix - {best_name}', fontweight='bold')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

# 9e. Per-class F1 score (average=None returns one score per class)
ax = axes[1, 1]
per_class_f1 = f1_score(y_test, best['y_pred'], average=None)
bars = ax.bar(le.classes_, per_class_f1, color=colors, edgecolor='black', linewidth=0.5)
ax.set_title(f'Per-Class F1 Score - {best_name}', fontweight='bold')
ax.set_xlabel('Class')
ax.set_ylabel('F1 Score')
ax.set_ylim(0, 1.1)
for bar, val in zip(bars, per_class_f1):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
            f'{val:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# 9f. Feature importance (tree models) or mean |coefficient| (linear models)
ax = axes[1, 2]
best_model = best['model']
if hasattr(best_model, 'feature_importances_'):
    fi = pd.Series(best_model.feature_importances_, index=X.columns)
    fi.nlargest(15).sort_values().plot(kind='barh', ax=ax, color='#27ae60',
                                       edgecolor='black', linewidth=0.5)
    ax.set_title(f'Top 15 Feature Importances - {best_name}', fontweight='bold')
    ax.set_xlabel('Importance')
else:
    coef = np.abs(best_model.coef_).mean(axis=0)
    fi = pd.Series(coef, index=X.columns)
    fi.nlargest(15).sort_values().plot(kind='barh', ax=ax, color='#27ae60',
                                       edgecolor='black', linewidth=0.5)
    ax.set_title('Top 15 Feature Coefficients', fontweight='bold')
    ax.set_xlabel('Mean |Coefficient|')

plt.tight_layout()
plt.savefig('results_visualization.png', dpi=150, bbox_inches='tight')
# NOTE(review): the imports cell selects the non-interactive 'Agg' backend, so
# plt.show() presumably renders nothing inline; the saved PNG is the artifact.
plt.show()
print('   Saved: results_visualization.png')

[9] Generating visualizations...
   Saved: results_visualization.png


In [12]:
# ── SECTION 10: SAVE SUMMARY REPORT ──────────
# The plain-text report is collected as a list of pieces (header, one entry
# per trained model, footer) and joined once before being written to disk.
summary_parts = [f"""
MALICIOUS URL CLASSIFIER - RESULTS SUMMARY
Cyber Carnival 2026 Hackathon
{'='*60}

DATASET
  Total samples : {len(df):,}
  Classes       : {', '.join(le.classes_)}
  Distribution  :
{df['type'].value_counts().to_string()}

SMOTE BALANCING
  Train samples before SMOTE : {len(X_train):,}
  Train samples after  SMOTE : {len(X_train_sm):,}
  Test set untouched         : {len(X_test):,} (original data only)

FEATURE ENGINEERING
  Total features : {X.shape[1]}

MODEL RESULTS (Trained on SMOTE | Tested on Original)
{'-'*50}
"""]

# One entry per trained model, in the order the models were registered.
for name, r in results.items():
    summary_parts.append(
        f"  {name}\n"
        f"    Accuracy  : {r['accuracy']:.4f} ({r['accuracy']*100:.2f}%)\n"
        f"    F1 Score  : {r['f1']:.4f}\n"
        f"    Train Time: {r['time']:.1f}s\n\n"
    )

# Footer: headline numbers for the winner plus its full per-class report.
summary_parts.append(f"""
BEST MODEL: {best_name}
  Accuracy : {best['accuracy']:.4f} ({best['accuracy']*100:.2f}%)
  F1 Score : {best['f1']:.4f}

CLASSIFICATION REPORT ({best_name}):
{classification_report(y_test, best['y_pred'], target_names=le.classes_)}
""")

summary = ''.join(summary_parts)

with open('results_summary.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(summary)

closing_rule = '='*60
print('[10] Saved: results_summary.txt')
print('\n' + closing_rule)
print(f'DONE. Best Model: {best_name} | Accuracy: {best["accuracy"]:.4f}')
print(closing_rule)

[10] Saved: results_summary.txt

DONE. Best Model: Random Forest | Accuracy: 0.9453


In [None]:
# ── SECTION 11: LOAD SAVED MODEL & PREDICT URL ─
# No retraining is needed, but this cell still relies on the imports cell and
# on extract_features (Section 2) having been run in the current kernel.
# NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
# .pkl files that this notebook produced or that come from a trusted source.

best_model = joblib.load('best_model.pkl')
le_loaded  = joblib.load('label_encoder.pkl')

print('[11] Model loaded from best_model.pkl')
print('=' * 60)


url_input = input('\nEnter a URL to classify (or "quit" to exit): ').strip()

# Branch instead of calling exit(): inside a Jupyter kernel exit() shuts the
# kernel down, which would discard every variable computed by earlier cells.
if url_input.lower() == 'quit':
    print('Exiting URL checker.')
elif not url_input:
    print('Please enter a valid URL.')
else:
    # Featurise with the same function used for training so the column names
    # and order match what the model was fitted on.
    input_features = extract_features(url_input)
    input_df = pd.DataFrame([input_features])

    prediction      = best_model.predict(input_df)[0]
    probabilities   = best_model.predict_proba(input_df)[0]
    predicted_class = le_loaded.inverse_transform([prediction])[0]

    print(f'\n  URL    : {url_input}')
    print(f'  Result : {predicted_class.upper()}')
    print(f'\n  Class Probabilities:')
    for cls, prob in zip(le_loaded.classes_, probabilities):
        # Text bar chart: up to 30 full-block characters (U+2588) per class.
        bar = chr(9608) * int(prob * 30)
        print(f'    {cls:<12} : {prob:.4f}  {bar}')

[11] Model loaded from best_model.pkl

  URL    : keygen-generator-free.net/download/patch.exe
  Result : BENIGN

  Class Probabilities:
    benign       : 0.8976  ██████████████████████████
    defacement   : 0.0139  
    malware      : 0.0161  
    phishing     : 0.0723  ██
