In [59]:
import pandas as pd
import re, math, tldextract
from urllib.parse import urlparse
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import pickle

# 1. Load your dataset
df = pd.read_csv("malicious_phish.csv")  # Adjust file path
print("Initial class counts:\n", df['type'].value_counts())

Initial class counts:
 type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64


In [60]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


Label mapping: {'benign': np.int64(0), 'defacement': np.int64(1), 'malware': np.int64(2), 'phishing': np.int64(3)}


In [None]:
import re
import math
import tldextract
from urllib.parse import urlparse

TRUSTED_DOMAINS = {
    'google.com', 'youtube.com', 'facebook.com',
    'instagram.com', 'reddit.com', 'wikipedia.org',
    'twitter.com', 'amazon.com', 'linkedin.com',
    'netflix.com', 'microsoft.com', 'github.com',
    'paypal.com', 'apple.com', 'bing.com', 'chatgpt.com'
}

def extract_features_dict(url):
    try:
        p = urlparse(url)
        ext = tldextract.extract(url)
        domain = ext.domain or ""
        sub = ext.subdomain or ""
        full = ext.top_domain_under_public_suffix.lower()  # updated

        url_len = len(url)
        specials = len(re.findall(r'[^a-zA-Z0-9]', url))
        entropy = -sum((url.count(c)/url_len) * math.log2(url.count(c)/url_len) for c in set(url)) if url_len else 0

        features = {
            'url_length': url_len,
            'domain_length': len(domain),
            'path_length': len(p.path),
            'count_dot': url.count('.'),
            'count_hyphen': url.count('-'),
            'count_at': url.count('@'),
            'count_question': url.count('?'),
            'count_equal': url.count('='),
            'count_slash': url.count('/'),
            'has_login': int('login' in url.lower()),
            'has_bank': int('bank' in url.lower()),
            'has_verify': int('verify' in url.lower()),
            'has_ip': int(bool(re.match(r'(?:\d{1,3}\.){3}\d{1,3}', p.netloc))),
            'digit_ratio': sum(c.isdigit() for c in url) / url_len,
            'special_char_ratio': specials / url_len,
            'url_entropy': entropy,
            'has_https': int(p.scheme.lower() == 'https'),
            'subdomain_length': len(sub),
            'count_sensitive_words': sum(w in url.lower() for w in ['login','bank','verify','secure','account','update']),
            'is_free_hosting': int(any(h in url.lower() for h in ['000webhost','freenom','infinityfree'])),
            'is_shortened': int(any(s in url.lower() for s in ['bit.ly','tinyurl.com','goo.gl','ow.ly','is.gd'])),
            'brand_in_subdomain': int(any(b in sub.lower() for b in ['paypal','citi','facebook','google','amazon'])),
            'is_trusted': int(full in TRUSTED_DOMAINS)  
        }
        return features
    except Exception as e:
        print(f"[ERROR] Feature extraction failed: {e}")
        return None


In [62]:
# 3. Apply extractor
feat_rows = df['url'].apply(extract_features_dict)
feat_df = pd.DataFrame(feat_rows.tolist())
feat_df['type_encoded'] = df['type_encoded']
feat_df.dropna(inplace=True)

In [63]:
# Apply extractor (23 features!)
feat_rows = df['url'].apply(extract_features_dict)
feat_df = pd.DataFrame(feat_rows.tolist())

# If you had previously assigned labels, ensure using correct column:
feat_df['type_encoded'] = df.loc[feat_df.index, 'type_encoded']

# Drop incomplete entries
feat_df.dropna(inplace=True)


In [64]:
# 4. Split/train
X = feat_df.drop(['type_encoded'], axis=1)
y = feat_df['type_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print("Before SMOTE:", y_train.value_counts())
X_tr, y_tr = SMOTE(random_state=42).fit_resample(X_train, y_train)
print("After SMOTE:", y_tr.value_counts())
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric="mlogloss"
)
model.fit(X_train, y_train)


print(classification_report(y_test, model.predict(X_test)))


Before SMOTE: type_encoded
0    342482
1     77165
3     75289
2     26016
Name: count, dtype: int64
After SMOTE: type_encoded
0    342482
1    342482
3    342482
2    342482
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     85621
           1       0.94      0.98      0.96     19292
           2       0.97      0.86      0.91      6504
           3       0.89      0.81      0.85     18822

    accuracy                           0.95    130239
   macro avg       0.94      0.91      0.92    130239
weighted avg       0.95      0.95      0.95    130239



In [65]:
# 5. Save model
with open("phishcatcher_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Training complete. Model saved.")


✅ Training complete. Model saved.


In [66]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)  # do this again from your training notebook


In [67]:
print(extract_features_dict("https://google.com"))


{'url_length': 18, 'domain_length': 6, 'path_length': 0, 'count_dot': 1, 'count_hyphen': 0, 'count_at': 0, 'count_question': 0, 'count_equal': 0, 'count_slash': 2, 'has_login': 0, 'has_bank': 0, 'has_verify': 0, 'has_ip': 0, 'digit_ratio': 0.0, 'special_char_ratio': 0.2222222222222222, 'url_entropy': 3.5724312513221195, 'has_https': 1, 'subdomain_length': 0, 'count_sensitive_words': 0, 'is_free_hosting': 0, 'is_shortened': 0, 'brand_in_subdomain': 0, 'is_trusted': 1}


In [68]:
print(extract_features_dict("http://paypalsecure-login.fake.com"))


{'url_length': 34, 'domain_length': 4, 'path_length': 0, 'count_dot': 2, 'count_hyphen': 1, 'count_at': 0, 'count_question': 0, 'count_equal': 0, 'count_slash': 2, 'has_login': 1, 'has_bank': 0, 'has_verify': 0, 'has_ip': 0, 'digit_ratio': 0.0, 'special_char_ratio': 0.17647058823529413, 'url_entropy': 4.314972767530033, 'has_https': 0, 'subdomain_length': 18, 'count_sensitive_words': 2, 'is_free_hosting': 0, 'is_shortened': 0, 'brand_in_subdomain': 1, 'is_trusted': 0}


In [69]:
print("Model expects features:", model.n_features_in_)
print("Feature names in model:", model.get_booster().feature_names)


Model expects features: 23
Feature names in model: ['url_length', 'domain_length', 'path_length', 'count_dot', 'count_hyphen', 'count_at', 'count_question', 'count_equal', 'count_slash', 'has_login', 'has_bank', 'has_verify', 'has_ip', 'digit_ratio', 'special_char_ratio', 'url_entropy', 'has_https', 'subdomain_length', 'count_sensitive_words', 'is_free_hosting', 'is_shortened', 'brand_in_subdomain', 'is_trusted']
