# URL Phishing Classifier
# This notebook builds a simple classification model to detect phishing URLs


In [2]:
# Import required libraries
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')


## Load and Explore Dataset


In [3]:
# Load the dataset.
# The CSV location can be overridden with the LINKSHIELD_DATA_PATH environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset, the original hard-coded location is used unchanged.
data_path = os.environ.get(
    "LINKSHIELD_DATA_PATH",
    r"D:\Work\Projects\LinkShield\data\dataset_phishing.csv",
)
df = pd.read_csv(data_path)

# Structural overview: dimensions, column names and a preview of the first rows.
print("Dataset Information:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

# Row count per value of the target column.
print(f"\nTarget distribution:")
print(df['status'].value_counts())

# Total number of missing cells across the whole frame (a single integer).
print(f"\nMissing values:")
print(df.isnull().sum().sum())


Dataset Information:
Shape: (11430, 89)

Columns: ['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks', 'ratio_extHype

## Data Preprocessing


In [4]:
# Build the target vector and the feature matrix.
# 'status' is the label; 'url' is dropped together with it so that only the
# remaining columns are used as model features.
y = df['status']
X = df.drop(['status', 'url'], axis=1)

# Convert the string labels to integer codes (legitimate=0, phishing=1).
# The fitted encoder is kept so the codes can be mapped back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Summarise what will be fed to the model.
feature_names = X.columns.tolist()
print(f"Features shape: {X.shape}")
print(f"Target shape: {y_encoded.shape}")
print(f"\nFeature columns ({len(X.columns)}):")
print(feature_names)

# Show which integer code each class name was assigned.
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print(f"\nClass encoding: {dict(zip(class_names, class_codes))}")


Features shape: (11430, 87)
Target shape: (11430,)

Feature columns (87):
['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand', 'brand_in_subdomain', 'brand_in_path', 'suspecious_tld', 'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks

## Split Data into Training and Testing Sets


In [5]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The split is stratified on the encoded labels and uses a fixed seed so it is
# repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Per-class row counts for each partition.
for heading, partition_labels in (
    ("\nTraining set class distribution:", y_train),
    ("\nTest set class distribution:", y_test),
):
    print(heading)
    print(pd.Series(partition_labels).value_counts())


Training set size: (9144, 87)
Test set size: (2286, 87)

Training set class distribution:
0    4572
1    4572
Name: count, dtype: int64

Test set class distribution:
1    1143
0    1143
Name: count, dtype: int64


## Train Classification Model


In [6]:
# Initialize Random Forest Classifier (simple but effective model)
model = RandomForestClassifier(
    n_estimators=100,      # Number of trees
    max_depth=20,          # Maximum depth of trees
    min_samples_split=5,   # Minimum samples required to split a node
    min_samples_leaf=2,    # Minimum samples required at leaf node
    random_state=42,       # Fixed seed so the fitted forest is repeatable
    n_jobs=-1,             # Use all available cores
    verbose=1              # Show progress while the trees are being fitted
)

# Train the model
print("Training Random Forest Classifier...")
model.fit(X_train, y_train)

# The estimator's `verbose` setting is also honoured by predict() and
# predict_proba(), so leaving it at 1 makes every later prediction cell emit
# "[Parallel(...)]" progress lines. Progress output is only useful while
# fitting, so it is switched off once training has finished.
model.set_params(verbose=0)
print("\nTraining completed!")


Training Random Forest Classifier...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s



Training completed!


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished


## Model Evaluation


In [8]:
# Score the held-out set: hard class predictions, plus the probability the
# model assigns to the class encoded as 1 (column 1 of predict_proba).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of phishing class

# Headline metrics: accuracy uses the hard predictions, ROC-AUC the probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

rule = "=" * 60  # horizontal separator reused by every section header

print(rule)
print("MODEL EVALUATION RESULTS")
print(rule)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Per-class precision / recall / F1, labelled with the original class names.
print("\n" + rule)
print("Classification Report:")
print(rule)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix: row = actual class code, column = predicted class code.
print("\n" + rule)
print("Confusion Matrix:")
print(rule)
cm = confusion_matrix(y_test, y_pred)
actual_legit, actual_phish = cm[0], cm[1]
print(f"\n                Predicted")
print(f"              Legitimate  Phishing")
print(f"Actual Legitimate    {actual_legit[0]:6d}     {actual_legit[1]:6d}")
print(f"       Phishing      {actual_phish[0]:6d}     {actual_phish[1]:6d}")


MODEL EVALUATION RESULTS

Accuracy: 0.9615 (96.15%)
ROC-AUC Score: 0.9930

Classification Report:
              precision    recall  f1-score   support

  legitimate       0.96      0.96      0.96      1143
    phishing       0.96      0.97      0.96      1143

    accuracy                           0.96      2286
   macro avg       0.96      0.96      0.96      2286
weighted avg       0.96      0.96      0.96      2286


Confusion Matrix:

                Predicted
              Legitimate  Phishing
Actual Legitimate      1095         48
       Phishing          40       1103


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


## Feature Importance Analysis


In [9]:
# Pair every feature name with the importance reported by the fitted forest,
# then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Show only the leading rows, without the positional index column.
top_features = feature_importance.head(15)
print("\nTop 15 Most Important Features:")
print("=" * 60)
print(top_features.to_string(index=False))



Top 15 Most Important Features:
             feature  importance
        google_index    0.180353
           page_rank    0.103512
       nb_hyperlinks    0.090628
         web_traffic    0.086888
              nb_www    0.040548
 ratio_extHyperlinks    0.029987
          domain_age    0.028306
   longest_word_path    0.027288
         phish_hints    0.027056
         safe_anchor    0.026868
 ratio_intHyperlinks    0.025212
ratio_extRedirection    0.021485
    ratio_digits_url    0.019863
   longest_words_raw    0.015918
     length_hostname    0.015894


## Save the Model


In [10]:
# Save the trained model and label encoder.
# The output directory can be overridden with the LINKSHIELD_MODEL_DIR
# environment variable so the notebook is not tied to one machine's drive
# layout; when the variable is unset, the original location is used.
model_dir = os.environ.get(
    "LINKSHIELD_MODEL_DIR",
    r"D:\Work\Projects\LinkShield\models",
)

# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, "url_classifier.pkl")
encoder_path = os.path.join(model_dir, "label_encoder.pkl")

# Persist both artifacts: the encoder is needed to turn predicted integer
# codes back into class names when the model is loaded elsewhere.
joblib.dump(model, model_path)
joblib.dump(label_encoder, encoder_path)

print(f"\nModel saved to: {model_path}")
print(f"Label encoder saved to: {encoder_path}")



Model saved to: D:\Work\Projects\LinkShield\models\url_classifier.pkl
Label encoder saved to: D:\Work\Projects\LinkShield\models\label_encoder.pkl


## Test Predictions on Sample Data


In [11]:
# Spot-check a handful of held-out rows, addressed by position in the test split.
sample_indices = [0, 10, 20, 30, 40]
samples = X_test.iloc[sample_indices]
true_labels = y_test[sample_indices]
predictions = model.predict(samples)
probabilities = model.predict_proba(samples)

# Decode the integer codes back to class names, one call per array.
true_names = label_encoder.inverse_transform(true_labels)
pred_names = label_encoder.inverse_transform(predictions)

wide_rule = "=" * 80

print("\nSample Predictions:")
print(wide_rule)
for sample_no, (true_label, pred_label, class_probs) in enumerate(
    zip(true_names, pred_names, probabilities), start=1
):
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    prob_phishing = class_probs[1]
    if true_label == pred_label:
        verdict = '✓ CORRECT'
    else:
        verdict = '✗ WRONG'

    print(f"\nSample {sample_no}:")
    print(f"  True Label: {true_label}")
    print(f"  Predicted: {pred_label}")
    print(f"  Confidence (Phishing): {prob_phishing:.2%}")
    print(f"  Confidence (Legitimate): {(1-prob_phishing):.2%}")
    print(f"  Result: {verdict}")

# Closing summary, repeating the accuracy computed during evaluation.
print("\n" + wide_rule)
print(f"Model training and evaluation complete!")
print(f"Overall Accuracy: {accuracy*100:.2f}%")
print(wide_rule)



Sample Predictions:

Sample 1:
  True Label: phishing
  Predicted: phishing
  Confidence (Phishing): 97.14%
  Confidence (Legitimate): 2.86%
  Result: ✓ CORRECT

Sample 2:
  True Label: phishing
  Predicted: phishing
  Confidence (Phishing): 99.67%
  Confidence (Legitimate): 0.33%
  Result: ✓ CORRECT

Sample 3:
  True Label: legitimate
  Predicted: legitimate
  Confidence (Phishing): 3.19%
  Confidence (Legitimate): 96.81%
  Result: ✓ CORRECT

Sample 4:
  True Label: legitimate
  Predicted: legitimate
  Confidence (Phishing): 1.26%
  Confidence (Legitimate): 98.74%
  Result: ✓ CORRECT

Sample 5:
  True Label: phishing
  Predicted: phishing
  Confidence (Phishing): 93.44%
  Confidence (Legitimate): 6.56%
  Result: ✓ CORRECT

Model training and evaluation complete!
Overall Accuracy: 96.15%


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
