In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Configure pandas to display all columns
pd.set_option('display.max_columns', None)

In [3]:
# Define the path to our labeled dataset
DATA_PATH = 'D:/Documents/Projects/phishing-detection-challenge/data/processed/labeled_features.csv'

# Load the data into a DataFrame
df = pd.read_csv(DATA_PATH)

print("Dataset Shape:", df.shape)
print("\nLabel Distribution:")
print(df['is_phishing'].value_counts())
print("\nDataset Head:")
df.head()

Dataset Shape: (5000, 11)

Label Distribution:
is_phishing
0    4606
1     394
Name: count, dtype: int64

Dataset Head:


Unnamed: 0,url,domain,creation_date,url_length,domain_length,dots_count,hyphens_count,special_chars_count,domain_entropy,domain_age_days,is_phishing
0,xn--yonbusiess-14b6800g.sbi,xn--yonbusiess-14b6800g.sbi,,27,27,1,3,0,3.884155,9999.0,0
1,xn--sbicarc-tjb.com,xn--sbicarc-tjb.com,,19,19,1,3,0,3.64215,9999.0,0
2,clinic-poc.skills.ninja,clinic-poc.skills.ninja,,23,23,2,1,0,3.38162,9999.0,0
3,www.cloud1.visibleintel.com,www.cloud1.visibleintel.com,2025-08-15 23:42:23+00:00,27,27,3,0,0,3.828238,51.0,0
4,xn--hdfero-yua041a.com,xn--hdfero-yua041a.com,,22,22,1,3,0,4.061482,9999.0,0


In [4]:
# Define our target variable
target = 'is_phishing'

# Define our features by dropping non-numeric/identifier columns and the target
# We'll keep domain_age_days and domain_entropy but drop the raw date string
features_to_drop = ['url', 'domain', 'creation_date', target]
features = df.drop(columns=features_to_drop).columns

# Create our feature matrix (X) and target vector (y)
X = df[features]
y = df[target]

# Fill any remaining missing values with the median of their column
for col in X.columns:
    if X[col].isnull().any():
        median_val = X[col].median()
        X[col].fillna(median_val, inplace=True)
        print(f"Filled missing values in '{col}' with median value: {median_val}")

print("\nFeatures being used for training:")
print(features)


Features being used for training:
Index(['url_length', 'domain_length', 'dots_count', 'hyphens_count',
       'special_chars_count', 'domain_entropy', 'domain_age_days'],
      dtype='object')


In [5]:
# Split the data: 80% for training, 20% for testing
# stratify=y ensures the proportion of phishing/benign is the same in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 4000
Testing set size: 1000


In [6]:
# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)

print("--> Training the model...")
model.fit(X_train, y_train)
print("[+] Model training complete!")

--> Training the model...
[+] Model training complete!


In [7]:
# Make predictions on the unseen test data
y_pred = model.predict(X_test)

# Print the evaluation report
print("--- Model Evaluation Report ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print(classification_report(y_test, y_pred, target_names=['Benign (0)', 'Phishing (1)']))

--- Model Evaluation Report ---
Accuracy: 0.9980

              precision    recall  f1-score   support

  Benign (0)       1.00      1.00      1.00       921
Phishing (1)       1.00      0.97      0.99        79

    accuracy                           1.00      1000
   macro avg       1.00      0.99      0.99      1000
weighted avg       1.00      1.00      1.00      1000



In [10]:
# Cell 7: Analyze Feature Importances

# Create a pandas Series to view feature importances
feature_importances = pd.Series(model.feature_importances_, index=features)

# Sort the feature importances in descending order
sorted_importances = feature_importances.sort_values(ascending=False)

print("--- Top 10 Most Important Features ---")
print(sorted_importances.head(10))

# Optional: Plot the feature importances for a visual representation
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))
sns.barplot(x=sorted_importances, y=sorted_importances.index)
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.title("Feature Importance")
plt.show()


--- Top 10 Most Important Features ---
domain_length          0.317494
url_length             0.290833
dots_count             0.180530
domain_entropy         0.096571
hyphens_count          0.090509
domain_age_days        0.024064
special_chars_count    0.000000
dtype: float64


ModuleNotFoundError: No module named 'seaborn'

In [9]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.6-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp313-cp313-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.6-cp313-cp313-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.1 MB 6.2 MB/s eta 0:00:02
   --------- ------------------------------ 1.8/8.1 MB 4.2 MB/s eta 0:00:02
   ------------ ---------------------

In [11]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [13]:
# Cell 8: Save the Trained Model

import joblib

# Define the file path for the saved model
MODEL_PATH = '../src/models/phishing_detector_model.joblib'

# Save the model object to the file
joblib.dump(model, MODEL_PATH)

print(f"[+] Model saved successfully to: {MODEL_PATH}")

[+] Model saved successfully to: ../src/models/phishing_detector_model.joblib
