In [19]:
# 📦 Imports
from google.colab import drive


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

In [27]:

# Mount Google Drive
drive.mount('/content/drive')

# Define dataset path
dataset_path = "/content/drive/My Drive/spamDetection/phishing.csv"

# Load dataset
# The encoding parameter is added to specify the correct encoding for the CSV file.
# 📊 Load dataset (assumes file is named exactly 'phishing.csv')
df = pd.read_csv(dataset_path)

# 🔍 Basic data overview
print("Dataset shape:", df.shape)
display(df.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset shape: (11430, 89)


Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,...,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,...,0.124759,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,...,0.33046,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,...,1.0,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0


In [30]:
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder # Import OrdinalEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# ✅ Drop target column and prepare features
X_selected = df.drop(columns=['status'])  # Assuming 'status' is the target
y = df['status']

# 🔀 Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, shuffle=True
)

# 🔢 Convert categorical features to numerical using OrdinalEncoder
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) # Create an OrdinalEncoder object with handle_unknown and unknown_value parameters
categorical_features = X_train.select_dtypes(include=['object']).columns # Select categorical columns

# Fit and transform on training data
X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
# Transform testing data using the same encoder
X_test[categorical_features] = encoder.transform(X_test[categorical_features])


# 🌳 Train Random Forest classifier
model = RandomForestClassifier(n_estimators=350, random_state=42)
model.fit(X_train, y_train) # Now the model should train without the ValueError

In [32]:
# 📈 Evaluation function
def evaluate_model(model, X, y_true, dataset_name="Dataset"):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_true)

    y_pred = model.predict(X)

    accuracy = accuracy_score(y_true, y_pred)
    print(f"\n📊 {dataset_name} Accuracy: {accuracy:.4f}")

    cm = confusion_matrix(y_true, y_pred)
    print(f"🧾 Confusion Matrix:\n{cm}")

    # ROC AUC Score (only if binary classification)
    if len(lb.classes_) == 2:
        roc_score = roc_auc_score(lb.transform(y_true), lb.transform(y_pred))
        print(f"🧠 ROC AUC Score: {roc_score:.4f}")

# 🧪 Evaluate on train and test sets
evaluate_model(model, X_train, y_train, "Training")
evaluate_model(model, X_test, y_test, "Testing")

# 💾 Save the model to file
with open("model_phishing_webpage_classifier.pkl", "wb") as f:
    pickle.dump(model, f)



📊 Training Accuracy: 1.0000
🧾 Confusion Matrix:
[[4558    0]
 [   0 4586]]
🧠 ROC AUC Score: 1.0000

📊 Testing Accuracy: 0.9698
🧾 Confusion Matrix:
[[1120   37]
 [  32 1097]]
🧠 ROC AUC Score: 0.9698
