In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

from sklearn.neighbors import KNeighborsClassifier

import re



In [3]:


# --- Configuration -------------------------------------------------------
# Relative path: resolved against the kernel's current working directory.
DATA_PATH = "phiusiil_phishing_url.csv"
target_col = "label"   # column holding the class label
URL_COL = "URL"        # column holding the raw URL string

# --- Load ----------------------------------------------------------------
df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message if the file does not have the columns the
# rest of the notebook indexes by name, instead of a bare KeyError later on.
missing_cols = [c for c in (target_col, URL_COL) if c not in df.columns]
if missing_cols:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {missing_cols}")

# A bare `df.head()` in the middle of a cell is evaluated and thrown away, so
# report the load explicitly instead.
print(f"Loaded {df.shape[0]:,} rows x {df.shape[1]} columns from {DATA_PATH}")


def engineer_url_features(df_in: pd.DataFrame, url_col: "str | None" = None) -> pd.DataFrame:
    """Return a copy of ``df_in`` with four URL-derived feature columns added.

    Added columns (all ``int64``):

    * ``URLRawLength``         - number of characters in the URL
    * ``NumDigitsInURL``       - number of digit characters
    * ``NumSpecialCharsInURL`` - number of characters from ``/ ? & = % _ - . @ ! :``
    * ``HasHTTPS``             - 1 if the URL starts with ``https://`` (any case), else 0

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame containing the URL column. It is not modified.
    url_col : str, optional
        Name of the URL column. Defaults to the module-level ``URL_COL``.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns plus the four features.

    Raises
    ------
    KeyError
        If the URL column is not present in ``df_in``.
    """
    col = URL_COL if url_col is None else url_col
    out = df_in.copy()

    # Treat missing URLs as empty strings; a plain astype(str) would turn NaN
    # into the literal text "nan" and report a length of 3 for it.
    raw = out[col]
    urls = raw.where(raw.notna(), "").astype(str)

    # Explicit int64 keeps the dtype identical on every platform, so dtype-based
    # column selection downstream sees all four features as numeric.
    out["URLRawLength"] = urls.str.len().astype("int64")
    out["NumDigitsInURL"] = urls.str.count(r"\d").astype("int64")
    # No "+" quantifier: every special character counts once, so "://" is 3, not 1.
    out["NumSpecialCharsInURL"] = urls.str.count(r"[/?&=%_\-.@!:]").astype("int64")
    # Require the full scheme separator so hosts like "httpsfake.com" do not match,
    # and lower-case first because URL schemes are case-insensitive.
    out["HasHTTPS"] = urls.str.lower().str.startswith("https://").astype("int64")
    return out


df = engineer_url_features(df)

# Columns kept out of the feature matrix:
#   * the target itself;
#   * the raw URL string - its signal is already captured by the engineered
#     columns, and as a string column it would otherwise be treated as a
#     categorical feature with (presumably) one category per row;
#   * a file-name identifier column, under whichever spelling is present.
drop_cols = [target_col, URL_COL]
drop_cols += [c for c in ("FILENAME", "FileName", "file_name") if c in df.columns]

X = df.drop(columns=drop_cols)
y = df[target_col]



FileNotFoundError: [Errno 2] No such file or directory: 'phiusiil_phishing_url.csv'

In [None]:
# --- Train / test split --------------------------------------------------
# Stratified so both splits keep the class balance of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Column typing ---------------------------------------------------------
# "number" selects every numeric width (int32, float32, ...), not just
# int64/float64. ColumnTransformer drops any column that is not listed, so a
# narrower filter would silently discard features.
numeric_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

unassigned = [c for c in X.columns if c not in numeric_features and c not in categorical_features]
if unassigned:
    print(f"WARNING: columns not used by the preprocessor (unsupported dtype): {unassigned}")

# --- Preprocessing ---------------------------------------------------------
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    # Categories unseen during fit are encoded as all-zeros instead of raising.
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# --- Candidate models and search spaces -----------------------------------
models = {
    "knn": KNeighborsClassifier()
}

# Keys use the "clf__" prefix, i.e. they address the pipeline step named "clf".
param_grids = {
    "knn": {
        "clf__n_neighbors": [3, 5, 11],
        "clf__weights": ["uniform", "distance"],
        "clf__metric": ["minkowski", "manhattan"]
    }
}

# --- Hyper-parameter search and test-set scoring ---------------------------
# Produces the two objects the evaluation cells below rely on:
#   best_models : {model name -> fitted pipeline refit with the best parameters}
#   results_df  : one row per model with the test_* metric columns
# NOTE(review): cv=5 and scoring="f1" are choices made here, and the metrics use
# scikit-learn's default positive class (label 1) - confirm that is the class
# of interest for this task.
best_models = {}
results = []
for name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("clf", estimator),
    ])
    search = GridSearchCV(
        pipeline,
        param_grid=param_grids[name],
        scoring="f1",
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    best_models[name] = search.best_estimator_
    y_pred = search.best_estimator_.predict(X_test)
    results.append({
        "model": name,
        "best_params": search.best_params_,
        "cv_f1": search.best_score_,
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_precision": precision_score(y_test, y_pred),
        "test_recall": recall_score(y_test, y_pred),
        "test_f1": f1_score(y_test, y_pred),
    })

results_df = pd.DataFrame(results)



In [None]:
def plot_confusion_matrix(cm, classes, title="Confusion matrix"):
    """Draw a confusion matrix as an annotated blue heatmap and show it.

    Parameters
    ----------
    cm : array-like
        Matrix of counts to draw. Cells are annotated with the integer
        format ``"d"``, so the values are assumed to be integers.
    classes : sequence of str
        Tick labels, applied to both the x axis and the y axis.
    title : str, default "Confusion matrix"
        Title placed above the heatmap.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


# Per-model evaluation on the held-out test split: a text classification
# report followed by a confusion-matrix heatmap.
# NOTE(review): the tick labels assume the target is encoded as 0 = phishing,
# 1 = legitimate and that the matrix rows/columns come out in that order -
# confirm against the dataset.
for model_name, fitted in best_models.items():
    heading = model_name.upper()
    predictions = fitted.predict(X_test)

    print(heading)
    print(classification_report(y_test, predictions, digits=4))

    plot_confusion_matrix(
        confusion_matrix(y_test, predictions),
        classes=["phishing (0)", "legitimate (1)"],
        title=f"{heading} - Confusion Matrix",
    )


# Test-set metric columns of `results_df` to compare across models.
metrics_to_plot = ["test_accuracy", "test_precision", "test_recall", "test_f1"]

fig, ax = plt.subplots(figsize=(8, 5))
for column in metrics_to_plot:
    # Legend entry is the column name without its "test_" prefix, capitalised.
    legend_label = column.replace("test_", "").capitalize()
    ax.plot(results_df["model"], results_df[column], marker="o", label=legend_label)

# The y axis is fixed to [0.5, 1.0]; scores below 0.5 fall outside the view.
ax.set_ylim(0.5, 1.0)
ax.set_title("Model Comparison on Test Set")
ax.set_xlabel("Model")
ax.set_ylabel("Score")
ax.legend()
ax.grid(True)
plt.show()

# Last expression of the cell: the results table, highest test F1 first.
results_df.sort_values("test_f1", ascending=False)