In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
#Machine Learning Models
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import time

from collections import Counter

In [2]:
#URL
df1 = pd.read_csv('url_features.csv').sort_values(by='url')
# df = pd.get_dummies(df, columns=['tld_type'], prefix='tld')
# X = df.drop(['url','label', 'tld_type'], axis=1)
# y = df['label']

In [3]:
#HTML
df2 = pd.read_csv('new_html_features.csv').sort_values(by='url')
# X = df.drop(['label', 'url'], axis=1)
# y = df['label']

In [4]:
df3 = pd.merge(df1, df2, on="url")

df3 = pd.get_dummies(df3, columns=['tld_type'], prefix='tld')
df3 = df3.rename(columns={'label_x': 'label'})
# X3 = df3.drop(['url','label', 'label_y'], axis=1)
# y3 = df3['label']

In [5]:
# 80,000
# n_legit = 77600
# n_phish = 2400


# 25% - 174,268
# n_legit = 168040
# n_phish = 5228


# 50% - 348,536
n_legit = 337080
n_phish = 10456

In [6]:
legit_df3 = df3[df3['label'] == 0]
phish_df3 = df3[df3['label'] == 1]

In [7]:
sampled_legit3 = legit_df3.sample(n=n_legit, random_state=42)
sampled_phish3 = phish_df3.sample(n=n_phish, random_state=42)

In [8]:
reduced_df3 = pd.concat([sampled_legit3, sampled_phish3]).sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
X3 = reduced_df3.drop(['url','label', 'label_y'], axis=1)
y3 = reduced_df3['label']

In [10]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, stratify=y3, test_size=0.2, random_state=42)

In [11]:
models = {
    "XGBoost": XGBClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "Random Forest": RandomForestClassifier( random_state=42),
    "K-Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier( random_state=42)
}

In [12]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv_block(X_train, y_train, X_test, y_test, models, dataset_name):
    print(f"\n\n===== {dataset_name} =====")
    cv_results = {}

    for name, model in models.items():
        print(f"\n{name} - 5-Fold CV on 80% Training Set")

        fold_accuracies = []
        fold_precisions = []
        fold_recalls = []
        fold_f1s = []
        fold_reports = []

        training_times = []
        prediction_times = []

        for train_idx, val_idx in kfold.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

            start_train = time.time()
            model.fit(X_train_fold, y_train_fold)
            end_train = time.time()

            start_pred = time.time()
            y_val_pred = model.predict(X_val_fold)
            end_pred = time.time()

            fold_accuracies.append(accuracy_score(y_val_fold, y_val_pred))
            fold_precisions.append(precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))
            fold_recalls.append(recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))
            fold_f1s.append(f1_score(y_val_fold, y_val_pred, average='weighted', zero_division=0))

            report = classification_report(y_val_fold, y_val_pred, output_dict=True, zero_division=0)
            fold_reports.append(pd.DataFrame(report).transpose())

            training_times.append(end_train - start_train)
            prediction_times.append(end_pred - start_pred)

        avg_report_df = pd.concat(fold_reports).groupby(level=0).mean()

        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)

        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_conf_matrix = confusion_matrix(y_test, y_test_pred)
        test_class_report = classification_report(y_test, y_test_pred)

        cv_results[name] = {
            "cv_avg_accuracy": np.mean(fold_accuracies),
            "cv_avg_precision": np.mean(fold_precisions),
            "cv_avg_recall": np.mean(fold_recalls),
            "cv_avg_f1": np.mean(fold_f1s),
            "cv_classification_report_df": avg_report_df,
            "avg_training_time": np.mean(training_times),
            "avg_prediction_time": np.mean(prediction_times),
            "test_accuracy": test_accuracy,
            "test_conf_matrix": test_conf_matrix,
            "test_class_report": test_class_report,
        }

    for name, result in cv_results.items():
        print(f"\n{'='*70}\n{name} | DATASET: {dataset_name}")
        print(f"CV Average Accuracy     : {result['cv_avg_accuracy']:.4f}")
        print(f"CV Avg Precision        : {result['cv_avg_precision']:.4f}")
        print(f"CV Avg Recall           : {result['cv_avg_recall']:.4f}")
        print(f"CV Avg F1-Score         : {result['cv_avg_f1']:.4f}")
        print(f"Average Training Time   : {result['avg_training_time']:.4f} s")
        print(f"Average Prediction Time : {result['avg_prediction_time']:.4f} s")
        
        print("\nAverage Classification Report from 5-Fold CV:")
        print(result["cv_classification_report_df"])

        print(f"\nTest Set Accuracy (20%) : {result['test_accuracy']:.4f}")
        print("Test Set Confusion Matrix:")
        print(result["test_conf_matrix"])
        print("Test Set Classification Report:")
        print(result["test_class_report"])

    return cv_results

results_combined = run_cv_block(X_train3, y_train3, X_test3, y_test3, models, "URL + HTML")



===== URL + HTML =====

XGBoost - 5-Fold CV on 80% Training Set

LightGBM - 5-Fold CV on 80% Training Set
[LightGBM] [Info] Number of positive: 6692, number of negative: 215730
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6652
[LightGBM] [Info] Number of data points in the train set: 222422, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030087 -> initscore=-3.473115
[LightGBM] [Info] Start training from score -3.473115
[LightGBM] [Info] Number of positive: 6692, number of negative: 215730
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6640
[