In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
#Machine Learning Models
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import time

In [2]:
# URL feature set: read the CSV, sort rows by the 'url' column, then expand the
# categorical 'tld_type' column into indicator columns prefixed with 'tld'.
df1 = (
    pd.read_csv('url_features.csv')
    .sort_values(by='url')
    .pipe(pd.get_dummies, columns=['tld_type'], prefix='tld')
)
# Features are everything except the identifier ('url') and the target ('label').
X1 = df1.drop(columns=['url', 'label'])
y1 = df1['label']

In [3]:
# HTML feature set: read the CSV and sort rows by the 'url' column.
df2 = pd.read_csv('new_html_features.csv')
df2 = df2.sort_values(by='url')
# Features are everything except the target ('label') and the identifier ('url').
X2 = df2.drop(columns=['label', 'url'])
y2 = df2['label']

In [4]:
# DOM feature set: read the CSV and sort rows by the 'url' column.
df3 = pd.read_csv('dom_features_output.csv')
df3 = df3.sort_values(by='url')
# Features are everything except the identifier ('url') and the target ('label').
X3 = df3.drop(columns=['url', 'label'])
y3 = df3['label']

In [5]:
# Combined feature set: read the CSV, sort rows by the 'url' column, then expand
# the categorical 'tld_type' column into indicator columns prefixed with 'tld'.
df4 = (
    pd.read_csv('combined_features.csv')
    .sort_values(by='url')
    .pipe(pd.get_dummies, columns=['tld_type'], prefix='tld')
)
# Features are everything except the identifier ('url') and the target ('label').
X4 = df4.drop(columns=['url', 'label'])
y4 = df4['label']

In [6]:
# Stratified 80/20 hold-out split of the URL feature set (fixed seed).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    stratify=y1,
    random_state=42,
)

In [7]:
# Stratified 80/20 hold-out split of the HTML feature set (fixed seed).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    stratify=y2,
    random_state=42,
)

In [8]:
# Stratified 80/20 hold-out split of the DOM feature set (fixed seed).
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3,
    y3,
    test_size=0.2,
    stratify=y3,
    random_state=42,
)

In [9]:
# Stratified 80/20 hold-out split of the combined feature set (fixed seed).
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X4,
    y4,
    test_size=0.2,
    stratify=y4,
    random_state=42,
)

In [10]:

# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are trained and reported. Every estimator that accepts a seed gets
# random_state=42; KNeighborsClassifier is built with its defaults.
models = {}
models["XGBoost"] = XGBClassifier(random_state=42)
models["LightGBM"] = LGBMClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["K-Nearest Neighbor"] = KNeighborsClassifier()
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)

In [11]:
# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def _fit_and_score_fold(model, X_fit, y_fit, X_val, y_val):
    """Fit ``model`` on one fold's training part and score it on its validation part.

    Returns a dict holding accuracy, support-weighted precision/recall/F1, the
    classification report as a DataFrame (one row per report entry), and the
    seconds spent in ``fit`` and in ``predict``.
    """
    # perf_counter is monotonic and high-resolution, so short predict calls are
    # measured reliably and system clock adjustments cannot skew the interval.
    fit_start = time.perf_counter()
    model.fit(X_fit, y_fit)
    fit_seconds = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred = model.predict(X_val)
    predict_seconds = time.perf_counter() - predict_start

    report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
    return {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_val, y_pred, average='weighted', zero_division=0),
        "f1": f1_score(y_val, y_pred, average='weighted', zero_division=0),
        "report_df": pd.DataFrame(report).transpose(),
        "fit_seconds": fit_seconds,
        "predict_seconds": predict_seconds,
    }


def _print_cv_summary(name, dataset_name, result, n_folds):
    """Print the CV averages and hold-out test metrics stored in ``result``."""
    print(f"\n{'='*70}\n{name} | DATASET: {dataset_name}")
    print(f"CV Average Accuracy     : {result['cv_avg_accuracy']:.4f}")
    print(f"CV Avg Precision        : {result['cv_avg_precision']:.4f}")
    print(f"CV Avg Recall           : {result['cv_avg_recall']:.4f}")
    print(f"CV Avg F1-Score         : {result['cv_avg_f1']:.4f}")
    print(f"Average Training Time   : {result['avg_training_time']:.4f} s")
    print(f"Average Prediction Time : {result['avg_prediction_time']:.4f} s")

    print(f"\nAverage Classification Report from {n_folds}-Fold CV:")
    print(result["cv_classification_report_df"])

    print(f"\nTest Set Accuracy (20%) : {result['test_accuracy']:.4f}")
    print("Test Set Confusion Matrix:")
    print(result["test_conf_matrix"])
    print("Test Set Classification Report:")
    print(result["test_class_report"])


def run_cv_block(X_train, y_train, X_test, y_test, models, dataset_name, cv=None):
    """Cross-validate every model on the training split, then score it on the test split.

    For each estimator in ``models`` this runs stratified K-fold CV on
    ``(X_train, y_train)``, averages the per-fold metrics and timings, refits
    the estimator on the full training split and evaluates it on
    ``(X_test, y_test)``. A summary for every model is printed once all models
    have been processed.

    Parameters
    ----------
    X_train, y_train : pandas objects indexed positionally via ``.iloc``.
    X_test, y_test : hold-out data used only for the final evaluation.
    models : dict mapping a display name to an estimator exposing
        ``fit``/``predict``. Estimators are fitted in place; after the call
        each one is left fitted on the full ``X_train``.
    dataset_name : str used in the printed headers.
    cv : optional splitter exposing ``split`` and ``get_n_splits``. Defaults
        to the notebook-level ``kfold`` splitter.

    Returns
    -------
    dict
        ``{model name: metrics dict}`` with keys ``cv_avg_accuracy``,
        ``cv_avg_precision``, ``cv_avg_recall``, ``cv_avg_f1``,
        ``cv_classification_report_df``, ``avg_training_time``,
        ``avg_prediction_time``, ``test_accuracy``, ``test_conf_matrix`` and
        ``test_class_report``.

    Notes
    -----
    Precision, recall and F1 are support-weighted averages
    (``average='weighted'``), so on imbalanced labels they mostly reflect the
    majority class; the per-class rows of the classification reports show the
    minority-class performance.
    """
    splitter = kfold if cv is None else cv
    # Report the real fold count instead of a hard-coded "5".
    n_folds = splitter.get_n_splits(X_train, y_train)

    print(f"\n\n===== {dataset_name} =====")
    cv_results = {}

    for name, model in models.items():
        print(f"\n{name} - {n_folds}-Fold CV on 80% Training Set")

        folds = [
            _fit_and_score_fold(
                model,
                X_train.iloc[train_idx],
                y_train.iloc[train_idx],
                X_train.iloc[val_idx],
                y_train.iloc[val_idx],
            )
            for train_idx, val_idx in splitter.split(X_train, y_train)
        ]

        # Average the classification reports entry-by-entry across folds.
        avg_report_df = (
            pd.concat([fold["report_df"] for fold in folds]).groupby(level=0).mean()
        )

        # Final model: refit on the whole training split, evaluate on the hold-out set.
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)

        cv_results[name] = {
            "cv_avg_accuracy": np.mean([fold["accuracy"] for fold in folds]),
            "cv_avg_precision": np.mean([fold["precision"] for fold in folds]),
            "cv_avg_recall": np.mean([fold["recall"] for fold in folds]),
            "cv_avg_f1": np.mean([fold["f1"] for fold in folds]),
            "cv_classification_report_df": avg_report_df,
            "avg_training_time": np.mean([fold["fit_seconds"] for fold in folds]),
            "avg_prediction_time": np.mean([fold["predict_seconds"] for fold in folds]),
            "test_accuracy": accuracy_score(y_test, y_test_pred),
            "test_conf_matrix": confusion_matrix(y_test, y_test_pred),
            # zero_division=0 matches the CV metrics and avoids warnings when a
            # class is never predicted; the reported numbers are unchanged.
            "test_class_report": classification_report(
                y_test, y_test_pred, zero_division=0
            ),
        }

    for name, result in cv_results.items():
        _print_cv_summary(name, dataset_name, result, n_folds)

    return cv_results

# Run the same CV + hold-out evaluation on each feature set; every call returns
# a dict of per-model metrics.
results_url = run_cv_block(
    X_train=X_train1, y_train=y_train1, X_test=X_test1, y_test=y_test1,
    models=models, dataset_name="URL",
)
results_html = run_cv_block(
    X_train=X_train2, y_train=y_train2, X_test=X_test2, y_test=y_test2,
    models=models, dataset_name="HTML",
)
results_dom = run_cv_block(
    X_train=X_train3, y_train=y_train3, X_test=X_test3, y_test=y_test3,
    models=models, dataset_name="DOM",
)
results_combined = run_cv_block(
    X_train=X_train4, y_train=y_train4, X_test=X_test4, y_test=y_test4,
    models=models, dataset_name="COMBINED",
)




===== URL =====

XGBoost - 5-Fold CV on 80% Training Set

LightGBM - 5-Fold CV on 80% Training Set
[LightGBM] [Info] Number of positive: 13511, number of negative: 432613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 774
[LightGBM] [Info] Number of data points in the train set: 446124, number of used features: 228
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030285 -> initscore=-3.466339
[LightGBM] [Info] Start training from score -3.466339
[LightGBM] [Info] Number of positive: 13512, number of negative: 432613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024295 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 782
[LightGB