In [5]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, recall_score, roc_auc_score, precision_score, f1_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# Read the merged UNSW-NB15 CSV into a DataFrame.
file_path = "./Datasets/merged_UNSW_NB15.csv"

# 'label' is removed when present because it is treated as leaking the
# target; errors='ignore' makes the drop a no-op when the column is absent.
df = pd.read_csv(file_path).drop(columns=['label'], errors='ignore')

# Keep only rows that have a target value.
# NOTE(review): some UNSW-NB15 exports leave attack_cat empty for benign
# traffic; if that applies to this file, this filter removes those rows — confirm.
target_column = 'attack_cat'
df = df[df[target_column].notna()]

# Feature matrix: every remaining column except the target
X = df.drop(columns=[target_column])

# Integer-encode the target classes; le_target keeps the class-name mapping
le_target = LabelEncoder()
y = le_target.fit_transform(df[target_column])

# Integer-encode the categorical feature columns so the numeric imputer and
# scaler can process them. Columns that are already numeric are left as-is.
# The check is "not numeric" rather than "dtype == 'object'" because text
# columns are not guaranteed to have object dtype (pandas 3 infers a
# dedicated string dtype), and an unencoded text column would make the
# mean imputation fail.
for col in ["proto", "service", "state"]:
    if not pd.api.types.is_numeric_dtype(X[col]):
        le = LabelEncoder()
        # astype(str) turns missing values into a regular category instead of
        # mixing NaN with strings inside the encoder
        X[col] = le.fit_transform(X[col].astype(str))

# Split into train (70%), evaluation (20%) and a small hold-out (10%).
# Row positions are split first so that the imputer and the scaler learn
# their statistics from the training rows only; fitting them on the full
# matrix would leak information from the held-out rows into preprocessing.
row_positions = np.arange(len(X))
train_rows, test_rows, y_train_val, y_test = train_test_split(
    row_positions, y, test_size=0.3, random_state=42, stratify=y
)

# Impute and scale features using training-set statistics, then apply the
# fitted transforms to every row so X_imputed / X_scaled stay available.
imputer = SimpleImputer(strategy='mean').fit(X.iloc[train_rows])
X_imputed = imputer.transform(X)
scaler = StandardScaler().fit(X_imputed[train_rows])
X_scaled = scaler.transform(X_imputed)

X_train_val = X_scaled[train_rows]
X_test = X_scaled[test_rows]

# Carve the 30% hold-out into X_test_final (2/3 of it, i.e. 20% of the data)
# and X_test_small (1/3 of it, i.e. 10% of the data).
X_test_final, X_test_small, y_test_final, y_test_small = train_test_split(
    X_test, y_test, test_size=1/3, random_state=42, stratify=y_test
)

# Fit the random forest on the training split. The hyperparameters are
# collected in one mapping so they can be read at a glance; random_state
# pins the seed of the forest.
rf_settings = {
    "n_estimators": 30,
    "max_depth": None,
    "max_features": 17,
    "random_state": 42,
}
clf = RandomForestClassifier(**rf_settings).fit(X_train_val, y_train_val)

# Score both held-out splits. Hard labels feed accuracy/recall/precision/F1;
# per-class probabilities feed the one-vs-rest ROC AUC.
y_pred_full = clf.predict(X_test_final)
y_prob_full = clf.predict_proba(X_test_final)
y_pred_small = clf.predict(X_test_small)

# "global_*" entries are computed on X_test_final; recall, precision and F1
# are macro-averaged over the classes.
metrics = {"global_accuracy": accuracy_score(y_test_final, y_pred_full)}
metrics["global_recall"] = recall_score(
    y_test_final, y_pred_full, average='macro'
)
metrics["global_roc_auc"] = roc_auc_score(
    y_test_final, y_prob_full, multi_class='ovr'
)
metrics["global_precision"] = precision_score(
    y_test_final, y_pred_full, average='macro'
)
metrics["global_f1_score"] = f1_score(
    y_test_final, y_pred_full, average='macro'
)
# Accuracy alone is reported for the smaller X_test_small split
metrics["10_percent_accuracy"] = accuracy_score(y_test_small, y_pred_small)

# Bare expression so the notebook cell displays the dictionary
metrics


{'global_accuracy': 0.8672526875460861,
 'global_recall': 0.6243773546290332,
 'global_roc_auc': np.float64(0.894564061270964),
 'global_precision': 0.6438974434263338,
 'global_f1_score': 0.6333688610664886,
 '10_percent_accuracy': 0.8681698230363242}