# Traditional Anomaly Detection with Hyperparameter Tuning

This notebook applies classical anomaly detection algorithms on UNSW-NB15 dataset and includes hyperparameter tuning using only **normal samples for training**.

In [1]:
# Install required packages
#!pip install -q scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 1. Connect to Drive
from google.colab import drive
drive.mount('/content/drive')

data_path = '/content/drive/MyDrive/projects/ae-vae-anomaly-detection/data/processed/cleaned.csv'

Mounted at /content/drive


In [3]:

# 2. Load Processed Data
import pandas as pd
cleaned_df = pd.read_csv(data_path)
print(f'Loaded cleaned data: {cleaned_df.shape[0]} rows, {cleaned_df.shape[1]} columns')

# get a small sample for training model
df = cleaned_df.sample(frac=0.1, random_state=42)
print(f'Sampled data: {df.shape[0]} rows, {df.shape[1]} columns')

# Train-test split
X = df.drop(columns=['label'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_full = cleaned_df.drop(columns=['label'])
y_full = cleaned_df['label']
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, random_state=42, stratify=y_full)
X_train_full_norm = X_train_full[y_train_full == 0]

# Use only normal samples for training
X_train_norm = X_train[y_train == 0]

Loaded cleaned data: 640788 rows, 178 columns
Sampled data: 64079 rows, 178 columns


In [4]:
# Helper to evaluate
def evaluate_model(y_true, y_pred):
    return {
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_pred),
        'conf_matrix': confusion_matrix(y_true, y_pred).tolist()
    }

In [5]:
# 1. One-Class SVM
params = {'nu': [0.01, 0.05, 0.1], 'kernel': ['rbf', 'sigmoid'], 'gamma': ['scale', 0.1]}
svm = GridSearchCV(OneClassSVM(), params, scoring='f1', cv=3)
svm.fit(X_train_norm)

In [6]:
pred_all = svm.best_estimator_.predict(X_test_full)
pred_all = np.where(pred_all == -1, 1, 0)
result_svm = evaluate_model(y_test_full, pred_all)

In [7]:
# 2. Isolation Forest
params = {'n_estimators': [50, 100], 'contamination': [0.01, 0.05]}
iso = GridSearchCV(IsolationForest(random_state=42), params, scoring='f1', cv=3)
iso.fit(X_train_norm)

In [8]:
pred_iso = iso.best_estimator_.predict(X_test_full)
pred_iso = np.where(pred_iso == -1, 1, 0)
result_iso = evaluate_model(y_test_full, pred_iso)

In [9]:
# 3. Elliptic Envelope
params = {'contamination': [0.01, 0.05]}
ell = GridSearchCV(EllipticEnvelope(), params, scoring='f1', cv=3)
ell.fit(X_train_norm)

In [10]:
pred_ell = ell.best_estimator_.predict(X_test_full)
pred_ell = np.where(pred_ell == -1, 1, 0)
result_ell = evaluate_model(y_test_full, pred_ell)

In [11]:
# 4. Local Outlier Factor (only evaluation)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=True)
lof.fit(X_train_norm)
pred_lof = lof.predict(X_test_full)
pred_lof = np.where(pred_lof == -1, 1, 0)
result_lof = evaluate_model(y_test_full, pred_lof)

In [12]:
# Print summary
print("One-Class SVM:", result_svm)
print("Isolation Forest:", result_iso)
print("Elliptic Envelope:", result_ell)
print("Local Outlier Factor:", result_lof)

One-Class SVM: {'precision': 0.5538954108858057, 'recall': 0.5451680672268907, 'f1': 0.5494970884065643, 'roc_auc': np.float64(0.7675801230613392), 'conf_matrix': [[124048, 1254], [1299, 1557]]}
Isolation Forest: {'precision': 0.3259817105970952, 'recall': 0.21218487394957983, 'f1': 0.25705196182396606, 'roc_auc': np.float64(0.6010925167819758), 'conf_matrix': [[124049, 1253], [2250, 606]]}
Elliptic Envelope: {'precision': 0.4388813712223726, 'recall': 0.34068627450980393, 'f1': 0.3835994480583481, 'roc_auc': np.float64(0.6653791302957155), 'conf_matrix': [[124058, 1244], [1883, 973]]}
Local Outlier Factor: {'precision': 0.5539285714285714, 'recall': 0.5430672268907563, 'f1': 0.5484441301272984, 'roc_auc': np.float64(0.7665496546897317), 'conf_matrix': [[124053, 1249], [1305, 1551]]}
