In [None]:
from models.iforest import IsolationForest
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, average_precision_score
from util.utils import find_TPR_threshold
from sklearn.ensemble import IsolationForest as skIsolationForest
import matplotlib.pyplot as plt
import time
import os
from dataset.cicids17 import CICIDS17

In [None]:
# Hyperparameters shared by the custom and the scikit-learn Isolation Forest cells.
n_estimators = 100      # trees per forest (custom `n_trees`, sklearn `n_estimators`)
max_samples = 50_000    # rows sub-sampled per tree (custom `sample_size`, sklearn `max_samples`)
test_size = 0.7         # Unused currently
seed = 42               # passed as `random_state` to the sklearn forest only

In [None]:
# Load data
# Instantiate the project's CICIDS17 dataset wrapper. The cells below iterate
# `dataset.datasets`, unpacking every entry as a 4-tuple (X, y, _, filename).
dataset = CICIDS17()

In [None]:
# Train Isolation Forest
# One custom forest is fitted per entry of `dataset.datasets`; each fitted model
# is stored together with the filename it was trained on.
clf_forests = []
for X, y, _, filename in dataset.datasets:
    print(f'Training Isolation Forest on {filename}')
    forest = IsolationForest(sample_size=max_samples, n_trees=n_estimators)
    fit_started = time.time()
    forest.fit(X, improved=False)
    clf_forests += [(forest, filename)]
    elapsed = time.time() - fit_started
    print(f'{max_samples=}, {n_estimators=}, training time: {elapsed:.3f}s\n')

In [None]:
# Evaluate every custom Isolation Forest on the dataset file it was trained on.
# The per-file score histograms share one axes so the files can be compared; the
# legend, title and axis labels are added once after the loop.
fig, ax = plt.subplots()
for (iforest, filename), (X, y, _, _) in zip(clf_forests, dataset.datasets):
    # Count labels once; `.get(..., 0)` avoids a KeyError when a file lacks a class.
    label_counts = y.value_counts()
    n_benign = label_counts.get(0, 0)
    n_anomalies = label_counts.get(1, 0)

    print(f'Predicting on {filename}')
    print(f'Dataset shape: {X.shape}')
    print(f'Benign samples: {n_benign}')
    print(f'Anomalies in dataset: {n_anomalies}')
    print(f'Contamination rate: {n_anomalies / y.count() * 100:.4f} %\n')

    # Predict: score every row, let find_TPR_threshold choose the cut-off for the
    # requested 0.5 target, then convert the scores into hard labels.
    # The reported prediction time covers all three steps.
    start_time = time.time()
    scores = iforest.anomaly_score(X)
    threshold, FPR = find_TPR_threshold(y, scores, 0.5)
    y_pred = iforest.predict_from_anomaly_scores(scores, threshold)
    end_time = time.time()

    # Metrics
    ax.hist(scores, bins=30, alpha=0.6, label=f'{filename}')
    print(f'Prediction time: {end_time - start_time:.3f}s')
    print('Predictions: ')
    print(pd.Series(y_pred).value_counts())
    print(confusion_matrix(y, y_pred))
    print(classification_report(y, y_pred))
    # Ranking metrics need the continuous scores; feeding them the thresholded
    # labels collapses the curve to a single operating point.
    # NOTE(review): assumes a higher anomaly_score means "more anomalous" (label 1)
    # -- confirm against models.iforest.
    print(f'ROC AUC: {roc_auc_score(y, scores):.4f}')
    print(f'PR AUC: {average_precision_score(y, scores):.4f}')
    print(f'F1 Score: {f1_score(y, y_pred):.4f}')
    print(f'Threshold: {threshold:.4f}\n')

ax.set(
    title='Anomaly score distribution per dataset file',
    xlabel='Anomaly score',
    ylabel='Number of samples',
)
# Only draw a legend when at least one histogram was plotted.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
plt.show()

In [None]:
# Train using sklearn
# Same loop as the custom-model cell, but fitting scikit-learn's IsolationForest
# with the shared hyperparameters and a fixed random_state.
sk_iforests = []
for X, y, _, filename in dataset.datasets:
    print(f'Training Isolation Forest on {filename}')
    sk_forest = skIsolationForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination='auto',
        n_jobs=-1,
        random_state=seed,
    )
    fit_started = time.time()
    sk_forest.fit(X)
    sk_iforests += [(sk_forest, filename)]
    elapsed = time.time() - fit_started
    print(f'{n_estimators=}, {max_samples=}, training time: {elapsed:.3f}s\n')

In [None]:
# Predict using sklearn
# Evaluate every scikit-learn IsolationForest on the dataset file it was fitted on.
for (iforest, filename), (X, y, _, _) in zip(sk_iforests, dataset.datasets):
    # Count labels once; `.get(..., 0)` avoids a KeyError when a file lacks a class.
    label_counts = y.value_counts()
    n_benign = label_counts.get(0, 0)
    n_anomalies = label_counts.get(1, 0)

    print(f'Predicting on {filename}')
    print(f'Dataset shape: {X.shape}')
    print(f'Benign samples: {n_benign}')
    print(f'Anomalies in dataset: {n_anomalies}')
    print(f'Contamination rate: {n_anomalies / y.count() * 100:.4f} %\n')

    # Predict: score once and derive the labels from the scores.
    # sklearn's predict() flags a row as an outlier when
    # decision_function = score_samples - offset_ is negative, so this yields the
    # same labels (1 = anomaly, 0 = benign) without scoring the data twice.
    start_time = time.time()
    raw_scores = iforest.score_samples(X)
    y_pred = ((raw_scores - iforest.offset_) < 0).astype(int)
    end_time = time.time()

    # score_samples is "lower = more abnormal"; negate it so that larger values
    # rank towards the anomaly class (label 1), as the ranking metrics expect.
    anomaly_scores = -raw_scores
    # Cut-off expressed on the negated scale, taken from this fitted model rather
    # than from a variable left behind by another cell.
    sk_threshold = -iforest.offset_

    # Metrics
    print(f'Prediction time: {end_time - start_time:.3f}s')
    print('Predictions: ')
    print(pd.Series(y_pred).value_counts())
    print(confusion_matrix(y, y_pred))
    print(classification_report(y, y_pred))
    # Ranking metrics use the continuous scores, not the thresholded labels.
    print(f'ROC AUC: {roc_auc_score(y, anomaly_scores):.4f}')
    print(f'PR AUC: {average_precision_score(y, anomaly_scores):.4f}')
    print(f'F1 Score: {f1_score(y, y_pred):.4f}')
    print(f'Threshold: {sk_threshold:.4f}\n')