In [1]:
from models.iforest import IsolationForest
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score
from util.utils import find_TPR_threshold
from sklearn.ensemble import IsolationForest as skIsolationForest
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt

In [2]:
# Experiment configuration shared by the custom and the sklearn runs below
seed = 42          # random_state for the train/test split and the sklearn forest
test_size = 0.7    # fraction of rows held out for testing (70% test, 30% train)
n_estimators = 50  # number of trees requested from each forest

In [3]:
# Print functions
def print_stats(X, y, name):
    """Print a short summary of a feature matrix and its binary labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; only ``shape`` and ``isnull()`` are used.
    y : pandas.Series
        Labels; entries equal to 1 are counted as anomalies.
    name : str
        Dataset name printed in the header line.
    """
    # Count label 1 directly: ``y.value_counts()[1]`` raises KeyError when a
    # split happens to contain no anomalies at all.
    n_anomalies = int((y == 1).sum())
    # An empty label vector has no meaningful rate; report NaN instead of
    # raising ZeroDivisionError.
    rate = n_anomalies / len(y) * 100 if len(y) else float('nan')

    print(f'\nDataset: {name}')
    print(f'Dataset shape: {X.shape}')
    print(f'Anomalies in dataset: {n_anomalies}')
    print(f'Contains NaN: {X.isnull().values.any()}')
    print(f'Contamination rate: {rate:0.3f}%')

def print_metrics(y, y_pred, y_score=None):
    """Print confusion matrix, classification report, ROC AUC and PR AUC.

    Parameters
    ----------
    y : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Hard 0/1 predictions; used for the confusion matrix and the
        classification report.
    y_score : array-like, optional
        Continuous anomaly scores (higher = more anomalous). When given, the
        ROC AUC and PR AUC are computed from these scores. When omitted, the
        hard predictions are used, as before.
    """
    # ROC AUC and average precision are ranking metrics. Feeding them hard
    # labels collapses the curve to a single operating point, so prefer the
    # continuous scores whenever the caller can supply them.
    ranking = y_pred if y_score is None else y_score

    print(f'Confusion matrix:\n{confusion_matrix(y, y_pred)}')
    print(f'Classification report:\n{classification_report(y, y_pred)}')
    print(f'ROC AUC score: {roc_auc_score(y, ranking):.5f}')
    print(f'PR AUC score: {average_precision_score(y, ranking):.5f}')

In [4]:
# Read the HTTP dataset: column '3' holds the labels, every other column is a feature
data = pd.read_csv('../datasets/HTTP.csv')
y = data['3']
X = data.drop(columns='3')

# Hold out `test_size` of the rows for evaluation; `seed` makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Summarise the full dataset and both partitions
print_stats(X, y, 'HTTP')
print_stats(X_train, y_train, 'HTTP Train')
print_stats(X_test, y_test, 'HTTP Test')


Dataset: HTTP
Dataset shape: (567497, 3)
Anomalies in dataset: 2211
Contains NaN: False
Contamination rate: 0.390%

Dataset: HTTP Train
Dataset shape: (170249, 3)
Anomalies in dataset: 686
Contains NaN: False
Contamination rate: 0.403%

Dataset: HTTP Test
Dataset shape: (397248, 3)
Anomalies in dataset: 1525
Contains NaN: False
Contamination rate: 0.384%


In [5]:
# Train Isolation Forest (project implementation) on the training split
# NOTE(review): no seed is passed here, unlike the sklearn run below — confirm
# whether models.iforest accepts one so this result is repeatable.
iforest = IsolationForest(sample_size=X_train.shape[0], n_trees=n_estimators)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
iforest.fit(X_train)
end_time = time.perf_counter()
print(f'Training time: {end_time - start_time:0.3f}s')

Training time: 11.335s


In [6]:
# Score the test split, pick a cut-off, then binarise the scores
scores = iforest.anomaly_score(X_test)
# NOTE(review): the threshold is derived from y_test itself, so the metrics
# below are tuned on the evaluation labels. 0.8 is presumably the target TPR —
# confirm against util.utils.find_TPR_threshold.
threshold, FPR = find_TPR_threshold(y_test, scores, 0.8)
y_pred = iforest.predict_from_anomaly_scores(scores, threshold)

print(f'Threshold: {threshold:0.3f}', f'FPR: {FPR:0.3f}', sep='\n')

Threshold: 0.607
FPR: 0.000


In [7]:
# Evaluate the custom forest's thresholded predictions against the test labels
print_metrics(y=y_test, y_pred=y_pred)

Confusion matrix:
[[395603    120]
 [     6   1519]]
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    395723
           1       0.93      1.00      0.96      1525

    accuracy                           1.00    397248
   macro avg       0.96      1.00      0.98    397248
weighted avg       1.00      1.00      1.00    397248

ROC AUC score: 0.99788
PR AUC score: 0.92315


In [8]:
# Compare with sklearn
# max_samples=1.0 lets every tree draw from the whole training split, matching
# sample_size=X_train.shape[0] used for the custom forest.
# NOTE(review): n_jobs=-1 trains on all cores — confirm the custom forest is
# parallelised too before reading the two timings as like-for-like.
sk_iforest = skIsolationForest(n_estimators=n_estimators, max_samples=1.0, n_jobs=-1, random_state=seed)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
sk_iforest.fit(X_train)
end_time = time.perf_counter()
print(f'Sklearn training time: {end_time - start_time:0.3f}s')

Sklearn training time: 0.863s


In [9]:
# Predict
# sklearn's predict() returns -1 for outliers and +1 for inliers; map that to
# the 1 = anomaly / 0 = normal convention of y_test. The comparison is
# vectorised instead of looping over every test row in Python.
# NOTE(review): predict() applies sklearn's own cut-off, whereas the custom
# forest's threshold came from find_TPR_threshold, so the two confusion
# matrices are taken at different operating points.
sk_y_pred = (sk_iforest.predict(X_test) == -1).astype(int)

In [10]:
# Evaluate sklearn's predictions against the same test labels
print_metrics(y=y_test, y_pred=sk_y_pred)

Confusion matrix:
[[395085    638]
 [     4   1521]]
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    395723
           1       0.70      1.00      0.83      1525

    accuracy                           1.00    397248
   macro avg       0.85      1.00      0.91    397248
weighted avg       1.00      1.00      1.00    397248

ROC AUC score: 0.99788
PR AUC score: 0.70266
