In [1]:
from models.iforest import IsolationForest
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from util.utils import find_TPR_threshold
from sklearn.ensemble import IsolationForest as skIsolationForest
import time

In [2]:
# Load data
data = pd.read_csv('../datasets/HTTP.csv')
X = data.drop('3', axis=1)
y = data['3']

print(f'Dataset: HTTP')
print(f'Dataset shape: {X.shape}')
print(f'Anomalies in dataset: {y.value_counts()[1]}')

Dataset: HTTP
Dataset shape: (567497, 3)
Anomalies in dataset: 2211


In [3]:
# Train Isolation Forest
clf = IsolationForest(sample_size=256, n_trees=100)
start_time = time.time()
clf.fit(X)
end_time = time.time()
print(f'Training time: {end_time - start_time}')

Training time: 0.036493778228759766


In [4]:
# Predict
scores = clf.anomaly_score(X)
threshold, FPR = find_TPR_threshold(y, scores, 0.8)
y_pred = clf.predict_from_anomaly_scores(scores, threshold)

print(f'Confusion matrix:\n {confusion_matrix(y, y_pred)}')
print(f'Classification report:\n {classification_report(y, y_pred)}')
print(f'ROC AUC score: {roc_auc_score(y, y_pred)}')
print(f'Threshold: {threshold}')
print(f'FPR: {FPR}')

Confusion matrix:
 [[564715    571]
 [    55   2156]]
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    565286
           1       0.79      0.98      0.87      2211

    accuracy                           1.00    567497
   macro avg       0.90      0.99      0.94    567497
weighted avg       1.00      1.00      1.00    567497

ROC AUC score: 0.9870571349759061
Threshold: 0.8449999999999999
FPR: 0.001010108157640557


In [5]:
# Compare with sklearn
sk_clf = skIsolationForest(contamination=0.2, n_estimators=100, max_samples=256, n_jobs=-1, random_state=42)
start_time = time.time()
sk_clf.fit(X)
end_time = time.time()
print(f'Sklearn training time: {end_time - start_time}')

Sklearn training time: 2.2484138011932373


In [6]:
# Predict
sk_scores = sk_clf.decision_function(X).reshape(-1, 1)
sk_threshold, sk_FPR = find_TPR_threshold(y, sk_scores, 0.8)
sk_y_pred = [1 if score >= sk_threshold else 0 for score in sk_scores]

print(f'Confusion matrix:\n {confusion_matrix(y, sk_y_pred)}')
print(f'Classification report:\n {classification_report(y, sk_y_pred)}')
print(f'ROC AUC score: {roc_auc_score(y, sk_y_pred)}')
print(f'Threshold: {sk_threshold}')
print(f'FPR: {sk_FPR}')

Confusion matrix:
 [[111288 453998]
 [  2211      0]]
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.20      0.33    565286
           1       0.00      0.00      0.00      2211

    accuracy                           0.20    567497
   macro avg       0.49      0.10      0.16    567497
weighted avg       0.98      0.20      0.33    567497

ROC AUC score: 0.09843512841287422
Threshold: -8.81239525796218e-16
FPR: 0.8009856957363175
