In [1]:
from models.iforest import IsolationForest
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score
from util.utils import find_TPR_threshold
from sklearn.ensemble import IsolationForest as skIsolationForest
import time
import os
import numpy as np

In [2]:
#Load data
def summarize_split(title, frame, label_col='label'):
    """Print a summary of one labelled split and separate features from labels.

    Parameters
    ----------
    title : str
        Heading printed before the summary lines.
    frame : pandas.DataFrame
        Split holding the feature columns plus ``label_col``.
    label_col : str
        Name of the label column. Rows equal to 0 are counted as benign and
        rows equal to 1 are counted as anomalies.

    Returns
    -------
    tuple
        ``(features, labels, contamination_pct)`` where ``features`` is
        ``frame`` without ``label_col``, ``labels`` is that column, and
        ``contamination_pct`` is the share of rows labelled 1 as a percentage.
    """
    print(title)
    print('Shape:', frame.shape)
    features = frame.drop(columns=[label_col])
    labels = frame[label_col]
    n_anomalies = (labels == 1).sum()
    # Same formula as before (count() ignores missing labels) so the printed
    # percentage is unchanged.
    contamination_pct = n_anomalies / labels.count() * 100
    print('Benign:', (labels == 0).sum())
    print('Anomalies:', n_anomalies)
    print('Contamination Rate:', contamination_pct)
    return features, labels, contamination_pct


train_data = pd.read_csv('../datasets/UNSW/UNSW_NB15_training-set.csv')
test_data = pd.read_csv('../datasets/UNSW/UNSW_NB15_testing-set.csv')

#Train data
X_train, y_train, train_contamination_rate = summarize_split('Training Data:-', train_data)
print('\n')
#Test data
X_test, y_test, test_contamination_rate = summarize_split('Testing Data:-', test_data)

# Previously this name was overwritten with the test-split value, so any later
# cell reading `contamination_rate` configured a model from test labels.
# Keep it bound to the training-split percentage instead.
contamination_rate = train_contamination_rate

Training Data:-
Shape: (175341, 45)
Benign: 56000
Anomalies: 119341
Contamination Rate: 68.06223302022916


Testing Data:-
Shape: (82332, 45)
Benign: 37000
Anomalies: 45332
Contamination Rate: 55.06000097167566


In [None]:
# Fit the project's IsolationForest on the training features and time the fit.
# Both hyper-parameters are kept as notebook-level names because later cells
# reuse them.
sample_size, n_trees = 256, 300
clf = IsolationForest(sample_size, n_trees)

# Wall-clock timing around fit() only (model construction is excluded).
start = time.time()
clf.fit(X_train)
end = time.time()

print(f'Sample Size: {sample_size}, N Estimators: {n_trees}, Training Time: {end - start}')

In [None]:
# Score the test split with the custom forest and derive thresholded predictions.
# The timed region covers scoring, threshold selection and prediction.
start = time.time()
scores = clf.anomaly_score(X_test)
# The helper is called with 0.9 and returns a threshold plus an FPR value;
# presumably 0.9 is the target true-positive rate -- confirm in util.utils.
threshold, FPR = find_TPR_threshold(y_test, scores, 0.9)
y_pred = clf.predict_from_anomaly_scores(scores, threshold)
end = time.time()

# Report: each section prints its header first and only then computes and
# prints the value, matching the original print order.
print('Prediction Time:', end - start)
report_sections = [
    ('Predictions: ', lambda: pd.Series(y_pred).value_counts()),
    ('Confusion Matrix:', lambda: confusion_matrix(y_test, y_pred)),
    ('Classification Report:', lambda: classification_report(y_test, y_pred)),
]
for header, compute in report_sections:
    print(header)
    print(compute())

# NOTE(review): ROC AUC is computed from the thresholded predictions rather
# than from the continuous `scores` -- confirm that this is intended.
print('ROC AUC Score:', roc_auc_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('Threshold:', threshold)

In [None]:
#Train Sklearn Isolation Forest
# scikit-learn accepts contamination='auto' or a fraction in (0, 0.5]. The
# notebook-level `contamination_rate` is a percentage, so passing it directly
# is rejected by parameter validation. Derive the fraction from the training
# labels here and clip it to the largest value scikit-learn allows.
train_anomaly_fraction = float((y_train == 1).sum() / y_train.count())
if train_anomaly_fraction > 0:
    sk_contamination = min(train_anomaly_fraction, 0.5)
else:
    # No labelled anomalies (or no labels at all): fall back to the default.
    sk_contamination = 'auto'

# max_samples is passed explicitly so the "Sample Size" printed below is the
# value the model really uses.
# NOTE(review): scikit-learn needs numeric features -- confirm X_train has no
# string-valued columns before running this cell.
clf = skIsolationForest(
    n_estimators=n_trees,
    max_samples=sample_size,
    contamination=sk_contamination,
)
start = time.time()
clf.fit(X_train)
end = time.time()
print(f'Sample Size: {sample_size}, N Estimators: {n_trees}, Training Time: {end - start}')

In [None]:
#Predict
# Score the test split with the scikit-learn forest. The timed region covers
# scoring, threshold selection and prediction.
start = time.time()
sk_scores = clf.decision_function(X_test).reshape(-1, 1)
# Use the scikit-learn scores here (previously the custom model's `scores`
# were passed, so the reported threshold belonged to the other model).
# NOTE(review): decision_function returns LOWER values for more abnormal
# samples. If find_TPR_threshold assumes higher = more anomalous, negate
# sk_scores before this call -- confirm against util.utils.
sk_threshold, sk_FPR = find_TPR_threshold(y_test, sk_scores, 0.9)
# scikit-learn predicts -1 for outliers and +1 for inliers; remap to the
# dataset's convention (1 = anomaly, 0 = benign). Previously the custom
# model's `y_pred` was remapped instead of these predictions.
sk_y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
end = time.time()

#Metrics
print('Prediction Time:', end - start)
print('Predictions: ')
print(pd.Series(sk_y_pred).value_counts())
print('Confusion Matrix:')
print(confusion_matrix(y_test, sk_y_pred))
print('Classification Report:')
print(classification_report(y_test, sk_y_pred))
# NOTE(review): ROC AUC is computed from the hard 0/1 predictions rather than
# from continuous scores -- confirm that this is intended.
print('ROC AUC Score:', roc_auc_score(y_test, sk_y_pred))
print('F1 Score:', f1_score(y_test, sk_y_pred))
print('Threshold:', sk_threshold)

