In [2]:
from models.iforest import IsolationForest
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score
from util.utils import find_TPR_threshold
from sklearn.ensemble import IsolationForest as skIsolationForest
import time
import os
import numpy as np
from data.cicids import preprocess

In [3]:
# Load data
# Every CSV in the CIC-IDS-17 directory becomes one (features, labels, filename)
# entry in `datasets`. Labels are binary: 0 for 'BENIGN', 1 for anything else.
data_dir = '../datasets/CIC-IDS-17'
# os.listdir returns entries in arbitrary order and may contain non-CSV files
# (e.g. editor or OS metadata); sort and filter so runs are reproducible and
# read_csv is only ever handed actual CSVs.
filenames = sorted(name for name in os.listdir(data_dir) if name.lower().endswith('.csv'))
datasets = []
for filename in filenames:
    data = pd.read_csv(f'{data_dir}/{filename}')
    data = preprocess(data)

    X = data.drop(' Label', axis=1)
    # Vectorised label encoding; the result keeps a fresh positional index.
    y = pd.Series(np.where(data[' Label'] == 'BENIGN', 0, 1))
    datasets.append((X, y, filename))

    # .get(..., 0) keeps the summary working when a class is absent from a file.
    counts = y.value_counts()
    print(f'Dataset: {filename}')
    print(f'Dataset shape: {X.shape}')
    print(f'Benign samples: {counts.get(0, 0)}')
    print(f'Anomalies in dataset: {counts.get(1, 0)}')
    print(f'Contamination rate: {counts.get(1, 0) / y.count() * 100:.4f} %\n')

if not datasets:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f'No CSV files found in {data_dir}')

# ignore_index=True gives features and labels the same fresh positional index,
# avoiding duplicate index labels across the concatenated files.
complete_dataset = (
    pd.concat([X for X, _, _ in datasets], ignore_index=True),
    pd.concat([y for _, y, _ in datasets], ignore_index=True),
)
complete_counts = complete_dataset[1].value_counts()
print(f'Complete dataset shape: {complete_dataset[0].shape}')
print(f'Benign samples: {complete_counts.get(0, 0)}')
print(f'Anomalies in dataset: {complete_counts.get(1, 0)}')
print(f'Contamination rate: {complete_counts.get(1, 0) / complete_dataset[1].count() * 100:.4f} %\n')

Preprocessing the data...
Data preprocessing done.

Dataset: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Dataset shape: (225745, 70)
Benign samples: 97718
Anomalies in dataset: 128027
Contamination rate: 56.7131 %

Preprocessing the data...
Data preprocessing done.

Dataset: Wednesday-workingHours.pcap_ISCX.csv
Dataset shape: (692703, 70)
Benign samples: 440031
Anomalies in dataset: 252672
Contamination rate: 36.4762 %

Preprocessing the data...
Data preprocessing done.

Dataset: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Dataset shape: (170366, 70)
Benign samples: 168186
Anomalies in dataset: 2180
Contamination rate: 1.2796 %

Preprocessing the data...
Data preprocessing done.

Dataset: Tuesday-WorkingHours.pcap_ISCX.csv
Dataset shape: (445909, 70)
Benign samples: 432074
Anomalies in dataset: 13835
Contamination rate: 3.1027 %

Preprocessing the data...
Data preprocessing done.

Dataset: Monday-WorkingHours.pcap_ISCX.csv
Dataset shape: (529918, 70)
Benign samples: 529

In [None]:
# Train Isolation Forest
# One forest is fitted per entry in `datasets`; each fitted model is stored
# together with its filename in `clf_forests`.
clf_forests = []
sample_size = 50000
n_trees = 300
for X, y, filename in datasets:
    print(f'Training Isolation Forest on {filename}')
    clf = IsolationForest(sample_size, n_trees)
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    clf.fit(X, improved=False)
    # Stop the timer right after fit so only training is measured.
    end_time = time.perf_counter()
    clf_forests.append((clf, filename))
    print(f'{sample_size=}, {n_trees=}, training time: {end_time - start_time:.3f}s\n')

In [None]:
import matplotlib.pyplot as plt

# Evaluate every per-file forest on the dataset it is paired with.
for (clf, filename), (X, y, _) in zip(clf_forests, datasets):
    # .get(..., 0) keeps the summary working when a class is absent from a file.
    counts = y.value_counts()
    print(f'Predicting on {filename}')
    print(f'Dataset shape: {X.shape}')
    print(f'Benign samples: {counts.get(0, 0)}')
    print(f'Anomalies in dataset: {counts.get(1, 0)}')
    print(f'Contamination rate: {counts.get(1, 0) / y.count() * 100:.4f} %\n')

    # A TPR-based threshold and ROC AUC are undefined unless both classes occur.
    if y.nunique() < 2:
        print('Skipping evaluation: labels contain a single class.\n')
        continue

    # Predict
    start_time = time.perf_counter()
    scores = clf.anomaly_score(X)
    threshold, FPR = find_TPR_threshold(y, scores, 0.9)
    y_pred = clf.predict_from_anomaly_scores(scores, threshold)
    end_time = time.perf_counter()

    # Metrics
    # One figure per dataset so the histograms are not stacked on a shared axes.
    fig, ax = plt.subplots()
    ax.hist(np.ravel(scores), bins=30)
    ax.set(title=f'Anomaly scores: {filename}', xlabel='Anomaly score', ylabel='Samples')
    plt.show()

    print(f'Prediction time: {end_time - start_time:.3f}s')
    print('Predictions: ')
    print(pd.Series(y_pred).value_counts())
    print(confusion_matrix(y, y_pred))
    print(classification_report(y, y_pred))
    # ROC AUC must be computed from continuous scores; hard 0/1 predictions
    # collapse the curve to a single operating point.
    # NOTE(review): assumes a higher anomaly score means "more anomalous" -
    # confirm against predict_from_anomaly_scores.
    print(f'ROC AUC: {roc_auc_score(y, np.ravel(scores)):.4f}')
    print(f'F1 Score: {f1_score(y, y_pred):.4f}')
    print(f'Threshold: {threshold:.4f}')
    print(f'FPR: {FPR}\n')

In [None]:
# Train and evaluate a single forest on the concatenation of all files.
# `complete_dataset` is a (features, labels) tuple, so the shape has to be read
# from the feature frame rather than from the tuple itself.
complete_counts = complete_dataset[1].value_counts()
print('Training on complete dataset')
print(f'Dataset shape: {complete_dataset[0].shape}')
print(f'Benign samples: {complete_counts.get(0, 0)}')
print(f'Anomalies in dataset: {complete_counts.get(1, 0)}')
print(f'Contamination rate: {complete_counts.get(1, 0) / complete_dataset[1].count() * 100:.4f} %\n')

# Train Isolation Forest
# Reuses sample_size / n_trees from the per-file training cell; the tree count
# is named n_trees in this notebook (n_estimators was never defined).
clf_complete = IsolationForest(sample_size, n_trees)
start_time = time.perf_counter()
clf_complete.fit(complete_dataset[0], improved=False)
end_time = time.perf_counter()
print(f'{sample_size=}, {n_trees=}, training time: {end_time - start_time:.3f}s\n')

print('Predicting on complete dataset')
start_time = time.perf_counter()
scores = clf_complete.anomaly_score(complete_dataset[0])
threshold, FPR = find_TPR_threshold(complete_dataset[1], scores, 0.9)
y_pred = clf_complete.predict_from_anomaly_scores(scores, threshold)
end_time = time.perf_counter()

# Metrics
fig, ax = plt.subplots()
ax.hist(np.ravel(scores), bins=30)
ax.set(title='Anomaly scores: Complete dataset', xlabel='Anomaly score', ylabel='Samples')
plt.show()

print(f'Prediction time: {end_time - start_time:.3f}s')
print('Predictions: ')
print(pd.Series(y_pred).value_counts())
print(confusion_matrix(complete_dataset[1], y_pred))
print(classification_report(complete_dataset[1], y_pred))
# ROC AUC must be computed from continuous scores; hard 0/1 predictions
# collapse the curve to a single operating point.
# NOTE(review): assumes a higher anomaly score means "more anomalous" -
# confirm against predict_from_anomaly_scores.
print(f'ROC AUC: {roc_auc_score(complete_dataset[1], np.ravel(scores)):.4f}')
print(f'F1 Score: {f1_score(complete_dataset[1], y_pred):.4f}')
print(f'Threshold: {threshold:.4f}')
print(f'FPR: {FPR}\n')

In [4]:
# Compare with sklearn
# Baseline: fit scikit-learn's IsolationForest on the complete feature matrix
# and report how long the fit call takes.
sk_clf = skIsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.2,
    random_state=42,
    n_jobs=-1,
)
start_time = time.time()
sk_clf.fit(complete_dataset[0])
end_time = time.time()
print('Sklearn training time: {}'.format(end_time - start_time))

: 

In [None]:
# Predict
# Evaluate on the complete dataset the sklearn model was fitted on, instead of
# relying on whichever X / y were left behind by an earlier loop.
X_all, y_all = complete_dataset

# sklearn's decision_function is "the lower, the more abnormal"; negate it so
# larger values mean more anomalous, matching the `>= threshold` rule below.
sk_scores = -sk_clf.decision_function(X_all).reshape(-1, 1)
sk_threshold, sk_FPR = find_TPR_threshold(y_all, sk_scores, 0.8)
# Vectorised thresholding on the flattened scores (1 = anomaly, 0 = benign).
sk_y_pred = (sk_scores.ravel() >= sk_threshold).astype(int)

print(f'Confusion matrix: {confusion_matrix(y_all, sk_y_pred)}')
print(f'Classification report: {classification_report(y_all, sk_y_pred)}')
# ROC AUC is computed from the continuous scores, not the hard predictions.
print(f'ROC AUC score: {roc_auc_score(y_all, sk_scores.ravel())}')
print(f'Threshold: {sk_threshold}')
print(f'FPR: {sk_FPR}')