In [1]:
import numpy as np
from sklearn.ensemble import IsolationForest as IForest
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pickle
import torch
import torch.nn as nn
from global_var import *
from normalize import *
from utils import *
from data_load import load_data
from AE import AutoEncoder
from VAE import VAE
import ExtBound
import KITree
import importlib
import time

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Dataset / subset identifiers; the cells below use them to pick the data
# split, the normalizer file and the saved model file.
dataset, subset = 'cicids_improved', 'Wednesday'

In [4]:
# Load the train/eval split and the test split for the selected dataset/subset.
# Fix: the fourth return value used to be bound to the garbled name
# `y_testy_eval`, leaving `y_eval` undefined. By position it pairs with X_eval,
# so it is bound to `y_eval` here.
X_train, X_eval, y_train, y_eval = load_data(dataset, subset, mode='train')
y_testy_eval = y_eval  # legacy alias, kept in case another cell still uses the old name
X_test, y_test = load_data(dataset, subset, mode='test')

# NOTE(security): pickle.load can execute arbitrary code; only load normalizer
# files that were produced by this project.
with open(os.path.join(NORMALIZER_DIR, f'{dataset}_{subset}.norm'), 'rb') as f:
    normalizer = pickle.load(f)

# The same normalizer object is applied to every split.
X_train, X_eval = normalizer.transform(X_train), normalizer.transform(X_eval)
X_test = normalizer.transform(X_test)
X_train.shape, X_eval.shape, X_test.shape

((85519, 40), (62811, 40), (62811, 40))

In [9]:
# Draw 5000 training rows (with replacement, as randint does) for the tree fit below.
# Fix: the draw is now seeded with SEED (the seed also used for the IForest
# models in this notebook) so the subsample, and every result derived from it,
# is reproducible.
# NOTE: this cell overwrites X_train / y_train, so re-running it samples from
# the already-subsampled arrays; re-run the data-loading cell first.
idx_rand = np.random.RandomState(SEED).randint(0, X_train.shape[0], 5000)
X_train, y_train = X_train[idx_rand], y_train[idx_rand]

In [11]:
# Prefix of the saved model file that the next cell loads.
model_name = "IForest"

In [12]:
# Unpickle the saved model for this model_name / dataset / subset combination.
model_path = os.path.join(TARGET_MODEL_DIR, f'{model_name}_{dataset}_{subset}.model')
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [13]:
def func(x):
    """Return the negated `model.score_samples(x)` for the rows of `x`."""
    return -model.score_samples(x)


# Negated scores of the training rows, and the negated model offset, which is
# on the same (negated) scale as those scores.
score = func(X_train)
thres = -model.offset_

In [14]:
# Build the KITree from the scoring function and threshold, then fit it on the
# training rows and their precomputed scores.
kdt = KITree.KITree(func, thres)

# Fix: time the fit with perf_counter(), a monotonic clock intended for
# measuring intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted during this long-running fit.
start = time.perf_counter()
kdt.fit(X_train, score)
time.perf_counter() - start  # elapsed seconds, shown as the cell output

1863.7503445148468

In [16]:
def print_metrics(y_true, y_pred, y_model):
    """Print and return evaluation metrics for a set of predictions.

    Args:
        y_true: ground-truth labels; compared against 1 and 0 for the rates.
        y_pred: labels predicted by the model under evaluation.
        y_model: labels from a reference model, used only for `fid`.

    Returns:
        Tuple ``(prec, rec, f1, tpr, tnr, fid)`` where
        - prec / rec / f1 come from the sklearn metric functions,
        - tpr = TP(y_true, y_pred) / number of entries with y_true == 1,
        - tnr = TN(y_true, y_pred) / number of entries with y_true == 0,
        - fid = fraction of entries where y_pred equals y_model.

    NOTE(review): TP and TN are helpers defined outside this notebook;
    presumably they return counts -- confirm.
    """
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    n_agree = (y_pred == y_model).sum()
    fid = n_agree / y_pred.shape[0]

    n_pos = (y_true == 1).sum()
    tpr = TP(y_true, y_pred) / n_pos
    n_neg = (y_true == 0).sum()
    tnr = TN(y_true, y_pred) / n_neg

    metrics = (prec, rec, f1, tpr, tnr, fid)
    print('prec:', prec, 'rec:', rec, 'f1:', f1, 'tpr', tpr, 'tnr', tnr, 'fid:', fid)
    return metrics

In [15]:
# Predict the test rows with the fitted KITree, then print two ratios:
# TP(...) over the number of y_test == 1 entries, and FP(...) over the number
# of y_test == 0 entries.
# NOTE(review): TP / FP are helpers defined outside this notebook; presumably
# they return counts -- confirm.
y_pred = kdt.predict(X_test)
tpr_test = TP(y_test, y_pred) / (y_test == 1).sum()
fpr_test = FP(y_test, y_pred) / (y_test == 0).sum()
print(tpr_test, fpr_test)

1.0 0.030717921713182376


In [18]:
# Convert the model's predict() labels to the 0/1 convention used by y_pred:
# -1 becomes 1 and 1 becomes 0 (assumes predict() only emits 1 / -1).
raw_model_pred = model.predict(X_test)
y_model = np.where(raw_model_pred == -1, 1, 0)

# Score the KITree predictions against y_test and against the model's labels.
print_metrics(y_test, y_pred, y_model)

prec: 0.9751427516263742 rec: 1.0 f1: 0.9874149610942511 tpr 1.0 tnr 0.9692820782868177 fid: 0.9928197290283549


(0.9751427516263742,
 1.0,
 0.9874149610942511,
 1.0,
 0.9692820782868177,
 0.9928197290283549)

## How computational cost scales with dimension

In [21]:
# Candidate contamination rates tried by train_iforest:
# 0.02, 0.022, ..., 0.05 (the stop value 0.051 is exclusive).
FPR_list = np.arange(start=0.02, stop=0.051, step=0.002)

def train_iforest(X_train, X_eval, y_eval, fpr_list=None):
    """Fit one IForest per candidate contamination rate and keep the best one.

    Each candidate model is fitted on `X_train`, its predictions on `X_eval`
    are converted to 0/1 labels (-1 becomes 1, everything else 0) and scored
    with `f1_score` against `y_eval`. The first model reaching the highest F1
    is returned.

    Args:
        X_train: training samples passed to `model.fit`.
        X_eval: evaluation samples passed to `model.predict`.
        y_eval: 0/1 evaluation labels compared against the converted predictions.
        fpr_list: optional iterable of contamination values to try; defaults
            to the notebook-level `FPR_list`.

    Returns:
        The fitted model with the highest F1 on the evaluation data, or None
        only when there are no candidate rates at all.
    """
    candidate_rates = FPR_list if fpr_list is None else fpr_list

    # Fix: start below any attainable F1. With the previous initial value of 0
    # and the strict `>` comparison, a run in which every candidate scored
    # F1 == 0 returned None instead of a model.
    best_model, best_score = None, float('-inf')
    for rate in candidate_rates:
        model = IForest(n_estimators=500, contamination=rate, random_state=SEED)
        model.fit(X_train)
        # predict() labels are 1 / -1; map -1 to 1 and 1 to 0.
        y_pred = (model.predict(X_eval) == -1).astype(int)
        score = f1_score(y_eval, y_pred)
        if score > best_score:  # strict: ties keep the earlier candidate
            best_model, best_score = model, score
    return best_model