In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from datetime import datetime

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('./datasets')
sys.path.append('./models')

from UNSW_NB15 import UNSWNB15Dataset
from KDDCUP99 import KDDCUP99Dataset
from CICIDS2017 import CICIDS2017Dataset

from StandardAE import StandardAE
from BAE import BAE

In [2]:
# Preprocessing handed to every dataset below as `transformer`:
# median imputation followed by standardisation, both configured to emit
# pandas objects.
# NOTE(review): this single Pipeline instance is passed to all dataset
# constructors — confirm none of them relies on state fitted on another dataset.
median_imputer = SimpleImputer(strategy='median').set_output(transform='pandas')
standard_scaler = StandardScaler().set_output(transform='pandas')

pipeline = Pipeline([
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

In [7]:
# KDD Cup 99: one dataset object per split, all read from the same
# preprocessed directory and all given the shared `pipeline` as transformer.
data_dir = Path("../data/KDDCUP99/preprocessed")
KDDCUP99 = {
    split: KDDCUP99Dataset(data_dir, type=split, transformer=pipeline)
    for split in ("train", "val", "test")
}



Loaded train data with 912188 samples.
Data shape: (912188, 125)
num_anomalies: 0
num_normal: 912188
anomaly ratio: 0.0

Loaded val data with 311029 samples.
Data shape: (311029, 125)
num_anomalies: 250436
num_normal: 60593
anomaly ratio: 0.8051853685669182

Loaded test data with 311029 samples.
Data shape: (311029, 125)
num_anomalies: 250436
num_normal: 60593
anomaly ratio: 0.8051853685669182


In [9]:
# CIC-IDS2017: one dataset object per split, all read from the same
# preprocessed directory and all given the shared `pipeline` as transformer.
data_dir = Path("../data/CIC-IDS2017/preprocessed")
CICIDS2017 = {
    split: CICIDS2017Dataset(data_dir, type=split, transformer=pipeline)
    for split in ("train", "val", "test")
}


Loaded train data with 1715451 samples.
Data shape: (1715451, 80)
num_anomalies: 0
num_normal: 1715451
anomaly ratio: 0.0
columns with missing values:
['Flow Bytes/s', 'Flow Packets/s']
columns with infinity values:
[]

Loaded val data with 557646 samples.
Data shape: (557646, 80)
num_anomalies: 278823
num_normal: 278823
anomaly ratio: 0.5
columns with missing values:
['Flow Bytes/s', 'Flow Packets/s']
columns with infinity values:
[]

Loaded test data with 557646 samples.
Data shape: (557646, 80)
num_anomalies: 278823
num_normal: 278823
anomaly ratio: 0.5
columns with missing values:
['Flow Bytes/s', 'Flow Packets/s']
columns with infinity values:
[]


In [3]:
# UNSW-NB15: one dataset object per split, all read from the same
# preprocessed directory and all given the shared `pipeline` as transformer.
data_dir = Path("../data/UNSW-NB15/preprocessed")
UNSWNB15 = {
    split: UNSWNB15Dataset(data_dir, type=split, transformer=pipeline)
    for split in ("train", "val", "test")
}


Loaded train data with 46000 samples.
Data shape: (46000, 196)
num_anomalies: 0
num_normal: 46000
anomaly ratio: 0.0

Loaded val data with 22252 samples.
Data shape: (22252, 196)
num_anomalies: 12252
num_normal: 10000
anomaly ratio: 0.5506021930612979

Loaded test data with 82332 samples.
Data shape: (82332, 196)
num_anomalies: 45332
num_normal: 37000
anomaly ratio: 0.5506000097167566


In [4]:
def run_experiment(model, 
                   dataset,
                   max_epochs=10,
                   experiment_name="undefined",
                   run_name="undefined",
                   dataset_name="undefined",
                   save_model=False,
                   accelerator='gpu',
                   batch_size=1024,
                   num_workers=1,
                   persistent_workers=False):
    """Configure, train and optionally save a model on one dataset.

    Parameters
    ----------
    model
        Object exposing ``set_tech_params(...)``, ``fit(...)`` and, when
        ``save_model`` is true, ``save(path)``.
    dataset
        Mapping with at least the keys ``'train'`` and ``'val'``; both values
        are forwarded to ``model.fit``.
    max_epochs
        Forwarded to ``model.fit``.
    experiment_name, run_name
        Forwarded to the logger via ``logger_params`` and also used to build
        the directory the model is saved into.
    dataset_name
        Stored in the logger tags under the key ``"dataset"``.
    save_model
        When true, the fitted model is written below
        ``../saved_models/<experiment_name>/<run_name>/``.
    accelerator, batch_size, num_workers, persistent_workers
        Forwarded unchanged to ``model.set_tech_params``. The defaults are the
        values that used to be hard-coded here.

    Returns
    -------
    pathlib.Path or None
        Path passed to ``model.save`` when ``save_model`` is true, otherwise
        ``None``.
    """
    model.set_tech_params(
        accelerator=accelerator,
        batch_size=batch_size,
        num_workers=num_workers,
        persistent_workers=persistent_workers,
    )

    model.fit(
        dataset['train'],
        dataset['val'],
        max_epochs=max_epochs,
        log=True,
        logger_params={
            "experiment_name": experiment_name,
            "run_name": run_name,
            "log_model": False,
            "tags": {"dataset": dataset_name},
        },
    )

    if not save_model:
        return None

    save_dir = Path(f"../saved_models/{experiment_name}/{run_name}")
    # Create the target directory up front so that saving does not depend on
    # model.save() creating missing parent directories itself.
    save_dir.mkdir(parents=True, exist_ok=True)

    file_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{model.__class__.__name__}"
    # Bump the numeric suffix until the name is free, so two saves within the
    # same second do not overwrite each other.
    file_num = 0
    while (save_dir / f"{file_name}_{file_num}.pt").exists():
        file_num += 1

    save_path = save_dir / f"{file_name}_{file_num}.pt"
    model.save(save_path)
    return save_path

In [5]:
# Input width taken from element 0 of the first training item.
# (assumes each dataset item is indexable with the feature vector first — TODO confirm)
first_train_item = UNSWNB15['train'][0]
UNSWNB15_input_size = first_train_item[0].shape[0]

# Settings given identically to the plain autoencoder and to BAE.
shared_ae_settings = dict(
    dropout=False,
    initial_lr=2e-3,
    linear_lr_start_factor=1,
    linear_lr_end_factor=0.03,
    linear_lr_total_iters=25,
)

# BIRCH-related settings, only passed to BAE.
birch_settings = dict(
    birch_threshold=0.1,
    birch_branching_factor=50,
    birch_n_clusters=3,
    birch_fit_sample_size=10000,
    birch_fit_quantile=0.99,
)

# NOTE(review): StandardAE_model is built here but not used by any visible cell.
StandardAE_model = StandardAE(
    input_size=UNSWNB15_input_size,
    hidden_sizes=[128, 32, 8],
    **shared_ae_settings,
)

BAE_model = BAE(
    **birch_settings,
    base_model=StandardAE,
    input_size=UNSWNB15_input_size,
    hidden_sizes=[128, 32, 8],
    **shared_ae_settings,
)

# Short 3-epoch run of BAE on UNSW-NB15; the fitted model is saved to disk.
run_experiment(
    model=BAE_model,
    dataset=UNSWNB15,
    max_epochs=3,
    experiment_name="IADModel test",
    run_name="BAE test v3",
    dataset_name="UNSW-NB15",
    save_model=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Experiment with name IADModel test submodels not found. Creating it.
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory bin_for_redundant_checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 29.4 K | train
1 | decoder | Sequential | 29.5 K | train
-----------------------------------------------
58.9 K    Trainable params
0         Non-trainable params
58.9 K    Total params
0.236     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (32) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
The tag mlflow.runName is found in tags. The value will be overridden by BAE test v2 cluster_1.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 29.4 K | train
1 | decoder | Sequential | 29.5 K | train
-----------------------------------------------
58.9 K    Trainable params
0         Non-trainable params
58.9 K    Total params
0.236     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
The tag mlflow.runName is found in tags. The value will be overridden by BAE test v2 cluster_2.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 29.4 K | train
1 | decoder | Sequential | 29.5 K | train
-----------------------------------------------
58.9 K    Trainable params
0         Non-trainable params
58.9 K    Total params
0.236     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


In [6]:
# Run the fitted BAE model's raw prediction on the UNSW-NB15 validation split.
# The result is kept in `anomaly_scores`; no later visible cell reads it.
anomaly_scores = BAE_model.predict_raw(UNSWNB15['val'])

In [7]:
# Evaluate the in-memory BAE model on the validation split.
# Presumably the baseline to compare against `metrics_b` from the reloaded model.
metrics_a = BAE_model.evaluate(UNSWNB15['val'])

In [8]:
# Reload a BAE checkpoint that was written to disk earlier.
# NOTE(review): the path points at run "BAE test v2" with a fixed timestamp,
# while the training cell in this notebook uses run_name "BAE test v3" —
# confirm this is the checkpoint you intend to compare against.
checkpoint_path = Path("../saved_models/IADModel test/BAE test v2/2025-04-25_12-41-28_BAE_0.pt")
loaded_model = BAE_model.load(checkpoint_path)

  checkpoint = torch.load(path)
  checkpoint = torch.load(path)


In [9]:
# Evaluate the reloaded model on the same validation split used for `metrics_a`.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'int' object has no attribute 'to'", so the reloaded model
# is not usable for evaluation as-is. The cause is not visible in this
# notebook; the model's save/load code is the likely place to look.
metrics_b = loaded_model.evaluate(UNSWNB15['val'])

AttributeError: 'int' object has no attribute 'to'