In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('./datasets')
sys.path.append('./models')
sys.path.append('./utils')

from utils.experiment_utils import CSVLogger, run_experiment, load_dataset, load_dataset_folds, run_cross_validation

from models.AE import AE
from models.BAE import BAE
from models.SAE import SAE
from models.CAE import CAE

In [2]:
# Shared preprocessing: fill missing values with the per-column median, then
# standardize with StandardScaler. Both steps are configured to emit pandas
# DataFrames from transform().
median_imputer = SimpleImputer(strategy='median')
median_imputer.set_output(transform='pandas')

standard_scaler = StandardScaler()
standard_scaler.set_output(transform='pandas')

pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

In [3]:
# Datasets that are currently not loaded here (kept for reference):
# KDDCUP99 = load_dataset("KDDCUP99", "../data/KDDCUP99/preprocessed/")
# CICIDS2017 = load_dataset("CICIDS2017", "../data/CIC-IDS2017/preprocessed/")
# UNSWNB15 = load_dataset("UNSW-NB15", "../data/UNSW-NB15/preprocessed/")


def _load_ctu13_scenario(scenario):
    """Load one preprocessed CTU-13 scenario through the shared `pipeline`.

    `scenario` is the two-digit scenario id as a string (e.g. "08"); it is
    used both in the dataset name and as the sub-directory to read from.
    """
    return load_dataset(
        f"CTU-13_{scenario}",
        f"../data/CTU-13/preprocessed/{scenario}",
        pipeline=pipeline,
    )


CTU13_08 = _load_ctu13_scenario("08")
CTU13_09 = _load_ctu13_scenario("09")
CTU13_10 = _load_ctu13_scenario("10")
CTU13_13 = _load_ctu13_scenario("13")


CTU-13_08
Loaded train data with 21000 samples.
Data shape: (21000, 69)
num_anomalies: 0
num_normal: 21000
anomaly ratio: 0.0

CTU-13_08
Loaded val data with 27932 samples.
Data shape: (27932, 69)
num_anomalies: 2021
num_normal: 25911
anomaly ratio: 0.07235428898754118

CTU-13_08
Loaded test data with 27932 samples.
Data shape: (27932, 69)
num_anomalies: 2021
num_normal: 25911
anomaly ratio: 0.07235428898754118

CTU-13_09
Loaded train data with 11986 samples.
Data shape: (11986, 56)
num_anomalies: 0
num_normal: 11986
anomaly ratio: 0.0

CTU-13_09
Loaded val data with 64486 samples.
Data shape: (64486, 56)
num_anomalies: 55496
num_normal: 8990
anomaly ratio: 0.8605898954811897

CTU-13_09
Loaded test data with 64486 samples.
Data shape: (64486, 56)
num_anomalies: 55496
num_normal: 8990
anomaly ratio: 0.8605898954811897

CTU-13_10
Loaded train data with 6338 samples.
Data shape: (6338, 75)
num_anomalies: 0
num_normal: 6338
anomaly ratio: 0.0

CTU-13_10
Loaded val data with 36660 samples.

In [5]:
# KDDCUP99 prepared for cross-validation: requested with kfolds=3 and the
# shared preprocessing pipeline. The result is indexed per fold further down
# (e.g. KDDCUP99_folds[0]['train']).
KDDCUP99_folds = load_dataset_folds(
    "KDDCUP99",
    "../data/KDDCUP99/preprocessed/",
    kfolds=3,
    pipeline=pipeline,
)


KDDCUP99
Loaded train data with 66982 samples.
Data shape: (66982, 77)
num_anomalies: 0
num_normal: 66982
anomaly ratio: 0.0

KDDCUP99
Loaded val data with 155514 samples.
Data shape: (155514, 77)
num_anomalies: 125218
num_normal: 30296
anomaly ratio: 0.8051879573543218

KDDCUP99
Loaded train data with 66982 samples.
Data shape: (66982, 77)
num_anomalies: 0
num_normal: 66982
anomaly ratio: 0.0

KDDCUP99
Loaded val data with 155514 samples.
Data shape: (155514, 77)
num_anomalies: 125218
num_normal: 30296
anomaly ratio: 0.8051879573543218

KDDCUP99
Loaded train data with 66982 samples.
Data shape: (66982, 77)
num_anomalies: 0
num_normal: 66982
anomaly ratio: 0.0

KDDCUP99
Loaded val data with 155514 samples.
Data shape: (155514, 77)
num_anomalies: 125218
num_normal: 30296
anomaly ratio: 0.8051879573543218


In [19]:
# Hyper-parameter sweep for the plain autoencoder (AE) on CTU-13 scenario 08.
# Each run trains for a single epoch and its result is appended to a CSV log.
datasets = [CTU13_08]
hidden_sizes = [[256, 64, 12], [1024, 256, 12]]
initial_lrs = [2e-3]

logger = CSVLogger("../out/logs/ae_experiments.csv")

# Every (dataset, layer widths, learning rate, repeat index) combination, in
# the order a nested loop over those four sequences would visit them.
ae_runs = [
    (dataset, widths, lr, repeat)
    for dataset in datasets
    for widths in hidden_sizes
    for lr in initial_lrs
    for repeat in range(1)
]

for folds, hidden_size, initial_lr, i in ae_runs:
    # Input width is read from the first element of the first training item.
    # NOTE(review): assumes that element is the feature vector — confirm
    # against the dataset class.
    input_size = folds['train'][0][0].shape[0]

    model = AE(
        input_size=input_size,
        hidden_sizes=hidden_size,
        initial_lr=initial_lr,
        linear_lr_start_factor=1,
        linear_lr_end_factor=0.03,
        linear_lr_total_iters=100,
    )

    result = run_experiment(
        model=model,
        dataset=folds,
        max_epochs=1,
        experiment_name=f"new {folds['train'].name} AE tests",
        run_name=f"hidden_sizes={hidden_size} lr={initial_lr} v{i}",
        save_model=False,
    )

    logger.log(result)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory bin_for_redundant_checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 34.6 K | train
1 | decoder | Sequential | 34.7 K | train
-----------------------------------------------
69.3 K    Trainable params
0         Non-trainable params
69.3 K    Total params
0.277     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (21) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory bin_for_redundant_checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 335 K  | train
1 | decoder | Sequential | 335 K  | train
-----------------------------------------------
670 K     Trainable params
0         Non-trainable params
670 K     Total params
2.681     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (21) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [8]:
# Cross-validated hyper-parameter sweep for the plain autoencoder (AE) on the
# KDDCUP99 folds; each result is appended to a CSV log.
#
# NOTE(review): the output saved with this cell is a TypeError about an
# unexpected `kfolds` keyword, which the call below does not pass, and the
# execution counts in this notebook are out of order. The saved output looks
# stale — restart the kernel and run all cells to confirm.
datasets = [KDDCUP99_folds]
hidden_sizes = [[256, 64, 12], [1024, 256, 12]]
initial_lrs = [2e-3]

logger = CSVLogger("../out/logs/kdd_experiments.csv")

# Every (fold set, layer widths, learning rate, repeat index) combination, in
# the order a nested loop over those four sequences would visit them.
cv_runs = [
    (fold_set, widths, lr, repeat)
    for fold_set in datasets
    for widths in hidden_sizes
    for lr in initial_lrs
    for repeat in range(1)
]

for folds, hidden_size, initial_lr, i in cv_runs:
    # Input width is read from the first element of the first training item
    # of fold 0.
    # NOTE(review): assumes that element is the feature vector and that all
    # folds share the same width — confirm.
    input_size = folds[0]['train'][0][0].shape[0]

    # NOTE(review): a single model instance is handed to the cross-validation
    # helper; whether it is re-initialised per fold is not visible here — confirm.
    model = AE(
        input_size=input_size,
        hidden_sizes=hidden_size,
        initial_lr=initial_lr,
        linear_lr_start_factor=1,
        linear_lr_end_factor=0.03,
        linear_lr_total_iters=100,
    )

    # Only the first element of the returned pair is logged.
    result, _ = run_cross_validation(
        model=model,
        dataset_folds=folds,
        max_epochs=10,
        experiment_name="CV test",
        run_name=f"hidden_sizes={hidden_size} lr={initial_lr} v{i}",
        save_model=False,
    )

    logger.log(result)

TypeError: run_cross_validation() got an unexpected keyword argument 'kfolds'