In [1]:
# Test dataset loading

In [1]:
from pathlib import Path

from torch.utils.data import DataLoader

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import lightning as L
from lightning.pytorch.loggers import MLFlowLogger

import pandas as pd

import numpy as np

In [2]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# automatically before each cell executes, so edits to imported .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make the project-local dataset and model modules importable.
# NOTE(review): both paths are relative, so they resolve against the kernel's
# current working directory — confirm the notebook is started from the
# directory that contains `datasets/` and `models/`.
import sys
sys.path.append('./datasets')
sys.path.append('./models')

# Dataset helpers for UNSW-NB15 (loading and train/val/test splitting).
from UNSW_NB15 import UNSWNB15Dataset
from UNSW_NB15 import load_UNSWNB15Dataset, split_UNSWNB15Dataset
# Autoencoder variants; presumably they differ in capacity as the names suggest — confirm.
from StandardAE_Small import StandardAE_Small
from StandardAE_Medium import StandardAE_Medium
from StandardAE_Large import StandardAE_Large
from StandardAE_XS import StandardAE_XS
# Wrapper model that is configured below with BIRCH parameters and a base autoencoder class.
from BAE import BAE

In [4]:
# Location of the preprocessed UNSW-NB15 files, relative to the notebook.
data_dir = Path("..") / "data" / "UNSW-NB15" / "preprocessed"

# Load the whole preprocessed dataset; the cells below index it by the "Label" column.
data = load_UNSWNB15Dataset(data_dir)

In [5]:
# Count records per class. Summing the boolean masks gives the same numbers as
# len(data[mask]) without materializing two filtered copies of the full frame.
# int(...) keeps the results plain Python ints for the size arithmetic below.
num_all_records = len(data)
num_normal_records = int((data["Label"] == 0).sum())
num_attack_records = int((data["Label"] == 1).sum())

print("total num records:   ", num_all_records)
print("normal records:      ", num_normal_records)
print("attack records:      ", num_attack_records)

total num records:    2540047
normal records:       2218764
attack records:       321283


In [6]:
# Preprocessing steps, each configured to return pandas objects from transform().
median_imputer = SimpleImputer(strategy='median').set_output(transform='pandas')
standard_scaler = StandardScaler().set_output(transform='pandas')

# Median imputation first, then standardization.
pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

In [7]:
# full dataset

# Round the attack count down to an even number; val and test are both sized from it.
val_test_size = num_attack_records - (num_attack_records % 2)
# The training split gets the normal records that remain after subtracting that size.
train_size = num_normal_records - val_test_size

split_sizes = {"train" : train_size, "val" : val_test_size, "test" : val_test_size}
# Presumably the fraction of normal records in each split (train is all-normal,
# val/test are half normal) — confirm against split_UNSWNB15Dataset.
normal_fractions = {"train" : 1.0, "val" : 0.5, "test" : 0.5}

train_dataset, val_dataset, test_dataset = split_UNSWNB15Dataset(
    data_dir=data_dir,
    data=data,
    records_num=split_sizes,
    normal_records_num=normal_fractions,
    transformer=pipeline,
    random_state=42,
)

In [7]:
# small dataset
# NOTE: this cell assigns the same train/val/test variables as the full-dataset
# split, so running it replaces those datasets. Run only one of the two cells.
# Record counts are written as integer literals (not the float 1e5) so they have
# the same type as the sizes passed by the full-dataset split.
train_dataset, val_dataset, test_dataset = split_UNSWNB15Dataset( 
                                                        data_dir = data_dir,
                                                        data = data,
                                                        records_num = {"train" : 100_000, "val" : 100_000, "test" : 100_000},
                                                        normal_records_num = {"train" : 1.0, "val" : 0.5, "test" : 0.5},
                                                        transformer = pipeline,
                                                        random_state = 42)

In [32]:
# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=1024, num_workers=11, persistent_workers=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [17]:
# Peek at the first test batch only: each batch unpacks into three items
# (features, label, attack category), printed one after another.
for batch in test_loader:
    x, y, attack_cat = batch
    for item in (x, y, attack_cat):
        print(item)
    break

print("train dataset size:", len(train_dataset))

tensor([[-0.1888, -0.3951, -0.2451,  ...,  0.0000, -0.1515,  0.0000],
        [-0.1888, -0.3951, -0.2451,  ...,  0.0000, -0.1515,  0.0000],
        [-0.1766,  0.3783, -0.1507,  ...,  0.0000, -0.1515,  0.0000],
        ...,
        [-0.1885, -0.3919, -0.2440,  ...,  0.0000, -0.1515,  0.0000],
        [-0.1888, -0.3801, -0.2451,  ...,  0.0000, -0.1515,  0.0000],
        [ 5.3491,  0.5076,  9.8084,  ...,  0.0000, -0.1515,  0.0000]])
tensor([1, 1, 0,  ..., 0, 0, 1])
tensor([ 9,  9, 13,  ..., 13, 13,  8])
train dataset size: 1897482


In [34]:

# Input dimensionality is read from the first element of the first training sample.
input_size = train_dataset[0][0].shape[0]
print("Input dim:", input_size)

# Second positional argument is 8 (the BAE cell below passes the same value as latent_size).
model = StandardAE_Small(input_size, 8, dropout=0.5)

logger = MLFlowLogger(
    experiment_name="Efficiency test",
    tracking_uri="http://127.0.0.1:8080",
    run_name="b1024p16",
    # log_model=True,
    tags={"dataset": "UNSW-NB15"},
)

# The run is named "b1024p16" (batch size 1024, 16-bit precision), so request
# 16-bit mixed precision explicitly. Without it the Trainer falls back to its
# default precision and the run would be logged under a misleading name.
trainer = L.Trainer(accelerator='gpu', max_epochs=20, precision="16-mixed", logger=logger)

# The MLflow logger ends up saving two copies of each checkpoint: the desired
# one in the mlartifacts folder and an unwanted one in the notebook's directory.
# Workaround: redirect the unwanted copy to a dedicated folder that can be
# gitignored and deleted after training.
# See https://github.com/Lightning-AI/pytorch-lightning/issues/17904 for details.
trainer.checkpoint_callback.dirpath = "bin_for_redundant_checkpoints"

trainer.fit(model, train_loader, val_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:654: Checkpoint directory bin_for_redundant_checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 35.0 K | train
1 | decoder | Sequential | 35.2 K | train
-----------------------------------------------
70.2 K    Trainable params
0         Non-trainable params
70.2 K    Total params
0.281     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Input dim: 204


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [15]:
# Show the threshold the model currently holds before overriding it.
print("threshold:", model.threshold)

# Manually chosen replacement threshold for the evaluation below.
tuned_threshold = 0.04
model.threshold = tuned_threshold

# Runs the validation loop on the test loader; the returned metrics are the cell output.
trainer.validate(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


threshold: 0.03


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': -0.18045319616794586,
  'val_accuracy': 0.9499499797821045,
  'val_precision': 0.9460161328315735,
  'val_recall': 0.9543600082397461,
  'positive_rate': 0.5044099688529968}]

In [25]:
# Input dimensionality is read from the first element of the first training sample.
input_size = train_dataset[0][0].shape[0]

# BIRCH-related settings and base-autoencoder settings, grouped for readability.
birch_params = dict(
    birch_threshold=0.5,
    birch_branching_factor=50,
    birch_n_clusters=3,
)
autoencoder_params = dict(
    base_model=StandardAE_Small,
    input_size=input_size,
    latent_size=8,
    dropout=0.5,
)
model = BAE(**birch_params, **autoencoder_params)

# Runtime settings (device and data-loading options) handed to the model.
tech_params = dict(
    accelerator='gpu',
    batch_size=1024,
    num_workers=11,
    persistent_workers=True,
)
model.set_tech_params(**tech_params)

# Fit on the training dataset; experiment/run names identify the logged run.
model.fit(
    train_dataset,
    birch_fit_sample_size=10000,
    experiment_name="BAE test v5",
    run_name="test full ds",
    max_epochs=20,
)

Fitting BIRCH model...
Fitting BIRCH model done.
Predicting clusters...
Predicting clusters done.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
Experiment with name BAE test v5 not found. Creating it.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 35.0 K | train
1 | decoder | Sequential | 35.2 K | train
-----------------------------------------------
70.2 K    Trainable params
0         Non-trainable params
70.2 K    Total params
0.281     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 35.0 K | train
1 | decoder | Sequential | 35.2 K | train
-----------------------------------------------
70.2 K    Trainable params
0         Non-trainable params
70.2 K    Total params
0.281     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode
C:\Users\Szymon\AppData\Roaming\Python\Python311\site-packages\lightning\pytorch\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | encoder | Sequential | 35.0 K | train
1 | decoder | Sequential | 35.2 K | train
-----------------------------------------------
70.2 K    Trainable params
0         Non-trainable params
70.2 K    Total params
0.281     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [26]:
# Evaluate the fitted BAE model on the validation dataset and keep the result
# for display in the next cell.
metrics = model.evaluate(val_dataset)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [27]:
# Display the evaluation result (bare last expression renders as the cell output).
metrics

{'accuracy': tensor(0.9532),
 'precision': tensor(0.9565),
 'recall': tensor(0.9496),
 'f1': tensor(0.9530)}