In [None]:
%load_ext autoreload
%autoreload 2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np

import sys
sys.path.append('./datasets')
sys.path.append('./models')
sys.path.append('./utils')
sys.path.append('./callbacks')
sys.path.append('./visualization')

from utils.experiment_utils import CSVLogger, run_experiment, load_dataset, load_dataset_folds, run_cross_validation

from callbacks.LatentSpacePlotter import LatentSpacePlotter
from callbacks.LatentSpaceEvolutionPlotter import LatentSpaceEvolutionPlotter

from visualization.metric_plots import plot_ROC_curve

from models.AE import AE
from models.BAE import BAE
from models.SAE import SAE
from models.CAE import CAE
from models.KSAE import KSAE
from models.CAEAA import CAEAA

In [None]:
# Shared preprocessing applied when loading datasets: median imputation
# followed by standard scaling. Both steps are set to return pandas output.
median_imputer = SimpleImputer(strategy='median').set_output(transform='pandas')
standard_scaler = StandardScaler().set_output(transform='pandas')

pipeline = Pipeline([('imputer', median_imputer), ('scaler', standard_scaler)])

In [None]:
# KDDCUP99 = load_dataset("KDDCUP99", "../data/KDDCUP99/preprocessed/")
# CICIDS2017 = load_dataset("CICIDS2017", "../data/CIC-IDS2017/preprocessed/")
# UNSWNB15 = load_dataset("UNSW-NB15", "../data/UNSW-NB15/preprocessed/")
# CTU13_08 = load_dataset("CTU-13_08", "../data/CTU-13/preprocessed/08", pipeline=pipeline)
# CTU13_09 = load_dataset("CTU-13_09", "../data/CTU-13/preprocessed/09", pipeline=pipeline)
# CTU13_10 = load_dataset("CTU-13_10", "../data/CTU-13/preprocessed/10", pipeline=pipeline)
# CTU13_13 = load_dataset("CTU-13_13", "../data/CTU-13/preprocessed/13", pipeline=pipeline)

In [None]:
# Load every benchmark as 3 cross-validation folds. Each call receives the same
# shared `pipeline` object defined above.
fold_options = dict(kfolds=3, pipeline=pipeline)

KDDCUP99_folds = load_dataset_folds("KDDCUP99", "../data/KDDCUP99/preprocessed/", **fold_options)
UNSWNB15_folds = load_dataset_folds("UNSW-NB15", "../data/UNSW-NB15/preprocessed/", **fold_options)

# CTU-13 scenarios 08, 09, 10 and 13 live in separate sub-directories.
CTU13_08_folds = load_dataset_folds("CTU-13_08", "../data/CTU-13/preprocessed/08", **fold_options)
CTU13_09_folds = load_dataset_folds("CTU-13_09", "../data/CTU-13/preprocessed/09", **fold_options)
CTU13_10_folds = load_dataset_folds("CTU-13_10", "../data/CTU-13/preprocessed/10", **fold_options)
CTU13_13_folds = load_dataset_folds("CTU-13_13", "../data/CTU-13/preprocessed/13", **fold_options)

In [None]:
# Quick single-run check: train a plain AE for one epoch on the first
# KDDCUP99 fold. `dataset` and `model` remain in the notebook namespace and
# are read again by the cells that follow.
dataset = KDDCUP99_folds

ae_settings = dict(
    # second dimension of the training split's `x` array
    input_size=dataset[0]['train'].x.shape[1],
    hidden_sizes=[256, 64, 8],
    batch_norm=True,
    initial_lr=0.01,
    optimizer="Adadelta",
    optimizer_params={'rho': 0.9},
    scheduler="StepLR",
    scheduler_params={'step_size': 20, 'gamma': 0.5},
)
model = AE(**ae_settings)

# Latent-space plotting callbacks are currently disabled; the list is empty.
run_experiment(
    model,
    dataset[0],
    max_epochs=1,
    trainer_callbacks=[
        # LatentSpacePlotter("../out/figures/test/latent_space_plotter", "test", dataset[0]['train'].name, plot_epochs=[39]),
        # LatentSpaceEvolutionPlotter("../out/figures/test/latent_space_evolution_plotter", "test", dataset[0]['train'].name, plot_epochs=[0, 19, 39])
    ],
)

# fpr, tpr, auc, cluster_sizes = model.calc_ROC(dataset[0]['val'])

# plot_ROC_curve(fpr, tpr, auc, cluster_sizes, model.name, dataset[0]['train'].name, "../out/figures/test/roc_curve.png")

In [None]:
# Build a BAE around an AE base model and train it for 40 epochs on the first
# fold of `dataset`, which is bound by the preceding cell.
bae_settings = dict(
    # BAE-specific (birch_*) arguments
    birch_threshold=0.1,
    birch_branching_factor=50,
    birch_fit_quantile=0.9,
    birch_n_clusters=2,
    birch_fit_sample_size=15000,
    # base autoencoder and its training configuration
    base_model=AE,
    input_size=dataset[0]['train'].x.shape[1],
    hidden_sizes=[128, 64, 8],
    batch_norm=True,
    initial_lr=0.01,
    optimizer="Adadelta",
    optimizer_params={'rho': 0.9},
    scheduler="StepLR",
    scheduler_params={'step_size': 20, 'gamma': 0.5},
)
model = BAE(**bae_settings)

run_experiment(model, dataset[0], max_epochs=40)

In [None]:
# Evaluate the model left in `model` by the preceding cell at several
# threshold quantiles on the first fold of `dataset`, logging the metrics
# returned for each quantile.
# NOTE(review): the log file is called "ksae.csv" although the preceding cell
# builds a BAE - confirm the file name is intended.
logger = CSVLogger("../out/logs/test/threshold/ksae.csv")

# Quantiles 0.80, 0.85, 0.90, 0.95 (1.0 excluded, as before).
# np.linspace with an explicit count is used instead of np.arange with a float
# step: with a non-integer step the number of elements and their exact values
# depend on floating-point rounding. Rounding keeps the logged values clean.
threshold_quantiles = np.linspace(0.80, 0.95, num=4).round(2).tolist()

for threshold_quantile in threshold_quantiles:
    metrics = model.test_threshold_quantile(dataset[0]['train'], dataset[0]['val'], threshold_quantile)
    metrics['quantile'] = threshold_quantile  # record which quantile produced this row
    logger.log(metrics)
    

In [None]:
#  CAE threshold tuning
#
# For one fold of every dataset: train a CAE, then evaluate each candidate
# threshold quantile and log the metrics to a per-dataset CSV file.

fold = 1  # fold index used by this cell (also read by the SAE SVM threshold cell)

datasets = [
    KDDCUP99_folds[fold],
    UNSWNB15_folds[fold],
    CTU13_08_folds[fold],
    CTU13_09_folds[fold],
    CTU13_10_folds[fold],
    CTU13_13_folds[fold],
]
# One cluster count per entry of `datasets`, paired by position.
nums_clusters = [2, 4, 2, 2, 2, 2]

hidden_sizes = [[256, 64]]
initial_lrs = [5e-2]

threshold_quantiles = [1.0]
# threshold_quantiles = np.arange(0.9, 1.005, 0.005).tolist()


for dataset, num_clusters in zip(datasets, nums_clusters):
    for initial_lr in initial_lrs:
        for hidden_size in hidden_sizes:
            # shape[0] of the first element of the first training item
            # (presumably the feature vector - confirm against the dataset class)
            input_size = dataset['train'][0][0].shape[0]
            # Extend the layer list with a final layer of int(sqrt(input_size)) + 1 units.
            hidden_size = [*hidden_size, int(input_size ** 0.5) + 1]

            cae_settings = dict(
                input_size=input_size,
                hidden_sizes=hidden_size,
                batch_norm=True,
                initial_lr=initial_lr,
                optimizer="Adadelta",
                optimizer_params={'rho': 0.9},
                scheduler="StepLR",
                scheduler_params={'step_size': 30, 'gamma': 0.5},
                num_clusters=num_clusters,
            )
            model = CAE(**cae_settings)

            result = run_experiment(
                model=model,
                dataset=dataset,
                max_epochs=70,
                experiment_name=f"CAE tuning {dataset['train'].name} {hidden_size}",
                run_name=f"lr={initial_lr} num_clusters={num_clusters}",
                save_model=False,
            )

            logger = CSVLogger(f"../out/logs/threshold/CAE_{dataset['train'].name}.csv")

            # One logged row per quantile, tagged with the quantile that produced it.
            for threshold_quantile in threshold_quantiles:
                metrics = model.test_threshold_quantile(dataset['train'], dataset['val'], threshold_quantile)
                metrics['quantile'] = threshold_quantile
                logger.log(metrics)


In [None]:
#  SAE SVM threshold tuning
#
# For one fold of every dataset: train an SAE with the 'svm' one-class
# algorithm, then evaluate a sweep of threshold quantiles and log the metrics
# to a per-dataset CSV file.
# NOTE: `fold` is not defined in this cell; it is the value set by the
# CAE threshold tuning cell above.

datasets = [KDDCUP99_folds[fold], UNSWNB15_folds[fold], CTU13_08_folds[fold], CTU13_09_folds[fold], CTU13_10_folds[fold], CTU13_13_folds[fold]]

hidden_sizes = [[256, 64]]
initial_lrs = [1e-2]

# 11 quantiles from 0.95 to 1.0 inclusive, in steps of 0.005.
# np.linspace is used instead of np.arange(0.95, 1.005, 0.005): with a float
# step, arange's element count and values depend on rounding, and the last
# value could drift past 1.0, which is not a valid quantile. linspace returns
# the end point exactly; rounding keeps the logged values clean.
threshold_quantiles = np.linspace(0.95, 1.0, num=11).round(3).tolist()


for dataset in datasets:
    for initial_lr in initial_lrs:
        for hidden_size in hidden_sizes:
            # shape[0] of the first element of the first training item
            # (presumably the feature vector - confirm against the dataset class)
            input_size = dataset['train'][0][0].shape[0]
            # Append a final layer of int(sqrt(input_size)) + 1 units.
            hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
            model = SAE(
                input_size=input_size,
                hidden_sizes=hidden_size,
                batch_norm=True,
                initial_lr=initial_lr,
                optimizer="Adadelta",
                optimizer_params={'rho': 0.9},
                scheduler="StepLR",
                scheduler_params={'step_size': 30, 'gamma': 0.5},
                occ_algorithm='svm',
                occ_fit_sample_size=15000,
                fit_occ_once=True,
                lmb=50,
                nu=0.05,
            )

            result = run_experiment(
                model=model,
                dataset=dataset,
                max_epochs=80,
                experiment_name=f"SAE tuning {dataset['train'].name} {hidden_size}",
                run_name=f"lr={initial_lr}",
                save_model=False,
            )

            logger = CSVLogger(f"../out/logs/threshold/SAE_svm_{dataset['train'].name}.csv")

            # One logged row per quantile, tagged with the quantile that produced it.
            for threshold_quantile in threshold_quantiles:
                metrics = model.test_threshold_quantile(dataset['train'], dataset['val'], threshold_quantile)
                metrics['quantile'] = threshold_quantile
                logger.log(metrics)


In [None]:
#  AE threshold tuning
#
# For fold 0 of every dataset: train a plain AE, then evaluate each candidate
# threshold quantile and log the metrics to a per-dataset CSV file.

datasets = [
    KDDCUP99_folds[0],
    UNSWNB15_folds[0],
    CTU13_08_folds[0],
    CTU13_09_folds[0],
    CTU13_10_folds[0],
    CTU13_13_folds[0],
]

hidden_sizes = [[256, 64]]
initial_lrs = [1e-1]

threshold_quantiles = [1.0]
# threshold_quantiles = np.arange(0.9, 1.0, 0.005).tolist()


for dataset in datasets:
    for initial_lr in initial_lrs:
        for hidden_size in hidden_sizes:
            # shape[0] of the first element of the first training item
            # (presumably the feature vector - confirm against the dataset class)
            input_size = dataset['train'][0][0].shape[0]
            # Extend the layer list with a final layer of int(sqrt(input_size)) + 1 units.
            hidden_size = [*hidden_size, int(input_size ** 0.5) + 1]

            ae_settings = dict(
                input_size=input_size,
                hidden_sizes=hidden_size,
                batch_norm=True,
                initial_lr=initial_lr,
                optimizer="Adadelta",
                optimizer_params={'rho': 0.9},
                scheduler="StepLR",
                scheduler_params={'step_size': 30, 'gamma': 0.5},
            )
            model = AE(**ae_settings)

            result = run_experiment(
                model=model,
                dataset=dataset,
                max_epochs=70,
                experiment_name=f"AE tuning {dataset['train'].name} {hidden_size}",
                run_name=f"lr={initial_lr}",
                save_model=False,
            )

            # NOTE(review): the "AE_svm_" prefix looks copied from the SAE SVM cell.
            # The next cell reads the same path, so rename both together if at all.
            logger = CSVLogger(f"../out/logs/threshold/AE_svm_{dataset['train'].name}.csv")

            # One logged row per quantile, tagged with the quantile that produced it.
            for threshold_quantile in threshold_quantiles:
                metrics = model.test_threshold_quantile(dataset['train'], dataset['val'], threshold_quantile)
                metrics['quantile'] = threshold_quantile
                logger.log(metrics)


In [None]:
# For every entry of `datasets` (as bound by the preceding cell), reopen the
# AE threshold log and print its quantile and test-MCC columns.
for dataset in datasets:
    logger = CSVLogger(f"../out/logs/threshold/AE_svm_{dataset['train'].name}.csv")
    quantile_vs_mcc = logger.df[['quantile', 'test_mcc']]

    print(f"Results for {dataset['train'].name}:")
    print(quantile_vs_mcc)

In [None]:
#  AE tuning
#
# Cross-validated grid search over AE layer sizes and learning rates on the
# CTU-13 09/10/13 folds. One result is logged per configuration to a
# per-dataset CSV file.

datasets = [CTU13_09_folds, CTU13_10_folds, CTU13_13_folds]

hidden_sizes = [[64, 32], [32, 16]]
initial_lrs = [1e-1]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/AE_tuning_{dataset[0]['train'].name}.csv")
    # The learning-rate grid is now actually iterated. Previously `initial_lrs`
    # was defined but the model hard-coded initial_lr=1e-1, so editing the
    # list had no effect. With the current single-entry grid the runs are identical.
    for initial_lr in initial_lrs:
        for hidden_size in hidden_sizes:
            # shape[0] of the first element of the first training item of fold 0
            # (presumably the feature vector - confirm against the dataset class)
            input_size = dataset[0]['train'][0][0].shape[0]
            # Append a final layer of int(sqrt(input_size)) + 1 units.
            hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
            model = AE(
                input_size=input_size,
                hidden_sizes=hidden_size,
                batch_norm=True,
                initial_lr=initial_lr,
                optimizer="Adadelta",
                optimizer_params={'rho': 0.9},
                scheduler="StepLR",
                scheduler_params={'step_size': 20, 'gamma': 0.5},
                threshold_quantile=0.95,
            )

            # run_name is kept unchanged so existing runs stay comparable;
            # add the learning rate to it if more than one value is swept.
            result, _ = run_cross_validation(
                model=model,
                dataset_folds=dataset,
                max_epochs=40,
                experiment_name=f"AE th tuning {dataset[0]['train'].name}",
                run_name=f"{hidden_size}",
                save_model=False,
            )

            logger.log(result)

In [None]:
#  SAE cen tuning
#
# Cross-validated grid search for SAE with the 'centroid' one-class algorithm,
# sweeping learning rate, `lmb` and layer sizes on every dataset. One result is
# logged per configuration to a per-dataset CSV file.

datasets = [
    KDDCUP99_folds,
    UNSWNB15_folds,
    CTU13_08_folds,
    CTU13_09_folds,
    CTU13_10_folds,
    CTU13_13_folds,
]

hidden_sizes = [[256, 64]]
initial_lrs = [1e-2]
lmbs = [1, 10]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/SAE_cen_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for lmb in lmbs:
            for hidden_size in hidden_sizes:
                # shape[0] of the first element of the first training item of fold 0
                # (presumably the feature vector - confirm against the dataset class)
                input_size = dataset[0]['train'][0][0].shape[0]
                # Extend the layer list with a final layer of int(sqrt(input_size)) + 1 units.
                hidden_size = [*hidden_size, int(input_size ** 0.5) + 1]

                sae_settings = dict(
                    input_size=input_size,
                    hidden_sizes=hidden_size,
                    batch_norm=True,
                    initial_lr=initial_lr,
                    optimizer="Adadelta",
                    optimizer_params={'rho': 0.9},
                    scheduler="StepLR",
                    scheduler_params={'step_size': 20, 'gamma': 0.5},
                    lmb=lmb,
                    occ_algorithm='centroid',
                    fit_occ_once=True,
                    occ_fit_sample_size=15000,
                )
                model = SAE(**sae_settings)

                # Only the first element of the returned pair is logged.
                result, _ = run_cross_validation(
                    model=model,
                    dataset_folds=dataset,
                    max_epochs=10,
                    experiment_name=f"SAE cen tuning {dataset[0]['train'].name} {hidden_size}",
                    run_name=f"lr={initial_lr} lmb={lmb}",
                    save_model=False,
                )

                logger.log(result)

In [None]:
#  SAE lof tuning
#
# Cross-validated grid search for SAE with the 'lof' one-class algorithm,
# sweeping learning rate, `lmb` and layer sizes on every dataset. One result is
# logged per configuration to a per-dataset CSV file.

datasets = [KDDCUP99_folds, UNSWNB15_folds, CTU13_08_folds, CTU13_09_folds, CTU13_10_folds, CTU13_13_folds]

# Each grid entry must itself be a list of layer sizes. The previous flat
# `[128, 64]` made `hidden_size` an int, so `hidden_size + [...]` below raised
# TypeError on the first iteration.
hidden_sizes = [[128, 64]]
initial_lrs = [1e-3]
lmbs = [50]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/SAE_lof_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for lmb in lmbs:
            for hidden_size in hidden_sizes:
                # shape[0] of the first element of the first training item of fold 0
                # (presumably the feature vector - confirm against the dataset class)
                input_size = dataset[0]['train'][0][0].shape[0]
                # Append a final layer of int(sqrt(input_size)) + 1 units.
                hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
                model = SAE(
                    input_size=input_size,
                    hidden_sizes=hidden_size,
                    batch_norm=True,
                    initial_lr=initial_lr,
                    optimizer="Adadelta",
                    optimizer_params={'rho': 0.9},
                    scheduler="StepLR",
                    scheduler_params={'step_size': 20, 'gamma': 0.5},
                    lmb=lmb,
                    occ_algorithm='lof',
                    fit_occ_once=True,
                    occ_fit_sample_size=15000,
                    n_neighbors=200,
                )

                # Only the first element of the returned pair is logged.
                result, _ = run_cross_validation(
                    model=model,
                    dataset_folds=dataset,
                    max_epochs=30,
                    experiment_name=f"SAE lof tuning {dataset[0]['train'].name} {hidden_size}",
                    run_name=f"lr={initial_lr} lmb={lmb}",
                    save_model=False,
                )

                logger.log(result)

In [None]:
#  SAE svm tuning
#
# Cross-validated grid search for SAE with the 'svm' one-class algorithm,
# sweeping learning rate, `lmb`, `nu` and layer sizes on every dataset. One
# result is logged per configuration to a per-dataset CSV file.

datasets = [
    KDDCUP99_folds,
    UNSWNB15_folds,
    CTU13_13_folds,
    CTU13_08_folds,
    CTU13_09_folds,
    CTU13_10_folds,
]

hidden_sizes = [[64, 32], [32, 16]]
initial_lrs = [1e-2]
lmbs = [50]
nus = [0.05]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/SAE_svm_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for lmb in lmbs:
            for nu in nus:
                for hidden_size in hidden_sizes:
                    # shape[0] of the first element of the first training item of fold 0
                    # (presumably the feature vector - confirm against the dataset class)
                    input_size = dataset[0]['train'][0][0].shape[0]
                    # Extend the layer list with a final layer of int(sqrt(input_size)) + 1 units.
                    hidden_size = [*hidden_size, int(input_size ** 0.5) + 1]

                    sae_settings = dict(
                        input_size=input_size,
                        hidden_sizes=hidden_size,
                        batch_norm=True,
                        initial_lr=initial_lr,
                        optimizer="Adadelta",
                        optimizer_params={'rho': 0.9},
                        scheduler="StepLR",
                        scheduler_params={'step_size': 20, 'gamma': 0.5},
                        lmb=lmb,
                        occ_algorithm='svm',
                        nu=nu,
                        fit_occ_once=True,
                        occ_fit_sample_size=15000,
                    )
                    model = SAE(**sae_settings)

                    # Only the first element of the returned pair is logged.
                    result, _ = run_cross_validation(
                        model=model,
                        dataset_folds=dataset,
                        max_epochs=40,
                        experiment_name=f"SAE svm tuning {dataset[0]['train'].name} {hidden_size}",
                        run_name=f"lr={initial_lr} lmb={lmb}, nu={nu}",
                        save_model=False,
                    )

                    logger.log(result)

In [None]:
#  CAE tuning
#
# Cross-validated grid search for CAE on KDDCUP99 and UNSW-NB15, sweeping
# learning rate, number of clusters and layer sizes. One result is logged per
# configuration to a per-dataset CSV file.

datasets = [KDDCUP99_folds, UNSWNB15_folds]

hidden_sizes = [[256, 64]]
initial_lrs = [5e-2]
nums_clusters = [2, 3, 4]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/CAE_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for num_clusters in nums_clusters:
            for hidden_size in hidden_sizes:
                # shape[0] of the first element of the first training item of fold 0
                # (presumably the feature vector - confirm against the dataset class)
                input_size = dataset[0]['train'][0][0].shape[0]
                # Extend the layer list with a final layer of int(sqrt(input_size)) + 1 units.
                hidden_size = [*hidden_size, int(input_size ** 0.5) + 1]

                cae_settings = dict(
                    input_size=input_size,
                    hidden_sizes=hidden_size,
                    batch_norm=True,
                    initial_lr=initial_lr,
                    optimizer="Adadelta",
                    optimizer_params={'rho': 0.9},
                    scheduler="StepLR",
                    scheduler_params={'step_size': 20, 'gamma': 0.5},
                    num_clusters=num_clusters,
                )
                model = CAE(**cae_settings)

                # Only the first element of the returned pair is logged.
                result, _ = run_cross_validation(
                    model=model,
                    dataset_folds=dataset,
                    max_epochs=40,
                    experiment_name=f"CAE tuning {dataset[0]['train'].name} {hidden_size}",
                    run_name=f"lr={initial_lr} num_clusters={num_clusters}",
                    save_model=False,
                )

                logger.log(result)

In [None]:
#  KSAE SEP svm tuning
#
# Cross-validated grid search for KSAE wrapping an SAE base model with the
# 'svm' one-class algorithm. Sweeps learning rate, `lmb`, `nu`, number of
# k-means clusters and layer sizes; one result is logged per configuration to
# a per-dataset CSV file.

datasets = [KDDCUP99_folds, UNSWNB15_folds, CTU13_10_folds]

hidden_sizes = [[256, 64, 24]]
initial_lrs = [1e-2]
lmbs = [50]
nus = [0.05]
nums_clusters = [2, 3, 4]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/KSAE_svm_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for lmb in lmbs:
            for nu in nus:
                for num_clusters in nums_clusters:
                    for hidden_size in hidden_sizes:
                        # shape[0] of the first element of the first training item of fold 0
                        # (presumably the feature vector - confirm against the dataset class)
                        input_size = dataset[0]['train'][0][0].shape[0]
                        # Append a final layer of int(sqrt(input_size)) + 1 units.
                        hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
                        model = KSAE(
                            kmeans_n_clusters=num_clusters,
                            kmeans_fit_quantile=0.9,
                            kmeans_fit_sample_size=15000,
                            base_model=SAE,
                            input_size=input_size,
                            hidden_sizes=hidden_size,
                            batch_norm=True,
                            initial_lr=initial_lr,
                            optimizer="Adadelta",
                            optimizer_params={'rho': 0.9},
                            scheduler="StepLR",
                            scheduler_params={'step_size': 20, 'gamma': 0.5},
                            lmb=lmb,
                            occ_algorithm='svm',
                            nu=nu,
                            fit_occ_once=True,
                            occ_fit_sample_size=15000,
                        )

                        # run_name now includes num_clusters. It is swept over
                        # several values, and without it every cluster count
                        # produced a run with the same name. The format matches
                        # the run_name used by the CAE tuning cell.
                        result, _ = run_cross_validation(
                            model=model,
                            dataset_folds=dataset,
                            max_epochs=40,
                            experiment_name=f"KSAE svm tuning {dataset[0]['train'].name} {hidden_size}",
                            run_name=f"lr={initial_lr} lmb={lmb}, nu={nu} num_clusters={num_clusters}",
                            save_model=False,
                            fit_params={'adjust_epochs': True},
                        )

                        logger.log(result)

In [None]:
#  KSAE SEP cen tuning
#
# Cross-validated grid search for KSAE wrapping an SAE base model with the
# 'centroid' one-class algorithm. Sweeps learning rate, `lmb`, number of
# k-means clusters and layer sizes; one result is logged per configuration to
# a per-dataset CSV file.

datasets = [UNSWNB15_folds, CTU13_08_folds, CTU13_09_folds, CTU13_10_folds, CTU13_13_folds]
# datasets = [UNSWNB15_folds]

hidden_sizes = [[128, 64]]
initial_lrs = [1e-2]
lmbs = [50]
nums_clusters = [2, 3, 4]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/KSAE_cen_adj_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for lmb in lmbs:
            for num_clusters in nums_clusters:
                for hidden_size in hidden_sizes:
                    # shape[0] of the first element of the first training item of fold 0
                    # (presumably the feature vector - confirm against the dataset class)
                    input_size = dataset[0]['train'][0][0].shape[0]
                    # Append a final layer of int(sqrt(input_size)) + 1 units.
                    hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
                    model = KSAE(
                        kmeans_n_clusters=num_clusters,
                        kmeans_fit_quantile=0.9,
                        kmeans_fit_sample_size=15000,
                        base_model=SAE,
                        input_size=input_size,
                        hidden_sizes=hidden_size,
                        batch_norm=True,
                        initial_lr=initial_lr,
                        optimizer="Adadelta",
                        optimizer_params={'rho': 0.9},
                        scheduler="StepLR",
                        scheduler_params={'step_size': 20, 'gamma': 0.5},
                        lmb=lmb,
                        occ_algorithm='centroid',
                        fit_occ_once=True,
                        occ_fit_sample_size=15000,
                    )

                    # run_name now includes num_clusters. It is swept over
                    # several values, and without it every cluster count
                    # produced a run with the same name. The format matches
                    # the run_name used by the CAE tuning cell.
                    result, _ = run_cross_validation(
                        model=model,
                        dataset_folds=dataset,
                        max_epochs=15,
                        experiment_name=f"KSAE cen adj4 tuning {dataset[0]['train'].name} {hidden_size}",
                        run_name=f"lr={initial_lr} lmb={lmb} num_clusters={num_clusters}",
                        save_model=False,
                        fit_params={'adjust_epochs': True},
                    )

                    logger.log(result)

In [None]:
#  BAE tuning
#
# Cross-validated grid search for BAE wrapping an SAE base model. Sweeps
# learning rate, number of clusters and layer sizes; one result is logged per
# configuration to a per-dataset CSV file.

# datasets = [KDDCUP99_folds, UNSWNB15_folds, CTU13_09_folds]
datasets = [UNSWNB15_folds]

hidden_sizes = [[256, 64, 24]]
initial_lrs = [1e-1]
nums_clusters = [2]

for dataset in datasets:
    logger = CSVLogger(f"../out/logs/BAE_tuning_{dataset[0]['train'].name}.csv")
    for initial_lr in initial_lrs:
        for num_clusters in nums_clusters:
            for hidden_size in hidden_sizes:
                # shape[0] of the first element of the first training item of fold 0
                # (presumably the feature vector - confirm against the dataset class)
                input_size = dataset[0]['train'][0][0].shape[0]
                # Append a final layer of int(sqrt(input_size)) + 1 units.
                hidden_size = hidden_size + [int(input_size ** 0.5) + 1]
                model = BAE(
                    birch_branching_factor=50,
                    birch_threshold=0.1,
                    birch_n_clusters=num_clusters,
                    birch_fit_quantile=0.9,
                    birch_fit_sample_size=15000,
                    base_model=SAE,
                    input_size=input_size,
                    hidden_sizes=hidden_size,
                    batch_norm=True,
                    initial_lr=initial_lr,
                    optimizer="Adadelta",
                    optimizer_params={'rho': 0.9},
                    scheduler="StepLR",
                    scheduler_params={'step_size': 20, 'gamma': 0.5},
                )

                # run_name now includes num_clusters so runs stay
                # distinguishable once more than one cluster count is swept.
                # The format matches the run_name used by the CAE tuning cell.
                result, _ = run_cross_validation(
                    model=model,
                    dataset_folds=dataset,
                    max_epochs=40,
                    experiment_name=f"BAE adj4 tuning {dataset[0]['train'].name} {hidden_size}",
                    run_name=f"lr={initial_lr} num_clusters={num_clusters}",
                    save_model=False,
                    fit_params={'adjust_epochs': True},
                )

                logger.log(result)