In [None]:
%load_ext autoreload
%autoreload 2

import ast
import sys
from pathlib import Path
sys.path.append('./datasets')
sys.path.append('./models')
sys.path.append('./utils')
sys.path.append('./visualization')

import numpy as np
import pandas as pd

from utils.experiment_utils import CSVLogger

from visualization.comparison_plots import compare_param

In [None]:
# Dataset identifiers. Each value is interpolated verbatim into the
# tuning-log file names read by the cells below.
KDDCUP99 = 'KDDCUP99'
UNSWNB15 = 'UNSW-NB15'

# The four CTU-13 identifiers used in this notebook.
CTU13_08, CTU13_09, CTU13_10, CTU13_13 = (
    'CTU-13_08',
    'CTU-13_09',
    'CTU-13_10',
    'CTU-13_13',
)

In [None]:
# Base output locations, relative to the notebook's working directory.
_out_dir = Path('..') / 'out'

# Directory holding the tuning logs.
root_logs_dir = _out_dir / 'logs'
# Directory under which every figure produced by this notebook is saved.
root_plots_dir = _out_dir / 'figures' / 'final_commas'

In [None]:
# Build `sae_pd`: for every dataset, one frame that stacks the tuning logs of
# the three SAE variants (cen, svm, lof), reduced to a common set of columns.

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

# Columns kept from every variant so the frames can be stacked row-wise.
sae_columns = ['input_size', 'hidden_sizes', 'initial_lr', 'val_auroc', 'lmb']
# Subset shown in the printed preview (everything except 'lmb').
preview_columns = ['input_size', 'hidden_sizes', 'initial_lr', 'val_auroc']

sae_pd = {}

for dataset in datasets:
    # cen variant: every logged run is kept.
    sae_cen = CSVLogger(f"../out/logs/SAE_cen_tuning_{dataset}.csv").df[sae_columns]

    # svm variant: only runs with nu below 0.1.
    svm_log = CSVLogger(f"../out/logs/SAE_svm_tuning_{dataset}.csv").df
    sae_svm = svm_log.loc[svm_log['nu'] < 0.1, sae_columns]

    # lof variant: only runs with n_neighbors equal to 200.
    lof_log = CSVLogger(f"../out/logs/SAE_lof_tuning_{dataset}.csv").df
    sae_lof = lof_log.loc[lof_log['n_neighbors'] == 200, sae_columns]

    # Stack the three variants row-wise; the original row indices are kept.
    sae_concat = pd.concat([sae_cen, sae_svm, sae_lof])
    sae_pd[dataset] = sae_concat

    print(f"SAE {dataset} tuning results:")
    print(sae_concat[preview_columns].head(15))


In [None]:
# Size: mean validation AUROC as a function of network size.

model = 'SAE'
model_print = 'SAE'

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# Networks at or above this size (in thousands of parameters) are left off the plot.
limit = 2000

for dataset in datasets:

    print(f"Processing dataset: {dataset}")

    # To reuse this cell for AE or CAE, load the log directly instead of using sae_pd:
    # logger = CSVLogger(f"../out/logs/{model}_tuning_{dataset}.csv") # AE and CAE
    # df = logger.get_df() # AE and CAE

    df = sae_pd[dataset] # SAE

    # Keep a single learning rate and only the larger lmb values.
    # df = df[df['initial_lr'] == 0.1] # AE
    df = df[df['initial_lr'] == 0.01] # SAE
    df = df[df['lmb'] >= 20] # SAE
    # df = df[df['clustering_force'] >= 300] # CAE
    # df = df[df['centering_force'] >= 300] #  CAE

    # One row per architecture: mean/std of the validation AUROC plus the
    # architecture string and input size needed to compute the parameter count.
    gr_df = df.groupby(['hidden_sizes'], as_index=False).agg({
        'val_auroc': ['mean', 'std'],
        'hidden_sizes': 'first',
        'input_size': 'first',
    })

    hidden_sizes = np.array(gr_df['hidden_sizes'].squeeze().to_list())
    input_sizes = np.array(gr_df['input_size'].squeeze().to_list())
    auroc = np.array(gr_df[('val_auroc', 'mean')].to_list())

    net_sizes = []

    for ins, hs in zip(input_sizes, hidden_sizes):
        # `hs` is the textual form of the layer-size sequence stored in the log.
        # literal_eval parses Python literals only, so nothing read from the
        # CSV file can be executed as code (unlike eval).
        sizes = ast.literal_eval(hs)
        net_size = 0
        last_size = ins
        for size in sizes:
            # Dense layer: weight matrix (last_size x size) plus `size` biases.
            net_size += last_size * size + size
            last_size = size
        # Doubled -- presumably to account for a decoder mirroring these layers;
        # TODO confirm against the model definition.
        net_size *= 2
        net_sizes.append(net_size / 1000)   # in thousands of parameters

    net_sizes = np.array(net_sizes)

    # Order by network size, then drop everything at or above `limit`.
    # The same ordering and mask are applied to all three arrays so they stay aligned.
    sorted_idx = np.argsort(net_sizes)
    net_sizes = net_sizes[sorted_idx]
    keep = net_sizes < limit  # named `keep` so the builtin `filter` is not shadowed

    net_sizes = net_sizes[keep]
    auroc = auroc[sorted_idx][keep]
    hidden_sizes = hidden_sizes[sorted_idx][keep]

    for arch, n_params, score in zip(hidden_sizes, net_sizes, auroc):
        print(f"{dataset} {arch} {n_params} {score}")

    dataset_names.append(dataset)
    param_values.append(net_sizes)
    mean_scores.append(auroc)


# Each figure is saved twice: colour-blind-friendly palette and default palette.
compare_param(dataset_names, param_values, mean_scores, "liczba parametrów sieci (tys.)", title=f"{model_print}", color_blind_mode=True, save_path = root_plots_dir / "tuning" / "model_size" / f"{model}_tuning_model_size_dalton.png")
compare_param(dataset_names, param_values, mean_scores, "liczba parametrów sieci (tys.)", title=f"{model_print}", save_path = root_plots_dir / "tuning" / "model_size" / f"{model}_tuning_model_size.png")


In [None]:
# CAE: mean validation AUROC as a function of the number of clusters.

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# Cluster counts at or above this value are left off the plot.
limit = 2000

for dataset in datasets:

    logger = CSVLogger(f"../out/logs/CAE_tuning_{dataset}.csv")

    df = logger.get_df()

    # Only runs with this learning rate are compared.
    df = df[df['initial_lr'] == 0.05]

    # One row per cluster count with the mean/std of the validation AUROC.
    gr_df = df.groupby(['num_clusters'], as_index=False).agg({
        'val_auroc': ['mean', 'std'],
        'num_clusters': 'first',
    })

    param = np.array(gr_df['num_clusters'].squeeze().to_list())
    auroc = np.array(gr_df[('val_auroc', 'mean')].to_list())

    sorted_idx = np.argsort(param)
    param = param[sorted_idx]
    keep = param < limit  # named `keep` so the builtin `filter` is not shadowed

    # Mask x and y together; masking only `auroc` would leave the two arrays
    # with different lengths as soon as any value reaches `limit`.
    param = param[keep]
    auroc = auroc[sorted_idx][keep]

    # FIXME(review): this overwrites the second plotted mean for UNSW-NB15 with
    # a hard-coded 0.9, so the saved figure does not show the logged value.
    # Document the reason for the override or remove it.
    if dataset == UNSWNB15:
        auroc[1] = 0.9

    dataset_names.append(dataset)
    param_values.append(param)
    mean_scores.append(auroc)


# The x-axis ticks are taken from the first dataset's cluster counts.
# Each figure is saved twice: default palette and colour-blind-friendly palette.
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title="CAE", ticks=param_values[0], save_path = root_plots_dir / "tuning" / "clusters" / "CAE_clusters.png")
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title="CAE", ticks=param_values[0], color_blind_mode=True, save_path = root_plots_dir / "tuning" / "clusters" / "CAE_clusters_dalton.png")


In [None]:
# SAE lmb: mean validation AUROC as a function of the lmb coefficient.

model = 'SAE_svm'
model_print = 'SAE(OCSVM)'

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# lmb values at or above this value are left off the plot.
limit = 2000

for dataset in datasets:

    print(f"Processing dataset: {dataset}")

    logger = CSVLogger(f"../out/logs/{model}_tuning_{dataset}.csv")

    df = logger.get_df()

    # Keep a single learning rate and only lmb values of at least 1.
    df = df[df['initial_lr'] == 0.01]
    df = df[df['lmb'] >= 1]

    # Cast lmb to int so the groups and axis ticks are integers. `assign`
    # returns a new frame, which avoids writing into a filtered slice
    # (the original in-place assignment triggers SettingWithCopyWarning).
    df = df.assign(lmb=df['lmb'].astype(int))

    # One row per lmb value with the mean/std of the validation AUROC.
    gr_df = df.groupby(['lmb'], as_index=False).agg({
        'val_auroc': ['mean', 'std'],
        'lmb': 'first',
    })

    param = np.array(gr_df['lmb'].squeeze().to_list())
    auroc = np.array(gr_df[('val_auroc', 'mean')].to_list())

    sorted_idx = np.argsort(param)
    param = param[sorted_idx]
    keep = param < limit  # named `keep` so the builtin `filter` is not shadowed

    # Mask x and y together; masking only `auroc` would leave the two arrays
    # with different lengths as soon as any value reaches `limit`.
    param = param[keep]
    auroc = auroc[sorted_idx][keep]

    dataset_names.append(dataset)
    param_values.append(param)
    mean_scores.append(auroc)


# The x-axis ticks are taken from the first dataset's lmb values.
# Each figure is saved twice: colour-blind-friendly palette and default palette.
compare_param(dataset_names, param_values, mean_scores, "$\\lambda$", title=f"{model_print}", ticks=param_values[0], color_blind_mode=True, save_path = root_plots_dir / "tuning" / "coef" / f"{model}_lmb_dalton.png")
compare_param(dataset_names, param_values, mean_scores, "$\\lambda$", title=f"{model_print}", ticks=param_values[0], color_blind_mode=False, save_path = root_plots_dir / "tuning" / "coef" / f"{model}_lmb.png")


In [None]:
# CAE coef: mean validation AUROC as a function of one CAE loss coefficient.

model = 'CAE'
model_print = 'CAE'

# Column to sweep over: 'clustering_force' or 'centering_force'.
coef = 'clustering_force'

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# Coefficient values at or above this value are left off the plot.
limit = 2000

for dataset in datasets:

    print(f"Processing dataset: {dataset}")

    logger = CSVLogger(f"../out/logs/{model}_tuning_{dataset}.csv")

    # No learning-rate or coefficient filter is applied here: every logged run
    # contributes to the per-coefficient mean.
    df = logger.get_df()

    # One row per value of `coef` with the mean/std of the validation AUROC.
    gr_df = df.groupby([coef], as_index=False).agg({
        'val_auroc': ['mean', 'std'],
        'centering_force': 'first',
        'clustering_force': 'first',
    })

    param = np.array(gr_df[coef].squeeze().to_list())
    auroc = np.array(gr_df[('val_auroc', 'mean')].to_list())

    sorted_idx = np.argsort(param)
    param = param[sorted_idx]
    keep = param < limit  # named `keep` so the builtin `filter` is not shadowed

    # Mask x and y together; masking only `auroc` would leave the two arrays
    # with different lengths as soon as any value reaches `limit`.
    param = param[keep]
    auroc = auroc[sorted_idx][keep]

    dataset_names.append(dataset)
    param_values.append(param)
    mean_scores.append(auroc)


# Axis label: lambda with subscript 'k' for clustering_force, 'c' otherwise.
coef_label = f"$\\lambda_{'k' if coef == 'clustering_force' else 'c'}$"

# The x-axis ticks are taken from the first dataset's coefficient values.
# Each figure is saved twice: colour-blind-friendly palette and default palette.
compare_param(dataset_names, param_values, mean_scores, coef_label, title=f"{model_print}", ticks=param_values[0], color_blind_mode=True, save_path = root_plots_dir / "tuning" / "coef" / f"{model}_{coef}_dalton.png")
# NOTE(review): the "_lmb" suffix below does not match the "_dalton" sibling's
# naming and looks like a copy-paste leftover; the file name is kept unchanged
# in case other documents already reference it -- confirm before renaming.
compare_param(dataset_names, param_values, mean_scores, coef_label, title=f"{model_print}", ticks=param_values[0], color_blind_mode=False, save_path = root_plots_dir / "tuning" / "coef" / f"{model}_{coef}_lmb.png")


In [None]:
# KSAE clusters: mean AUROC as a function of the number of k-means clusters.

model = 'KSAE_cen'
model_print = 'KSAE(CEN)'

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# Cluster counts at or above this value are left off the plot.
limit = 2000

for dataset in datasets:

    print(f"Processing dataset: {dataset}")

    logger = CSVLogger(f"../out/logs/{model}_tuning_{dataset}.csv")

    # No learning-rate filter is applied here: every logged run contributes.
    df = logger.get_df()

    # One row per cluster count with the mean/std of the AUROC.
    # NOTE(review): this cell aggregates 'test_auroc', whereas the SAE/CAE
    # tuning cells use 'val_auroc' -- confirm this is intentional.
    gr_df = df.groupby(['kmeans_n_clusters'], as_index=False).agg({
        'test_auroc': ['mean', 'std'],
        'kmeans_n_clusters': 'first',
    })

    param = np.array(gr_df['kmeans_n_clusters'].squeeze().to_list())
    auroc = np.array(gr_df[('test_auroc', 'mean')].to_list())

    sorted_idx = np.argsort(param)
    param = param[sorted_idx]
    keep = param < limit  # named `keep` so the builtin `filter` is not shadowed

    # Mask x and y together; masking only `auroc` would leave the two arrays
    # with different lengths as soon as any value reaches `limit`.
    param = param[keep]
    auroc = auroc[sorted_idx][keep]

    dataset_names.append(dataset)
    param_values.append(param)
    mean_scores.append(auroc)


# The x-axis ticks are taken from the first dataset's cluster counts.
# Each figure is saved twice: default palette and colour-blind-friendly palette.
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title=f"{model_print}", ticks=param_values[0], save_path = root_plots_dir / "tuning" / "clusters" / f"{model}_clusters.png")
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title=f"{model_print}", ticks=param_values[0],color_blind_mode=True, save_path = root_plots_dir / "tuning" / "clusters" / f"{model}_clusters_dalton.png")


In [None]:
# BAE clusters: mean AUROC as a function of the number of BIRCH clusters.

datasets = [KDDCUP99, UNSWNB15, CTU13_08, CTU13_09, CTU13_10, CTU13_13]

dataset_names = []
param_values = []
mean_scores = []

# Cluster counts at or above this value are left off the plot.
limit = 2000

for dataset in datasets:

    logger = CSVLogger(f"../out/logs/BAE_tuning_{dataset}.csv")

    df = logger.get_df()

    # Only runs with this base-model learning rate are compared.
    df = df[df['base_model_kwargs_initial_lr'] == 0.1]

    # One row per cluster count with the mean/std of the AUROC.
    # NOTE(review): this cell aggregates 'test_auroc', whereas the SAE/CAE
    # tuning cells use 'val_auroc' -- confirm this is intentional.
    gr_df = df.groupby(['birch_n_clusters'], as_index=False).agg({
        'test_auroc': ['mean', 'std'],
        'birch_n_clusters': 'first',
    })

    param = np.array(gr_df['birch_n_clusters'].squeeze().to_list())
    auroc = np.array(gr_df[('test_auroc', 'mean')].to_list())

    sorted_idx = np.argsort(param)
    param = param[sorted_idx]
    keep = param < limit  # named `keep` so the builtin `filter` is not shadowed

    # Mask x and y together; masking only `auroc` would leave the two arrays
    # with different lengths as soon as any value reaches `limit`.
    param = param[keep]
    auroc = auroc[sorted_idx][keep]

    dataset_names.append(dataset)
    param_values.append(param)
    mean_scores.append(auroc)


# The x-axis ticks are taken from the first dataset's cluster counts.
# Each figure is saved twice: default palette and colour-blind-friendly palette.
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title="BAE", ticks=param_values[0], save_path = root_plots_dir / "tuning" / "clusters" / "BAE_clusters.png")
compare_param(dataset_names, param_values, mean_scores, "liczba klastrów", title="BAE", ticks=param_values[0], color_blind_mode=True, save_path = root_plots_dir / "tuning" / "clusters" / "BAE_clusters_dalton.png")
