In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import sys
sys.path.append('./datasets')
sys.path.append('./models')
sys.path.append('./utils')
sys.path.append('./visualization')

import pandas as pd

from utils.experiment_utils import CSVLogger
from visualization.comparison_plots import compare_with_literature, compare_models

In [None]:
# Directory that holds the CSV logs; log file names are resolved against it.
root_logs_dir = Path('../out/logs/final/')
# Directory under which the generated figures are saved.
root_plots_dir = Path('../out/figures/final_commas/')

# Datasets included in the comparison. The list position also defines the
# order in which per-dataset results are sorted.
datasets_order = [
    'KDDCUP99',
    'UNSW-NB15',
    'CTU-13_08',
    'CTU-13_09',
    'CTU-13_10',
    'CTU-13_13',
]

def handle_model(log_files, metric_field, model_name, literature_scores, literature_index, per_dataset_stats):
    """Aggregate one model's logged runs per dataset, record and plot them.

    Every file in ``log_files`` is read through ``CSVLogger`` (resolved
    against the module-level ``root_logs_dir``). Rows whose ``dataset_name``
    is not in the module-level ``datasets_order`` are dropped, the rest are
    grouped by dataset, and the mean and standard deviation of
    ``metric_field`` are computed per group. The results are ordered like
    ``datasets_order``, stored in ``per_dataset_stats`` and, when at least one
    literature score is given, passed on to ``compare_with_literature``.

    Args:
        log_files: Non-empty sequence of CSV log file names, relative to
            ``root_logs_dir``. If the same dataset shows up in several files,
            the value from the file listed last is kept.
        metric_field: Name of the log column to aggregate.
        model_name: Label recorded in ``per_dataset_stats``; also used as the
            file name of the literature figure.
        literature_scores: Reference scores, ``None`` where unavailable.
            Forwarded unchanged to ``compare_with_literature``.
            NOTE(review): presumably one entry per ``datasets_order`` item, so
            it only lines up with ``names`` when every dataset is present in
            the logs -- confirm against ``compare_with_literature``.
        literature_index: Forwarded unchanged to ``compare_with_literature``.
        per_dataset_stats: Dict updated in place. Maps a dataset name to
            ``{'models': [...], 'mean': [...], 'std': [...]}`` (parallel
            lists). An existing entry for ``model_name`` is overwritten rather
            than duplicated.

    Raises:
        ValueError: If ``log_files`` is empty.
    """
    if not log_files:
        # pd.concat would raise the same exception type, with a vaguer message.
        raise ValueError("handle_model() needs at least one log file")

    gr_dfs = []
    for log_file in log_files:
        df = CSVLogger(root_logs_dir / log_file).get_df()

        # Keep only the datasets that take part in the comparison.
        df = df[df['dataset_name'].isin(datasets_order)]

        gr_dfs.append(df.groupby(['dataset_name'], as_index=False).agg({
            'dataset_name': 'first',
            metric_field: ['mean', 'std'],
        }))

    gr_df = pd.concat(gr_dfs, ignore_index=True)

    # Select the name column explicitly instead of calling a bare .squeeze():
    # on a one-row frame .squeeze() returns a scalar, which has no .to_list().
    name_col = gr_df['dataset_name']
    if isinstance(name_col, pd.DataFrame):
        name_col = name_col.iloc[:, 0]
    names = name_col.to_list()
    means = gr_df[metric_field]['mean'].to_list()
    stds = gr_df[metric_field]['std'].to_list()

    # One (mean, std) per dataset: a dataset reported by several log files
    # would otherwise be registered twice. Later files override earlier ones.
    stats_by_dataset = dict(zip(names, zip(means, stds)))

    # Every remaining name is in datasets_order (filtered above), so .index
    # cannot fail here.
    names = sorted(stats_by_dataset, key=datasets_order.index)
    means = [stats_by_dataset[name][0] for name in names]
    stds = [stats_by_dataset[name][1] for name in names]

    for name, mean, std in zip(names, means, stds):
        entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
        if model_name in entry['models']:
            # Already registered: refresh in place so repeated calls stay idempotent.
            pos = entry['models'].index(model_name)
            entry['mean'][pos] = mean
            entry['std'][pos] = std
        else:
            entry['models'].append(model_name)
            entry['mean'].append(mean)
            entry['std'].append(std)

    # Compare with literature if any literature scores are provided
    if any(score is not None for score in literature_scores):
        save_path = root_plots_dir / f"literature/{model_name}.png"
        # Make sure the target directory exists before the figure is written.
        save_path.parent.mkdir(parents=True, exist_ok=True)
        compare_with_literature(means, stds, literature_scores, names, literature_index, model_name, save_path=save_path)

In [None]:
# Fresh accumulator, filled by handle_model:
# dataset name -> {'models': [...], 'mean': [...], 'std': [...]}
per_dataset_stats = {}

# One row per model:
# (log files, metric column, model label, literature scores, literature index)
model_runs = [
    (["AE.csv"], 'val_auroc', 'AE(RE)', [None, None, None, None, None, None], 0),
    (["CAE.csv"], 'val_auroc', 'CAE(M-CEN)', [None, None, 0.994, 0.959, 0.996, 0.979], 3),
    (["SAE_cen.csv"], 'val_auroc', 'SAE(CEN)', [None, 0.886, 0.991, 0.950, 0.999, 0.969], 1),
    (["SAE_svm.csv"], 'val_auroc', 'SAE(OCSVM)', [None, 0.893, 0.990, 0.950, 0.999, 0.971], 1),
    (["SAE_lof.csv"], 'val_auroc', 'SAE(LOF)', [None, 0.894, 0.983, 0.960, 1.000, 0.975], 1),
    (["KSAE_SEP_cen.csv"], 'test_auroc', 'KSAE(CEN)', [None, 0.885, None, 0.946, 0.989, 0.962], 2),
    (["KSAE_SEP_svm.csv", "KSAE_SEP_svm_2.csv"], 'test_auroc', 'KSAE(OCSVM)', [None, 0.885, None, 0.946, 0.989, 0.962], 2),
    (["KSAE_SEP_lof.csv", "KSAE_SEP_lof_2.csv"], 'test_auroc', 'KSAE(LOF)', [None, 0.885, None, 0.946, 0.989, 0.962], 2),
    (["BAE_2.csv"], 'test_auroc', 'BAE', [None, None, None, None, None, None], 4),
]

# Process the models in the order listed above; that order is also the order
# in which they are appended to each dataset's entry.
for run_logs, run_metric, run_label, run_literature, run_index in model_runs:
    handle_model(run_logs, run_metric, run_label, run_literature, run_index, per_dataset_stats)

# One model-comparison figure per dataset.
for name in per_dataset_stats:
    summary = per_dataset_stats[name]
    compare_models(
        name,
        summary['models'],
        summary['mean'],
        summary['std'],
        save_path=root_plots_dir / f"models/{name}.png",
    )


In [None]:
# Summarise the AE log: one row per pack_id with the mean/std of val_auroc,
# val_mcc and fit_duration, plus the first dataset_name, hidden_sizes and
# initial_lr seen in each pack.
logger = CSVLogger("../out/logs/final/AE.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'val_auroc': ['mean', 'std'],
    'val_mcc': ['mean', 'std'],
    'hidden_sizes': 'first',
    'initial_lr': 'first',
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['val_auroc']['mean'].to_list()
stds = gr_df['val_auroc']['std'].to_list()

literature = [None, None, None, None, None, None]

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds,literature, names, 'AE(RE)', save_path='../out/figures/pre-final/AE.png')

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'AE(RE)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

gr_df.head(50)


In [None]:
# Summarise the CAE log: one row per pack_id with the mean/std of val_auroc,
# val_mcc and fit_duration, plus the first dataset_name, hidden_sizes and
# initial_lr seen in each pack.
logger = CSVLogger("../out/logs/final/CAE.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'val_auroc': ['mean', 'std'],
    'val_mcc': ['mean', 'std'],
    'hidden_sizes': 'first',
    'initial_lr': 'first',
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['val_auroc']['mean'].to_list()
stds = gr_df['val_auroc']['std'].to_list()

literature = [None, None, 0.994, 0.959, 0.996, 0.979]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'CAE(M-CEN)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# NOTE(review): `literature` is passed positionally alongside `names`; the two
# only line up if the grouped rows come out in the order the literature list
# was written for -- confirm against compare_with_literature.
compare_with_literature(means, stds, literature, names, 39, 'CAE(M-CEN)', save_path='../out/figures/final_commas/literature/CAE(M-CEN).png')

gr_df.head(50)

In [None]:
# Summarise the SAE (LOF) log: one row per pack_id with the mean/std of
# val_auroc and fit_duration, plus the first dataset_name, hidden_sizes and
# initial_lr seen in each pack.
logger = CSVLogger("../out/logs/final/SAE_lof.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'val_auroc': ['mean', 'std'],
    'hidden_sizes': 'first',
    'initial_lr': 'first',
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['val_auroc']['mean'].to_list()
stds = gr_df['val_auroc']['std'].to_list()

literature = [None, 0.894, 0.983, 0.960, 1.000, 0.975]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'SAE(LOF)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# NOTE(review): `literature` is passed positionally alongside `names`; the two
# only line up if the grouped rows come out in the order the literature list
# was written for -- confirm against compare_with_literature.
compare_with_literature(means, stds, literature, names, 8, 'SAE(LOF)', save_path='../out/figures/final_commas/literature/SAE(LOF).png')

gr_df.head(50)

In [None]:
# Summarise the SAE (CEN) log: one row per pack_id with the mean/std of
# val_auroc, val_mcc and fit_duration, plus the first dataset_name,
# hidden_sizes and initial_lr seen in each pack.
logger = CSVLogger("../out/logs/final/SAE_cen.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'val_auroc': ['mean', 'std'],
    'val_mcc': ['mean', 'std'],
    'hidden_sizes': 'first',
    'initial_lr': 'first',
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['val_auroc']['mean'].to_list()
stds = gr_df['val_auroc']['std'].to_list()

literature = [None, 0.886, 0.991, 0.950, 0.999, 0.969]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'SAE(CEN)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 8, 'SAE(CEN)', save_path='../out/figures/final/literature/SAE(CEN).png')

gr_df.head(50)

In [None]:
# Summarise the SAE (OCSVM) log: one row per pack_id with the mean/std of
# val_auroc and fit_duration, plus the first dataset_name, hidden_sizes and
# initial_lr seen in each pack.
logger = CSVLogger("../out/logs/final/SAE_svm.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'val_auroc': ['mean', 'std'],
    'hidden_sizes': 'first',
    'initial_lr': 'first',
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['val_auroc']['mean'].to_list()
stds = gr_df['val_auroc']['std'].to_list()

literature = [None, 0.893, 0.990, 0.950, 0.999, 0.971]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'SAE(OCSVM)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 8, 'SAE(OCSVM)', save_path='../out/figures/final/literature/SAE(OCSVM).png')

gr_df.head(50)

In [None]:
# Summarise the KSAE (CEN) log: one row per pack_id with the mean/std of
# test_auroc and fit_duration, plus the first dataset_name seen in each pack.
logger = CSVLogger("../out/logs/final/KSAE_SEP_cen.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'test_auroc': ['mean', 'std'],
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['test_auroc']['mean'].to_list()
stds = gr_df['test_auroc']['std'].to_list()

literature = [None, 0.885, None, 0.946, 0.989, 0.962]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'KSAE(CEN)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 5, 'KSAE(CEN)', save_path='../out/figures/final/literature/KSAE(CEN).png')

gr_df.head(50)

In [None]:
# Summarise the two KSAE (LOF) logs: each is grouped by dataset_name
# (mean/std of test_auroc and fit_duration) and the two summaries are stacked.
logger = CSVLogger("../out/logs/final/KSAE_SEP_lof.csv")
logger2 = CSVLogger("../out/logs/final/KSAE_SEP_lof_2.csv")

df = logger.get_df()
df2 = logger2.get_df()

# Same aggregation for both logs.
agg_spec = {
    'dataset_name': 'first',
    'test_auroc': ['mean', 'std'],
    'fit_duration' : ['mean', 'std'],
}

gr_df = df.groupby(['dataset_name'], as_index=False).agg(agg_spec)
gr_df2 = df2.groupby(['dataset_name'], as_index=False).agg(agg_spec)

gr_df = pd.concat([gr_df, gr_df2], ignore_index=True)

# Hand-picked positional reorder of the stacked rows.
# NOTE(review): these positions depend on which datasets each log file
# contains -- re-check them whenever the logs change.
gr_df = gr_df.iloc[[3, 4, 0, 5, 1, 2]]

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['test_auroc']['mean'].to_list()
stds = gr_df['test_auroc']['std'].to_list()

literature = [None, 0.885, None, 0.946, 0.989, 0.962]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'KSAE(LOF)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 5, 'KSAE(LOF)', save_path='../out/figures/final/literature/KSAE(SEP,LOF).png')

gr_df.head(50)

In [None]:
# Summarise the two KSAE (OCSVM) logs: each is grouped by dataset_name
# (mean/std of test_auroc and fit_duration) and the two summaries are stacked.
logger = CSVLogger("../out/logs/final/KSAE_SEP_svm.csv")
logger2 = CSVLogger("../out/logs/final/KSAE_SEP_svm_2.csv")

df = logger.get_df()
df2 = logger2.get_df()

# Same aggregation for both logs.
agg_spec = {
    'dataset_name': 'first',
    'test_auroc': ['mean', 'std'],
    'fit_duration' : ['mean', 'std'],
}

gr_df = df.groupby(['dataset_name'], as_index=False).agg(agg_spec)
gr_df2 = df2.groupby(['dataset_name'], as_index=False).agg(agg_spec)

gr_df = pd.concat([gr_df, gr_df2], ignore_index=True)

# Hand-picked positional selection/reorder of the stacked rows (position 1 is
# left out).
# NOTE(review): these positions depend on which datasets each log file
# contains -- re-check them whenever the logs change.
gr_df = gr_df.iloc[[4, 5, 0, 6, 2, 3]]

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['test_auroc']['mean'].to_list()
stds = gr_df['test_auroc']['std'].to_list()

literature = [None, 0.885, None, 0.946, 0.989, 0.962]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'KSAE(OCSVM)'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 5, 'KSAE(OCSVM)', save_path='../out/figures/final/literature/KSAE(OCSVM).png')

gr_df.head(50)

In [None]:
# Summarise the BAE log: one row per pack_id with the mean/std of test_auroc
# and fit_duration, plus the first dataset_name seen in each pack.
logger = CSVLogger("../out/logs/final/BAE_2.csv")

df = logger.get_df()

gr_df = df.groupby(['pack_id'], as_index=False).agg({
    'dataset_name': 'first',
    'test_auroc': ['mean', 'std'],
    'fit_duration' : ['mean', 'std'],
})

# Select the name column explicitly instead of calling a bare .squeeze():
# on a one-row frame .squeeze() returns a scalar, which has no .to_list().
name_col = gr_df['dataset_name']
if isinstance(name_col, pd.DataFrame):
    name_col = name_col.iloc[:, 0]
names = name_col.to_list()
means = gr_df['test_auroc']['mean'].to_list()
stds = gr_df['test_auroc']['std'].to_list()

literature = [None, None, None, None, None, None]

# Register (or refresh) this model's score for every dataset. Updating in
# place keeps the cell idempotent: running it again, or running it when the
# label is already present in per_dataset_stats, must not list the model twice.
model_label = 'BAE'
for name, mean, std in zip(names, means, stds):
    entry = per_dataset_stats.setdefault(name, {'models': [], 'mean': [], 'std': []})
    if model_label in entry['models']:
        pos = entry['models'].index(model_label)
        entry['mean'][pos] = mean
        entry['std'][pos] = std
    else:
        entry['models'].append(model_label)
        entry['mean'].append(mean)
        entry['std'].append(std)

# Disabled literature plot, kept for reference:
# compare_with_literature(means, stds, literature, names, 56, 'BAE', save_path='../out/figures/final/literature/BAE.png')

gr_df.head(50)

In [None]:
# One model-comparison figure per dataset collected in per_dataset_stats.
for name in per_dataset_stats:
    summary = per_dataset_stats[name]
    compare_models(
        name,
        summary['models'],
        summary['mean'],
        summary['std'],
        save_path=f'../out/figures/final/models/{name}.png',
    )