In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mplhep as hep
import torch

In [2]:
from src.models import TranAD, iTransformer


In [3]:
# Use the mplhep `ROOT` and `firamath` style entries for every figure created below.
plt.style.use([hep.style.ROOT, hep.style.firamath])

In [4]:
# Hyper-parameter sets, keyed by model identifier.
# `window`, `steps`, `eps` and `latent` are substituted into the run-directory glob
# patterns used throughout this notebook; `lab` is the text shown in legends/titles.
config = {
    'iTransformer1': {'window': 10, 'steps': 1, 'latent': 2, 'eps': 100, 'lab': 'iTransformer: w10 s1 l2'},
    'iTransformer2': {'window': 100, 'steps': 50, 'latent': 2, 'eps': 100, 'lab': 'iTransformer: w100 s50 l2'},
    'iTransformer3': {'window': 10, 'steps': 5, 'latent': 2, 'eps': 100, 'lab': 'iTransformer: w10 s5 l2'},
    # Label now carries the same "iTransformer: " prefix as the other variants.
    'iTransformer4': {'window': 100, 'steps': 10, 'latent': 2, 'eps': 100, 'lab': 'iTransformer: w100 s10 l2'},
    'TranAD': {'window': 10, 'steps': 1, 'eps': 100, 'latent': '', 'lab': 'TranAD: w10 s1'},
#  'MAD_GAN': {'lab': 'MAD_GAN'},
#  'OmniAnomaly': {'lab': 'OmniAnomaly'},
#  'LSTM_AE': {'lab': 'LSTM_AE'},
#  'DAGMM': {'lab': 'DAGMM'},
#  'USAD': {'lab': 'USAD'},
#  'IF': {'lab': 'IF'},
#  'None': {'lab': 'None'}
}

## validation losses

In [None]:
# Collect the per-epoch training/validation losses stored in the final checkpoint
# of every run (fold) that matches the hyper-parameters in `config`.
datasets = ['GECCO', 'IEEECIS_new2.2', 'UCR', 'SWaT_1D', 'SMAP_new', 'MSL_new', 'SMD']
models = ['iTransformer2'] # ['iTransformer1', 'iTransformer2', 'iTransformer3', 'iTransformer4', 'TranAD']


all_paths = []
# key (result directory name, with the variant number inserted for iTransformer variants)
#   -> {'val_loss': [one curve per run], 'train_loss': [one curve per run]}
losses = {}
for dataset in datasets:
    for model in models:
        # Numbered iTransformer variants ('iTransformer1', ...) share one results directory.
        is_itransformer_variant = 'iTransformer' in model and model != 'iTransformer'
        if is_itransformer_variant:
            # paths = glob.glob(f'iTransformer_results_lxplus/iTransformer_{dataset}')
            paths = glob.glob(f'iTransformer/iTransformer_{dataset}')
        else:
            # paths = glob.glob(f'{model}_results_lxplus/{model}_{dataset}')
            paths = glob.glob(f'{model}/{model}_{dataset}')
        all_paths.extend(paths)
        if not paths:
            print(f'No paths found for {model} on {dataset}')
        feats = 30 if dataset == 'IEEECIS_new2.2' else -1

        for path in paths:
            if model in config:
                params = config[model]
                if model == 'TranAD':
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}*feats{feats}*/checkpoints/model_final.ckpt"
                else:
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}_feats{feats}_eps{params['eps']}_latent{params['latent']}_7fold*/checkpoints/model_final.ckpt"
            else:
                pattern = f'{path}/*n_window10_steps1*feats{feats}*/checkpoints/model_final.ckpt'
            res_path = glob.glob(pattern)

            if not res_path:
                print(f'No results found for {model} on {dataset}')
                continue

            res_path = np.sort(res_path)
            print(model, dataset, len(res_path))
            # print(res_path)
            key = res_path[0].split('/')[1]
            if is_itransformer_variant:
                # insert the variant number right after 'iTransformer' to tell the variants apart
                idx = len('iTransformer')
                key = key[:idx] + model[idx:] + key[idx:]
            losses[key] = {'val_loss': [], 'train_loss': []}
            for ckpt_file in res_path:
                # Do NOT reuse the name `model` here: it is the loop variable that the
                # config lookups and messages above rely on.
                checkpoint = torch.load(ckpt_file)
                accuracy_list = checkpoint['accuracy_list']
                # each entry is indexed as (train loss, validation loss)
                lossT = [entry[0] for entry in accuracy_list]
                lossV = [entry[1] for entry in accuracy_list]
                losses[key]['val_loss'].append(lossV)
                losses[key]['train_loss'].append(lossT)

print(losses.keys())
# print(len(losses['iTransformer_GECCO']['val_loss']), len(losses['iTransformer_GECCO']['val_loss'][0]))

In [49]:
# Plot training and validation losses
def plot_losses(losses, dataset):
    """Draw the train/validation loss curves of every entry of `losses` that belongs to `dataset`.

    Parameters
    ----------
    losses : dict
        Maps a key to ``{'train_loss': [...], 'val_loss': [...]}`` where each list holds
        one loss curve per run.
    dataset : str
        Entries are selected when this string is contained in their key.

    One figure is shown (and then closed) per matching key; nothing is returned.
    """
    for key, curves in losses.items():
        if dataset not in key:
            continue
        print(key)
        train_losses = curves['train_loss']
        val_losses = curves['val_loss']
        # One colour per run. At least 7 entries so plots with up to 7 runs keep the
        # colours they always had; more runs no longer overflow the colour table.
        colors = plt.cm.viridis(np.linspace(0, 1, max(7, len(train_losses))))

        for run, (train_curve, val_curve) in enumerate(zip(train_losses, val_losses)):
            plt.plot(train_curve, '-o', label=f'Train loss fold {run+1}', color=colors[run])
            plt.plot(val_curve, '-*', label=f'Val loss fold {run+1}', color=colors[run], markersize=8)
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title(f'Losses on {dataset}')
        plt.legend()
        plt.tight_layout()
        # plt.savefig(f'./studies_results_lxplus/losses/{key}_losses.png', facecolor='w')
        plt.show()
        plt.close()


In [None]:
# Usage: call plot_losses once per entry of `datasets` with the collected `losses` dict.
for dataset in datasets:
    plot_losses(losses, dataset)

## performance plots

In [None]:
# Aggregate the res.csv files of all runs into per-mode mean and standard deviation.
datasets = ['IEEECIS_new2.2', 'SMAP_new', 'MSL_new', 'UCR', 'SMD', 'SWaT_1D', 'GECCO']  #, 'ATLAS_TS']
models = ['iTransformer1', 'iTransformer2', 'iTransformer3', 'TranAD']
        #   'OmniAnomaly', 'MAD_GAN', 'LSTM_AE', 'DAGMM', 'USAD', 'IF', 'None']

# Row order expected by the plotting functions (they index the rows by position).
mode_order = ['local_all', 'local_all_maj', 'global']

all_paths = []
results_mean_std = {}  # key -> {'mean': DataFrame, 'std': DataFrame}, rows in `mode_order`
for dataset in datasets:
    for model in models:
        # Numbered iTransformer variants share one results directory.
        is_itransformer_variant = 'iTransformer' in model and model != 'iTransformer'
        if is_itransformer_variant:
            paths = glob.glob(f'iTransformer_results_lxplus/iTransformer_{dataset}')
        else:
            paths = glob.glob(f'{model}_results_lxplus/{model}_{dataset}')
        all_paths.extend(paths)
        if not paths:
            print(f'No paths found for {model} on {dataset}')
        feats = 30 if dataset == 'IEEECIS_new2.2' else -1

        for path in paths:
            if model in config:
                params = config[model]
                if model == 'TranAD':
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}*feats{feats}*/results/res.csv"
                else:
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}_feats{feats}_eps{params['eps']}_latent{params['latent']}*/results/res.csv"
            elif model in ('None', 'IF'):
                # these two baselines have no window/steps in their directory names
                pattern = f'{path}/*feats{feats}*/results/res.csv'
            else:
                pattern = f'{path}/*n_window10_steps1*feats{feats}*/results/res.csv'
            res_path = glob.glob(pattern)

            if res_path:
                res_path = np.sort(res_path)
                print(model, dataset, len(res_path))
                # print(res_path)
                # Keep the last three rows of every file (presumably the three summary
                # modes in `mode_order` - TODO confirm) and concatenate once.
                tmp = pd.concat([pd.read_csv(p).iloc[-3:] for p in res_path])

                key = path.split('/')[1]
                if is_itransformer_variant:
                    # insert the variant number right after 'iTransformer' to tell the variants apart
                    idx = len('iTransformer')
                    key = key[:idx] + model[idx:] + key[idx:]

                # 'Unnamed: 0' is the unnamed first CSV column, which holds the mode name.
                grouped = tmp.groupby('Unnamed: 0')
                mean_values = grouped.mean().reindex(mode_order)
                std_values = grouped.std().reindex(mode_order)
                results_mean_std[key] = {'mean': mean_values, 'std': std_values}
                # print(results_mean_std[key])
            else:
                print(f'No results found for {model} on {dataset}')

            # only the first matching directory is evaluated
            break

# print(len(all_paths))
print(results_mean_std.keys())

In [9]:
# Global line/marker/error-bar defaults for all following figures.
plt.rcParams.update({
    'lines.markersize': 6,
    'errorbar.capsize': 8,
    'lines.linewidth': 2,
})

In [10]:
# Subplot titles for plot_scores_mean_std_err. The i-th entry is paired with the i-th row
# of the result frames, which are ordered 'local_all', 'local_all_maj', 'global'.
modes = ['local (incl. OR)', 'local (maj. voting)', 'global']

In [12]:
def plot_scores_mean_std_err(results, modes, datasets, models, metric='MCC', name=None, labels=None):
    """Plot mean +/- std of `metric` per dataset, one subplot per evaluation mode.

    Parameters
    ----------
    results : dict
        Maps '<model>_<dataset>' to ``{'mean': DataFrame, 'std': DataFrame}``. Row ``i`` of
        each frame is used for ``modes[i]``.
    modes : list of str
        Subplot titles; one subplot row is created per entry.
    datasets, models : list of str
        Datasets go on the x axis, models become the plotted series.
    metric : str
        Column to plot.
    name : str or None
        When given, the figure is saved to ``./studies_results_lxplus/{name}_{metric}.png``.
    labels : list of str or None
        Legend labels, parallel to `models`; model names are used when omitted.

    Missing '<model>_<dataset>' entries are drawn as 0 with zero error.
    """
    # colors = plt.cm.plasma(np.linspace(0, 1, len(models)+1))
    # One row per mode (previously fixed to 3, which broke for any other number of modes).
    fig, axs = plt.subplots(len(modes), 1, figsize=(22, 16), sharex=True, sharey=True)
    axs = np.atleast_1d(axs)  # a single subplot is returned as a bare Axes

    x_base = np.arange(len(datasets))
    y_label = {'ROC/AUC': 'ROC AUC', 'f1': 'F1'}.get(metric, metric)

    for i, mode in enumerate(modes):
        ax = axs[i]
        for j, model in enumerate(models):
            means, stds = [], []
            for dataset in datasets:
                key = f'{model}_{dataset}'
                if key in results:
                    means.append(results[key]['mean'][metric].iloc[i])
                    stds.append(results[key]['std'][metric].iloc[i])
                else:
                    means.append(0)
                    stds.append(0)

            # shift every model a little so the error bars do not overlap
            ax.errorbar(x_base + j * 0.1, np.asarray(means, dtype=float), yerr=np.asarray(stds, dtype=float),
                        fmt='o', label=labels[j] if labels else model)

        # Axis cosmetics do not depend on the model, so they are set once per subplot.
        ax.set_xticks(x_base + 0.1 * (len(models) - 1) / 2)
        ax.set_xticklabels(labels=datasets)
        if metric == 'MCC':
            ax.set_ylim(-1, 1)
        else:
            ax.set_ylim(top=1.0)
        ax.set_ylabel(y_label)
        ax.set_title(mode)
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')

    plt.tight_layout()
    if name:
        # '/' is not usable inside a file name
        if metric == 'ROC/AUC':
            metric = 'rocauc'
        plt.savefig(f'./studies_results_lxplus/{name}_{metric}.png', facecolor='w')
    plt.show()


In [13]:
def plot_best_scores_mean_std_err(results, datasets, models, metric='MCC', name=None, labels=None):
    """Plot, per dataset and model, the best mean `metric` over all modes with its std.

    Parameters
    ----------
    results : dict
        Maps '<model>_<dataset>' to ``{'mean': DataFrame, 'std': DataFrame}`` (one row per mode).
    datasets, models : list of str
        Datasets go on the x axis, models become the plotted series.
    metric : str
        Column to plot.
    name : str or None
        When given, the figure is saved to ``./studies_results_lxplus/{name}_{metric}best.png``.
    labels : list of str or None
        Legend labels, parallel to `models`; model names are used when omitted.

    Entries that are missing, or whose `metric` column holds no valid number, are drawn
    as 0 with zero error.
    """
    fig, ax = plt.subplots(figsize=(20, 6))
    x_base = np.arange(len(datasets))

    for j, model in enumerate(models):
        means, stds = [], []
        for dataset in datasets:
            key = f'{model}_{dataset}'
            best_mean, best_std = 0, 0
            if key in results:
                mean_col = results[key]['mean'][metric]
                # An all-NaN column has no maximum; treat it like a missing result
                # instead of failing on an empty index lookup.
                if mean_col.notna().any():
                    best = int(np.nanargmax(mean_col.to_numpy(dtype=float)))  # first row holding the maximum
                    best_mean = mean_col.iloc[best]
                    best_std = results[key]['std'][metric].iloc[best]
            means.append(best_mean)
            stds.append(best_std)

        # shift every model a little so the error bars do not overlap
        ax.errorbar(x_base + j * 0.1, np.asarray(means, dtype=float), yerr=np.asarray(stds, dtype=float),
                    fmt='o', label=labels[j] if labels else model)

    ax.set_xticks(x_base + 0.1 * (len(models) - 1) / 2)
    ax.set_xticklabels(datasets)
    # ax.set_ylim(top=1.0)
    if metric == 'ROC/AUC':
        ax.set_ylabel('ROC AUC')
        ax.set_title('Best ROC AUC Scores')
        metric = 'rocauc'  # '/' is not usable inside the file name below
    elif metric == 'f1':
        ax.set_ylabel('F1')
        ax.set_title('Best F1 Scores')
    else:
        ax.set_ylabel(metric)
        ax.set_title(f'Best {metric} scores')
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')

    plt.tight_layout()
    if name:
        plt.savefig(f'./studies_results_lxplus/{name}_{metric}best.png', facecolor='w')
    plt.show()

In [None]:
# Datasets (x axis) and models (series) to show in the score plots.
data_plot = ['IEEECIS_new2.2', 'GECCO', 'SMD', 'SMAP_new', 'MSL_new', 'SWaT_1D', 'UCR'] 
# NOTE(review): 'iTransformer4' is not in the `models` list of the results-loading cell,
# so unless that list is extended its keys are absent from `results_mean_std` and every
# point is drawn at 0 - confirm this is intended.
models_plot = ['iTransformer4'] #, 'MAD_GAN', 'OmniAnomaly', 'LSTM_AE', 'DAGMM', 'USAD']
# legend labels, parallel to models_plot
lab = [config[m]['lab'] for m in models_plot]

# file-name prefix for saving; None only shows the figures
name = None  # 'new'  # 'rep5_new_all'
plot_best_scores_mean_std_err(results_mean_std, data_plot, models_plot, metric='MCC', labels=lab, name=name)
plot_scores_mean_std_err(results_mean_std, modes, data_plot, models_plot, metric='MCC', labels=lab, name=name)

In [None]:
def list_results(results, datasets, models, metric='MCC', labels=None, val='mean'):
    dict = {}
    for j, model in enumerate(models):
        mcc_scores = []
        for dataset in datasets:
            key = f'{model}_{dataset}'
            print(key)
            if key in results:
                mcc_scores.append(results[key][val][metric].max().round(3))
            else:
                mcc_scores.append(0)  # If no data, append 0

            dict[labels[j]] = mcc_scores

    return dict

In [None]:
def list_results(results, datasets, models, metric='MCC', labels=None):
    dict = {}
    for j, model in enumerate(models):
        scores = []
        for dataset in datasets:
            key = f'{model}_{dataset}'
            print(key)
            if key in results:
                scores.append(rf"${results[key]['mean'][metric].round(3)} \pm {results[key]['mean'][metric].round(3)}$")
            else:
                scores.append(0)  # If no data, append 0

            dict[labels[j]] = scores

    return dict

In [None]:
# Build a models x datasets table of `metric` from results_mean_std.
# The variable names say "bestMCC", but the reported column is whatever `metric` is set to.
val = 'mean'
metric = 'train_loss'
# tag used in the (currently disabled) output file name
namee = 'latent2_rep5_new'
data3 = ['IEEECIS_new2.2', 'GECCO', 'SMD', 'SMAP_new', 'MSL_new','UCR']
# ['ATLAS_TS', 'IEEECIS_new2.2', 'GECCO6', 'SMD', 'SMAP_new', 'MSL_new', 'SWaT', 'UCR'] #'ATLAS_TS',
models3 = ['iTransformer3', 'iTransformer4', 'TranAD']
# ['iTransformer4', 'iTransformer8', 'iTransformer6', 'iTransformer9', 'iTransformer11', 'TranAD']
# ['iTransformer3', 'iTransformer5', 'iTransformer7', 'iTransformer10', 'TranAD']
# ['iTransformer3', 'iTransformer5', 'iTransformer7', 'TranAD']
# ['iTransformer4', 'iTransformer8', 'iTransformer6', 'iTransformer9', 'TranAD']
# row labels, parallel to models3
lab = [config[m]['lab'] for m in models3]

dict_bestMCC = list_results(results_mean_std, data3, models3, metric, labels=lab, val=val)
# rows = model labels, columns = datasets
df_bestMCC = pd.DataFrame(dict_bestMCC, index=data3).T
print(df_bestMCC)
print(df_bestMCC.values)
# make the metric name safe for use in a file name
metric = metric.replace(' ', '_')
# df_bestMCC.to_csv(f'studies_earlystopping/data/best{metric}_{namee}_{val}.csv')

## plot valid vs train sets

In [5]:
from src.data_loader import MyDataset
from src.plotting import features_dict

In [6]:
%matplotlib inline

In [7]:
def plot_train_valid(dataset):
    """Save one figure per validation fold (k = 1..7) overlaying training and validation data.

    For every fold the complete training and validation series are loaded through
    `MyDataset`, at most the first 30 feature columns are kept, and each feature is drawn in
    its own subplot. Figures are written to
    ``./studies_results_lxplus/train_valid/{dataset}_valid_7fold{k}.png`` and closed.

    Parameters
    ----------
    dataset : str
        Dataset name passed to `MyDataset`; also used to look up feature names in
        `features_dict` and in the title/file name.
    """
    for k in range(1, 8):
        # window and step size don't matter because we use complete data
        train = MyDataset(dataset, 100, 100, 'iTransformer', flag='train', feats=-1, enc=False, k=k)
        valid = MyDataset(dataset, 100, 100, 'iTransformer', flag='valid', feats=-1, enc=False, k=k)
        x_train = train.get_complete_data()
        x_valid = valid.get_complete_data()
        feats = x_train.shape[1]
        # print(train.__len__(), valid.__len__())
        # print(x_train.shape, x_valid.shape)
        if feats > 30:
            # only the first 30 features are plotted
            feats = 30
            x_train = x_train[:, :feats]
            x_valid = x_valid[:, :feats]

        if dataset in features_dict:
            # Truncate the names together with the data; otherwise a name list longer
            # than the plotted columns indexes past the last subplot.
            features = list(features_dict[dataset])[:feats]
        else:
            features = [f'Dim {i}' for i in range(feats)]
        size = len(features)

        if feats > 1:
            fig, axs = plt.subplots(feats, 1, figsize=(15, size), sharex=True, constrained_layout=True)
            for i, feat in enumerate(features):
                axs[i].plot(x_train[:, i], label='Train')
                axs[i].plot(x_valid[:, i], '--', label=f'Valid fold {k}', color='tab:orange')
                axs[i].set_ylabel(feat, rotation=0, ha='right', rotation_mode='default', labelpad=5)
                axs[i].yaxis.set_label_coords(-0.1, 0.5)
            axs[0].set_title(f'Train and Validation data for {dataset}')
            if dataset == 'GECCO':
                axs[0].legend(ncol=2, bbox_to_anchor=(0.5, -0.1), loc='lower center', borderaxespad=0., frameon=False)
            else:
                axs[0].legend(ncol=2, bbox_to_anchor=(0.98, 0.25), loc='lower right', borderaxespad=0., frameon=False)
            axs[-1].set_xlabel('Timestamp')
        else:
            # a single feature: plt.subplots returns one Axes instead of an array
            fig, ax = plt.subplots(figsize=(15, 5), constrained_layout=True)
            ax.plot(x_train, label='Train')
            ax.plot(x_valid, '--', label=f'Valid fold {k}', color='tab:orange')
            ax.set_ylabel(features[0], rotation=0, ha='right', rotation_mode='default', labelpad=5)
            ax.yaxis.set_label_coords(0.3, -0.3)
            ax.set_title(f'Train and Validation data for {dataset}')
            ax.legend(ncol=2, loc='upper right', borderaxespad=0., frameon=False)
            ax.set_xlabel('Timestamp')

        fig.savefig(f'./studies_results_lxplus/train_valid/{dataset}_valid_7fold{k}.png', facecolor='w')
        # plt.show()
        plt.close(fig)

In [8]:
# Datasets for which the train/validation overview figures are produced.
datasets = ['IEEECIS_new2.2']  #, 'MSL_new', 'SMD', 'GECCO', 'IEEECIS_new2.2'] #  'GECCO', 'IEEECIS_new2.2', 'UCR', 'SWaT_1D'

# plot_train_valid writes one PNG per fold and shows nothing inline.
for dataset in datasets:
    plot_train_valid(dataset)

## study MSE vs MCC evolution

In [5]:
from main import backprop, local_pot, local_anomaly_labels
from src.data_loader import MyDataset, DataLoader
from src.pot import pot_eval
from src.diagnosis import hit_att, ndcg
import math

In [6]:
def load_model(modelname, dims, n_window, step_size=None, path=None, prob=False, weighted=False):
    """Rebuild a model with its optimizer and scheduler and restore them from a checkpoint.

    Parameters
    ----------
    modelname : str
        Name of the class to look up in `src.models`.
    dims, n_window : passed to the model constructor.
    step_size, weighted : only forwarded when `modelname` is 'iTransformer'.
    path : str
        Checkpoint file read with `torch.load`.
    prob : forwarded to the model constructor.

    Returns
    -------
    tuple
        ``(model, optimizer, scheduler, epoch, accuracy_list)`` where the last two are read
        from the checkpoint.
    """
    import src.models

    constructor = getattr(src.models, modelname)
    # iTransformer takes two more constructor arguments than the other models
    ctor_args = (dims, n_window, step_size, prob, weighted) if modelname == 'iTransformer' else (dims, n_window, prob)
    net = constructor(*ctor_args).double()

    opt = torch.optim.AdamW(net.parameters(), lr=net.lr, weight_decay=1e-5)
    sched = torch.optim.lr_scheduler.StepLR(opt, 5, 0.9)

    print(f"Loading pre-trained model: {net.name} from {path}")
    state = torch.load(path)
    # restore model, optimizer and scheduler in that order
    for target, field in ((net, 'model_state_dict'), (opt, 'optimizer_state_dict'), (sched, 'scheduler_state_dict')):
        target.load_state_dict(state[field])

    return net, opt, sched, state['epoch'], state['accuracy_list']

In [7]:
def get_scores(dataset, model, index, modeltype):
    """Evaluate every saved epoch checkpoint of one run and store the scores per epoch.

    For each ``model_epoch*.ckpt`` of the run the model is restored, losses are computed on
    the training and test data, local (per-dimension) and global anomaly labels are derived,
    and two files are written into the run's results directory:
    ``res_epoch{epoch}.csv`` (rows 'local_all', 'local_all_maj', 'global') and
    ``pred_labels_epoch{epoch}.csv``. Epochs whose two files already exist are skipped.

    Parameters
    ----------
    dataset : str
        Dataset name.
    model : str
        Key into `config` (window, steps, eps, latent).
    index : int
        Run number; it is the trailing ``_{index}`` of the run directory name.
    modeltype : str
        Model class name; 'iTransformer' selects the iTransformer directory layout.

    Returns nothing. When no checkpoint matches, a message is printed and the function
    returns without doing any work.
    """
    params = config[model]
    if modeltype == 'iTransformer':
        pattern = f'{modeltype}_results_lxplus/{modeltype}_{dataset}/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_latent{params["latent"]}_{index}/checkpoints/model_epoch*.ckpt'
    else:
        pattern = f'{model}_results_lxplus/{model}_{dataset}/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_{index}/checkpoints/model_epoch*.ckpt'
    # sort numerically by the epoch number in 'model_epoch<N>.ckpt'
    checkpoints_path = sorted(glob.glob(pattern), key=lambda x: int(x.split('_')[-1].split('.')[0].replace('epoch', '')))
    print(len(checkpoints_path))
    if not checkpoints_path:
        # previously this surfaced as an opaque IndexError on checkpoints_path[0]
        print(f'No checkpoints found for {model} on {dataset} (run {index})')
        return

    # <root>/<model>_<dataset>/<run>/checkpoints/... -> <root>/<model>_<dataset>/results/<run>
    root_dir, group_dir, run_dir = checkpoints_path[0].split('/')[:3]
    res_path = os.path.join(root_dir, group_dir, 'results', run_dir)
    print(res_path)
    os.makedirs(res_path, exist_ok=True)

    flag_less = False
    feats = -1
    if dataset == 'IEEECIS_new2.2':
        feats = 30
    elif dataset in ['SMAP_new', 'SMD']:
        flag_less = True

    # Both datasets are built with step size == window size.
    test = MyDataset(dataset, params['window'], params['window'], modeltype, flag='test', feats=feats, enc=False, less=flag_less, k=-1)
    train_test = MyDataset(dataset, params['window'], params['window'], modeltype, flag='train', feats=feats, less=flag_less, enc=False, k=-1)
    feats = test.feats
    enc_feats = test.enc_feats
    labels = test.get_labels()

    for path in checkpoints_path:
        # The epoch number used in the output file names comes from the checkpoint itself,
        # so the model has to be loaded before the "already evaluated" check.
        trained_model, optimizer, scheduler, epoch, accuracy_list = load_model(modeltype, feats, params['window'], params['steps'], path, prob=False, weighted=False)
        trained_model.eval()
        print(f'Loaded model from epoch {epoch}')

        if os.path.exists(f'{res_path}/res_epoch{epoch}.csv') and os.path.exists(f'{res_path}/pred_labels_epoch{epoch}.csv'):
            print(f'Epoch {epoch} already evaluated')
            continue

        data_loader_train_test = DataLoader(train_test, batch_size=trained_model.batch, shuffle=False)
        data_loader_test = DataLoader(test, batch_size=trained_model.batch, shuffle=False)

        lossT = backprop(-1, trained_model, data_loader_train_test, feats, optimizer, scheduler, training=False, enc_feats=enc_feats, prob=False, pred=False)  # need anomaly scores on training data for POT
        loss, y_pred = backprop(-1, trained_model, data_loader_test, feats, optimizer, scheduler, training=False, enc_feats=enc_feats, prob=False, pred=True)

        if 'iTransformer' in trained_model.name or trained_model.name in ['LSTM_AE']:
            # cut out the padding from test data, loss tensors:
            # keep the first `ts_length` entries of each series, then advance by its padded length
            lossT_tmp, loss_tmp, y_pred_tmp = [], [], []
            # print(test.get_ts_lengths(), np.sum(test.get_ts_lengths()), len(test.get_ts_lengths()))
            # print(test.get_ideal_lengths(), np.sum(test.get_ideal_lengths()), len(test.get_ideal_lengths()))
            start = 0
            for i, l in enumerate(test.get_ts_lengths()):
                loss_tmp.append(loss[start:start+l])
                y_pred_tmp.append(y_pred[start:start+l])
                start += test.get_ideal_lengths()[i]

            start = 0
            for i, l in enumerate(train_test.get_ts_lengths()):
                lossT_tmp.append(lossT[start:start+l])
                start += train_test.get_ideal_lengths()[i]

            lossT = np.concatenate(lossT_tmp, axis=0)
            loss = np.concatenate(loss_tmp, axis=0)
            y_pred = np.concatenate(y_pred_tmp, axis=0)
        train_loss = np.mean(lossT)
        test_loss = np.mean(loss)

        ### anomaly labels
        preds, _ = local_pot(loss, lossT, labels)
        # a timestamp is anomalous when at least one dimension is labelled anomalous
        true_labels = (np.sum(labels, axis=1) >= 1) + 0
        # local anomaly labels
        labelspred, result_local1 = local_anomaly_labels(preds, true_labels, nb_adim=1)
        majority = math.ceil(labels.shape[1] / 2)  # do majority voting over dimensions for local results instead of inclusive OR
        labelspred_maj, result_local2 = local_anomaly_labels(preds, true_labels, nb_adim=majority)

        # global anomaly labels
        lossTfinal, lossFinal = np.mean(lossT, axis=1), np.mean(loss, axis=1)
        # recomputed on purpose: kept from the original in case the helper above
        # modifies its argument - TODO confirm and drop if it does not
        true_labels = (np.sum(labels, axis=1) >= 1) + 0
        result_global, pred2 = pot_eval(lossTfinal, lossFinal, true_labels, None, f'all_dim')
        labelspred_glob = (pred2 >= 1) + 0
        result_global.update(hit_att(loss, labels))
        result_global.update(ndcg(loss, labels))
        result_global.update({'detection_level_q': 1e-5})
        result_global.update({'train_loss': train_loss, 'test_loss': test_loss})
        # print('\nglobal results')
        # print(result_global)

        # saving results
        df_res_global = pd.DataFrame.from_dict(result_global, orient='index').T
        df_res_global.index = ['global']
        result_local1 = pd.DataFrame.from_dict(result_local1, orient='index').T
        result_local2 = pd.DataFrame.from_dict(result_local2, orient='index').T
        result_local1.index = ['local_all']
        result_local2.index = ['local_all_maj']
        df_res_local = pd.concat([result_local1, result_local2])
        df_res = pd.concat([df_res_local, df_res_global])
        df_labels = pd.DataFrame({'local': labelspred, 'local_maj': labelspred_maj, 'global': labelspred_glob})

        df_res.to_csv(f'{res_path}/res_epoch{epoch}.csv')
        df_labels.to_csv(f'{res_path}/pred_labels_epoch{epoch}.csv', index=False)


In [None]:
# Evaluate all epoch checkpoints of runs 1..5 for the selected model/dataset combinations.
modeltype = 'TranAD'  #  'iTransformer'
datasets =  ['SMAP_new']  #['GECCO', 'IEEECIS_new2.2', 'UCR', 'SWaT_1D', 'SMAP_new', 'MSL_new', 'SMD'] 
models = ['TranAD']  # , 'iTransformer4', 'iTransformer3']  # 'TranAD'

for dataset in datasets:
    for model in models:
        # i is the run index that ends the run directory name
        for i in range(1, 6):
            get_scores(dataset, model, i, modeltype)

In [34]:
def get_data(datasets, models, modeltype):
    """Load the per-epoch result CSVs of runs 1..5 into one table per run and mode.

    Parameters
    ----------
    datasets, models : list of str
        Every combination is loaded; `models` entries are keys into `config`.
    modeltype : str
        'iTransformer' selects the iTransformer directory layout, anything else the
        ``{model}_results_lxplus`` layout.

    Returns
    -------
    dict
        ``{'{dataset}_{model}_fold{i}': {mode: DataFrame}}`` with one row per epoch, ordered
        by epoch number. A run without any ``res_epoch*.csv`` keeps an empty dict.
    """
    data = {f'{dataset}_{model}_fold{i}': {} for dataset in datasets for model in models for i in range(1, 6)}

    def epoch_number(csv_path):
        # '.../res_epoch<N>.csv' -> N
        return int(csv_path.split('_')[-1].split('.')[0].replace('epoch', ''))

    for dataset in datasets:
        for model in models:
            print(dataset, model)
            params = config[model]
            for i in range(1, 6):
                if modeltype == 'iTransformer':
                    pattern = f'{modeltype}_results_lxplus/{modeltype}_{dataset}/results/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_latent{params["latent"]}_{i}/res_epoch*.csv'
                else:
                    pattern = f'{model}_results_lxplus/{model}_{dataset}/results/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_{i}/res_epoch*.csv'
                datapath = sorted(glob.glob(pattern), key=epoch_number)
                # print(len(datapath), datapath)
                if not datapath:
                    # Without this guard the code below used a `modes` variable that was
                    # either undefined or left over from the previous run.
                    print(f'No per-epoch results found for {model} on {dataset} (run {i})')
                    continue

                # Collect the one-row frames per mode and concatenate once at the end.
                rows_per_mode = {}
                for path in datapath:
                    res = pd.read_csv(path, index_col=0)
                    for mode in res.index:
                        rows_per_mode.setdefault(mode, []).append(pd.DataFrame(res.loc[mode]).T)

                run_tables = data[f'{dataset}_{model}_fold{i}']
                for mode, rows in rows_per_mode.items():
                    run_tables[mode] = pd.concat(rows, axis=0).reset_index(drop=True)
                # print(data[f'{dataset}_{model}_fold{i}']['local_all'])
    return data

In [10]:
# Plot MCC scores and test_loss over all epochs for each mode
def plot_mcc_test_loss(data, dataset, model, metric='MCC'):
    """Plot, for runs 1..5, the best mode's `metric` per epoch together with the test loss.

    Parameters
    ----------
    data : dict
        Output of `get_data`: ``{'{dataset}_{model}_fold{i}': {mode: DataFrame}}``.
    dataset, model : str
        Select the entries of `data`; `model` is also a key into `config` for the title.
    metric : str
        Column to plot. For each run the mode whose maximum over the epochs is highest is shown.

    The figure is saved to ``./studies_results_lxplus/mcc_vs_mse/{dataset}_{model}_{metric}.png``
    and shown.
    """
    colors = plt.cm.plasma(np.linspace(0, 1, 5))

    plt.figure(figsize=(12, 8))

    for i in range(1, 6):
        run_tables = data[f'{dataset}_{model}_fold{i}']

        # Pick the mode with the highest peak score. The label is set together with the
        # scores so it can no longer be missing (first mode wins) or stale from another run.
        mcc_scores, best_mode = None, None
        for mode, table in run_tables.items():
            if mcc_scores is None or mcc_scores.max() < table[metric].max():
                mcc_scores, best_mode = table[metric], mode
        labell = f'{metric} {best_mode} fold {i}'

        test_losses = run_tables['global']['test_loss']
        plt.plot(mcc_scores, '-o', label=labell, color=colors[i-1])
        plt.plot(test_losses, '--', label=f'Test MSE fold {i}', color=colors[i-1])

    plt.xlabel('Epochs')
    plt.ylabel('Scores')
    plt.legend(loc='center right')
    plt.title(f'{dataset} - {config[model]["lab"]}')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'./studies_results_lxplus/mcc_vs_mse/{dataset}_{model}_{metric}.png', facecolor='w')
    plt.show()


In [62]:
# Plot MCC scores and train/validation losses over all epochs, one panel per run
def plot_mcc_test_loss2(data, dataset, model, modeltype, metric='MCC'):
    """Draw one panel per run (1..5) with the best mode's `metric` and the train/valid losses.

    Each panel shows `metric` per epoch for the mode with the highest peak (left axis), the
    training and validation losses from ``accuracy_list.npy`` (right axis) and a vertical
    line at the epoch stored in ``model_best.ckpt``.

    Parameters
    ----------
    data : dict
        Output of `get_data`: ``{'{dataset}_{model}_fold{i}': {mode: DataFrame}}``.
    dataset, model : str
        Select the entries of `data`; `model` is also a key into `config`.
    modeltype : str
        'iTransformer' selects the iTransformer directory layout.
    metric : str
        Column to plot.

    The figure is saved to
    ``./studies_results_lxplus/mcc_vs_mse/{dataset}_{model}_{metric}2.png``. It is not closed
    here. Runs without scores or without an ``accuracy_list.npy`` are reported and their panel
    is left empty.
    """
    fig, axs = plt.subplots(2, 3, figsize=(24, 12), sharex=False, sharey=True, constrained_layout=True)
    axs = axs.flatten()
    print(model, dataset)
    params = config[model]

    for run in range(1, 6):
        ax = axs[run - 1]
        run_tables = data[f'{dataset}_{model}_fold{run}']

        # mode with the highest peak score
        mcc_scores, labell = None, None
        for mode, table in run_tables.items():
            if mcc_scores is None or mcc_scores.max() < table[metric].max():
                mcc_scores, labell = table[metric], f'{metric} {mode}'
        if mcc_scores is None:
            print(f'No scores available for {model} on {dataset} (run {run})')
            continue

        if modeltype == 'iTransformer':
            pattern = f'{modeltype}_results_lxplus/{modeltype}_{dataset}/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_latent{params["latent"]}_{run}/checkpoints/accuracy_list.npy'
        else:
            pattern = f'{model}_results_lxplus/{model}_{dataset}/n_window{params["window"]}_steps{params["steps"]}_feats*_eps{params["eps"]}_{run}/checkpoints/accuracy_list.npy'
        accuracy_list_path = glob.glob(pattern)
        if not accuracy_list_path:
            # previously an opaque IndexError on accuracy_list_path[0]
            print(f'No accuracy_list.npy found for {model} on {dataset} (run {run})')
            continue
        accuracy_list = np.load(accuracy_list_path[0])
        # each entry is indexed as (train loss, validation loss)
        train_losses = [entry[0] for entry in accuracy_list]
        valid_losses = [entry[1] for entry in accuracy_list]

        best_model = torch.load(f'{accuracy_list_path[0].replace("accuracy_list.npy", "model_best.ckpt")}')
        best_epoch = best_model['epoch'] + 1 # because numbering began at 0, but in plots we want to start at 1

        ax.plot(np.arange(1, len(mcc_scores)+1), mcc_scores, 'r-o', label=labell)
        ax.axvline(x=best_epoch, label=f'Best epoch {best_epoch}', color='k', linestyle=':')

        ax2 = ax.twinx()
        ax2.plot(np.arange(1, len(train_losses)+1), train_losses, '--', label='Train MSE', linewidth=3)
        ax2.plot(np.arange(1, len(valid_losses)+1), valid_losses, '--', label='Valid MSE', linewidth=3)
        if metric == 'f1':
            ax.set_ylim(top=1, bottom=0)
            ax.set_ylabel('F1 score')
        else:
            ax.set_ylabel(f'{metric}')
            # ax.set_ylim(top=1, bottom=0.4)
        # ax2.set_ylim(0, 0.1)
        # this axis shows the training and validation losses, not a test loss
        ax2.set_ylabel('MSE loss')
        ax.legend(loc='upper right')
        ax2.legend(loc='center right')
        # ax2.legend(bbox_to_anchor=(1, 0.9), loc='upper right')
        ax.set_title(f'Valid fold {run}')
        ax.grid(True)
        ax.set_xlabel('Epochs')
    # only five of the six panels are used
    axs[-1].remove()

    fig.suptitle(f'{dataset} - {config[model]["lab"]}')
    if metric == 'ROC/AUC':
        metric = 'rocauc'  # '/' is not usable inside a file name
    plt.savefig(f'./studies_results_lxplus/mcc_vs_mse/{dataset}_{model}_{metric}2.png', facecolor='w')
    # plt.show()
    # plt.show()


In [None]:
# Produce the score-vs-loss figure for every dataset/model combination.
modeltype = 'iTransformer'  #  'iTransformer'
datasets = ['GECCO', 'IEEECIS_new2.2', 'UCR', 'SWaT_1D', 'SMAP_new', 'MSL_new', 'SMD'] # 'MSL_new',
models =  ['iTransformer1', 'iTransformer2', 'iTransformer3', 'iTransformer4']  # 'TranAD' ['iTransformer1'


for dataset in datasets:
    for model in models:
        # data = get_data(datasets, models, modeltype)
        # load only the combination that is plotted in this iteration
        data = get_data([dataset], [model], modeltype)
        # print(data.keys())
        # plot_mcc_test_loss(data, dataset, model, 'MCC')
        plot_mcc_test_loss2(data, dataset, model, modeltype, 'MCC')
    #     break
    # break

In [16]:
%matplotlib inline

## test box plots

In [None]:
# Collect the summary rows of every run, grouped by mode, for the box plots.
datasets = ['IEEECIS_new2.2', 'UCR', 'SWaT_1D', 'GECCO'] # ['IEEECIS_new2.2', 'SMAP_new', 'MSL_new', 'UCR', 'SMD', 'SWaT_1D', 'GECCO', 'SMD']  #, 'ATLAS_TS']
models = ['iTransformer1', 'iTransformer2', 'iTransformer3'] #, 'TranAD',
        #   'OmniAnomaly', 'MAD_GAN', 'LSTM_AE', 'DAGMM', 'USAD', 'IF', 'None']

all_paths = []
results_all = {}  # key -> {mode: DataFrame with one row per run}
for dataset in datasets:
    for model in models:
        # Numbered iTransformer variants share one results directory.
        is_itransformer_variant = 'iTransformer' in model and model != 'iTransformer'
        if is_itransformer_variant:
            paths = glob.glob(f'iTransformer_results_lxplus/iTransformer_{dataset}')
        else:
            paths = glob.glob(f'{model}_results_lxplus/{model}_{dataset}')
        all_paths.extend(paths)
        if not paths:
            print(f'No paths found for {model} on {dataset}')
        feats = 30 if dataset == 'IEEECIS_new2.2' else -1

        for path in paths:
            if model in config:
                params = config[model]
                if model == 'TranAD':
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}*feats{feats}*/results/res.csv"
                else:
                    # 'weighted' is optional: entries without it are treated as not weighted
                    # (a plain params['weighted'] raises KeyError for the current config).
                    weighted_tag = '_weighted' if params.get('weighted', False) else ''
                    pattern = f"{path}/*n_window{params['window']}_steps{params['steps']}_feats{feats}_eps{params['eps']}_latent{params['latent']}{weighted_tag}*/results/res.csv"
            elif model in ('None', 'IF'):
                # these two baselines have no window/steps in their directory names
                pattern = f'{path}/*feats{feats}*/results/res.csv'
            else:
                pattern = f'{path}/*n_window10_steps1*feats{feats}*/results/res.csv'
            res_path = glob.glob(pattern)

            if res_path:
                res_path = np.sort(res_path)
                print(model, dataset, len(res_path))
                # print(res_path)
                # keep the last three rows of every file and concatenate once
                tmp = pd.concat([pd.read_csv(p).iloc[-3:] for p in res_path])

                key = path.split('/')[1]
                if is_itransformer_variant:
                    # insert the variant number right after 'iTransformer' to tell the variants apart
                    idx = len('iTransformer')
                    key = key[:idx] + model[idx:] + key[idx:]

                # One frame per mode; 'Unnamed: 0' is the unnamed first CSV column holding the mode name.
                # Results are stored for every model, not only for the iTransformer variants.
                results_all[key] = {
                    mode: group.drop(columns='Unnamed: 0')
                    for mode, group in tmp.groupby('Unnamed: 0')
                }
            else:
                # this message belongs to the empty-glob case
                print(f'No results found for {model} on {dataset}')

            # only the first matching directory is evaluated
            break

# print(len(all_paths))
print(results_all.keys())

In [None]:
# Display the 'global' rows collected for iTransformer1 on GECCO.
results_all['iTransformer1_GECCO']['global']


In [294]:
def plot_boxplots(results_all, datasets, models, metric='MCC'):
    """Save, per model, box plots of `metric` over the runs: one subplot per mode, one box per dataset.

    Parameters
    ----------
    results_all : dict
        Maps '<model>_<dataset>' to ``{mode: DataFrame}`` with one row per run.
    datasets, models : list of str
        Datasets become the boxes; one figure is produced per model (`model` is also a key
        into `config` for the title).
    metric : str
        Column to plot.

    Datasets without an entry for a model are left out of that model's figure; a model
    without any entry is reported and skipped. Figures are written to
    ``./studies_results_lxplus/boxplots_{model}_{metric}.png`` and closed.
    """
    # one fixed colour per dataset, independent of which datasets are available
    colors = plt.cm.plasma(np.linspace(0, 1, len(datasets) + 1))
    # '/' is not usable inside a file name
    metric_tag = 'rocauc' if metric == 'ROC/AUC' else metric

    for model in models:
        # (position in `datasets`, dataset name) for every dataset that has results
        available = [(pos, dataset) for pos, dataset in enumerate(datasets) if f'{model}_{dataset}' in results_all]
        if not available:
            print(f'No results found for {model}')
            continue
        present = [dataset for _, dataset in available]

        modes = list(results_all[f'{model}_{present[0]}'].keys())
        fig, axs = plt.subplots(len(modes), 1, figsize=(2 * len(datasets), 3 * len(modes)),
                                constrained_layout=True, sharex=True, sharey=True)
        axs = np.atleast_1d(axs)  # a single subplot is returned as a bare Axes

        for i, mode in enumerate(modes):
            data_box = [results_all[f'{model}_{dataset}'][mode][metric] for dataset in present]

            # labels must match the plotted boxes one-to-one
            box = axs[i].boxplot(data_box, labels=present, patch_artist=True)

            # Change the colors of the boxplots
            for patch, (pos, _) in zip(box['boxes'], available):
                patch.set_facecolor(colors[pos])
            axs[i].text(0.98, 0.1, f'{mode}', horizontalalignment='right', verticalalignment='bottom', transform=axs[i].transAxes, fontsize=20)
            axs[i].set_ylabel(metric)
            if metric == 'MCC':
                axs[i].set_ylim(-1, 1)
            else:
                axs[i].set_ylim(top=1.0)

        axs[0].set_title(f'{config[model]["lab"]}')
        fig.savefig(f'./studies_results_lxplus/boxplots_{model}_{metric_tag}.png', facecolor='w')
        # plt.show()
        plt.close(fig)

In [295]:
# Usage: write one box-plot figure per entry of `models`; figures are saved, not shown.
plot_boxplots(results_all, datasets, models, metric='MCC')