In [1]:
import os
import sys
import warnings

import math
import random
import pandas as pd
import numpy as np
import glob
import datetime
import time
import datetime
import copy

import matplotlib
import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde

from src.utility.utility_data import *
from src.utility.utility_misc import *

In [2]:
# --- Configuration ------------------------------------------------------------
# Result CSVs (looked up under results/raw/) keyed by the algorithm name used in the tables.
# The part of each filename after the last "___" is parsed further down: its first
# dot-separated field holds the seed range, the following fields name grouping columns.
filenames_nz = {
    "MLP": "MLP_STL_[['nz', 'gloria']]_exps_demo__24.03.06_15.56.10___seeds0-49.num_ensembles.lr.csv",
    "MTL": "MLP_MTL_[['nz', 'gloria']]_exps_demo__24.02.29_15.48.33___seeds0-49.num_ensembles.lr.csv"
}
filenames_gloria = {
    "MLP": "MLP_STL_[['gloria']]_exps_demo__24.03.07_15.14.59___seeds0-49.labels.num_ensembles.lr.csv",
    "MTL": "MLP_MTL_[['gloria']]_exps_demo__24.03.01_10.15.37___seeds0-49.labels.num_ensembles.lr.csv"
}
filenames = filenames_gloria


labels = ["chl", "tss", "cdom"]
metrics = ["rmse", "rmsle", "mape", "mae", "bias", "r2", "r2_intra_group", "slope", "mdsa", "sspb"]
metric_to_tune = "rmsle"          # metric minimised when picking the best configuration
display_task_metrics = False      # when False, columns containing "task" are dropped

use_ensembles = True
display_partitions = ["test"]
display_aggs = ["_p"] # ["_mean", "_se", "_p"]#
sf = 3                            # target number of significant figures for the "_p" columns

verbose = False

print("Warning: Only using {}ensemble models.".format("" if use_ensembles else "NO "))


# --- Helpers ------------------------------------------------------------------
def find_sf(val):
    """Return ceil(log10(|val| - 1e-9)) as an int, or 0 when that is undefined.

    Parameters
    ----------
    val : scalar or one-element array
        Value whose order of magnitude is wanted.

    Returns
    -------
    int
        0 for NaN, infinite, or (near-)zero input; otherwise the rounded-up
        base-10 exponent of the magnitude.

    Raises
    ------
    ValueError
        If `val` holds more than one element.
    """
    # .item() turns a scalar or size-1 array into a Python float explicitly,
    # instead of relying on NumPy's deprecated implicit array-to-scalar conversion.
    shifted = abs(np.asarray(val, dtype=float).item()) - 1e-9
    # log10 is undefined for non-positive input; int() cannot convert NaN/inf.
    if not np.isfinite(shifted) or shifted <= 0:
        return 0
    return int(np.ceil(np.log10(shifted)))


_results_cache = {}


def _load_results(algo):
    """Read the results CSV for `algo` once and tag every row with the algorithm name."""
    if algo not in _results_cache:
        frame = pd.read_csv(r"results/raw/" + filenames[algo])
        frame["algo_name"] = algo
        _results_cache[algo] = frame
    return _results_cache[algo]


# --- Build the comparison table -----------------------------------------------
multi_index = pd.MultiIndex.from_tuples([], names=['label', 'algo_name'])
cmp_df = pd.DataFrame(index=multi_index)
cmp_rows = []  # one single-row frame per (label, algo); concatenated once after the loop

for label in labels:

    for algo in filenames:

        print(f"{label}, {algo}")

        filename = filenames[algo]
        results_df = _load_results(algo)

        # For the neural models, keep either the ensemble runs or the single-model runs.
        if any(s in algo for s in ["MLP", "MTL", "MDN"]):
            if use_ensembles:
                results_df = results_df[results_df["num_ensembles"] > 1]
            else:
                results_df = results_df[results_df["num_ensembles"] == 1]

        label_df = results_df[results_df['labels'].str.contains(label)].copy()
        if len(label_df.index) == 0:
            print("SKIPPING: Maybe no data for this label.")
            continue

        # Hyperparameters of interest
        cols_sens = ["algo_name"]#"labels"]
        cols_group_by = ["labels", "algo_name"] + filename.split("___")[-1].split(".")[1:-1]
        # dict.fromkeys de-duplicates while keeping order: some filenames list
        # "labels" again in their suffix, which would otherwise repeat the key.
        cols_group_by = [c for c in dict.fromkeys(cols_group_by) if c != ""]

        cols_tune = [col for col in cols_group_by if col not in cols_sens]

        # Metrics of interest
        cols_to_agg = [col for col in label_df.columns
                           if any(col.startswith(f"{metric}_{label}_") for metric in metrics)]
        if not display_task_metrics:
            cols_to_agg = [col for col in cols_to_agg if not "task" in col]

        # Seed range encoded in the filename, e.g. "seeds0-49" -> 50 seeds.
        seeds = filename.split("___")[-1].split(".")[0][5:].split("-")
        num_seeds = int(seeds[1]) - int(seeds[0]) + 1

        # Aggregate metric columns: per-group mean and spread, broadcast back to each row.
        grouped_metrics = label_df.groupby(cols_group_by)[cols_to_agg]
        group_means = grouped_metrics.transform("mean").add_suffix("_mean")
        # NOTE(review): pandas' "std" is the sample standard deviation (ddof=1), for which
        # the usual standard error is std / sqrt(n). The sqrt(num_seeds - 1) denominator is
        # kept unchanged so previously reported numbers stay reproducible; confirm intent.
        group_ses = (grouped_metrics.transform("std") / np.sqrt(num_seeds - 1)).add_suffix("_se")
        label_df = pd.concat([label_df, group_means, group_ses], axis=1)
        # numeric_only=True: non-numeric columns cannot be averaged and raise in pandas >= 2.0.
        label_df = label_df.groupby(cols_group_by).mean(numeric_only=True)

        # Find best results on tune variables
        partition_tune_by = "vali" if any("vali" in c for c in cols_to_agg) else "train"
        var_tune_by = "{}_{}_{}_mean".format(metric_to_tune, label, partition_tune_by)
        optimal_rows = label_df.groupby(cols_sens)[var_tune_by].idxmin()
        # Copy so the columns added below never write into a view of label_df.
        optimal_df = label_df.loc[optimal_rows,:].copy()

        # Round to appropriate SF and then write new column with mean ± format for simplicity
        formatted_cols = {}
        for c in cols_to_agg:
            # Precision is derived from the first optimal row's mean.
            sf_c = sf - find_sf(optimal_df[c + "_mean"].iloc[0])
            if "bias" in c: sf_c = 2
            if "r2" in c: sf_c = 3
            sf_c = min(max(sf_c, 0), 3)  # clamp the number of decimals to [0, 3]
            formatted_cols[c + "_p"] = [
                "{:.{precision}f} ± {:.{precision}f}".format(mean_val, se_val, precision=sf_c)
                for mean_val, se_val in zip(optimal_df[c + "_mean"], optimal_df[c + "_se"])
            ]
        # Attach all formatted columns in one go rather than inserting them one by one.
        optimal_df = pd.concat(
            [optimal_df, pd.DataFrame(formatted_cols, index=optimal_df.index)], axis=1
        )

        ### Display results ###
        cols_display = [col for col in optimal_df.columns
                            if (any(col.startswith(f"{metric}_{label}_{partition}_") for metric in metrics for partition in display_partitions)
                                and any(col.endswith(f"{agg}") for agg in display_aggs))]
        if not display_task_metrics:
            cols_display = [col for col in cols_display if not "task" in col]

        ### Store results ###
        if verbose:
            cols_display_label_df = [col for col in optimal_df.columns
                            if (any(col.startswith(f"{metric}_{label}_{partition}_") for metric in ["rmsle", "mape"] for partition in display_partitions)
                                and any(col.endswith(f"{agg}") for agg in ["mean", "se"]))]
            display_full(label_df.sort_values(var_tune_by, ascending=True)[cols_display_label_df])
            display_full(optimal_df[cols_display])

        opt_row = optimal_df[cols_display].iloc[0]

        # DataFrame.append was removed in pandas 2.0, so wrap the row in a one-row frame.
        # The row's name is its group key, which starts with (labels, algo_name); those
        # two entries are the levels of cmp_df's index.
        row_key = tuple(opt_row.name)[:2]
        row_frame = pd.DataFrame(
            [opt_row.to_numpy()],
            columns=opt_row.index,
            index=pd.MultiIndex.from_tuples([row_key], names=multi_index.names),
        ).infer_objects()
        cmp_rows.append(row_frame)

# Build the comparison table once; it stays the empty frame if every pair was skipped.
if cmp_rows:
    cmp_df = pd.concat(cmp_rows)
        
        

chl, MLP
chl, MTL


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


tss, MLP
tss, MTL


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


cdom, MLP
cdom, MTL


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Column-name templates (filled in with the label) mapped to the header shown in the table.
print_cols = {
    "rmsle_{}_test_p": "RMSE", 
    "rmse_{}_test_p": "Real RMSE", 
    "mae_{}_test_p": "MAE", 
    "mape_{}_test_p": "MAPE", 
    "bias_{}_test_p": "Bias", 
    "r2_{}_test_p": "R^2",
    "r2_intra_group_{}_test_p":"R^2_group"
}

for label in ["chl", "tss", "cdom"]:
    # Resolve every template for this label, remembering which header it maps to.
    header_by_column = {
        template.format(label): header for template, header in print_cols.items()
    }
    print_cols_label = list(header_by_column)

    # Keep the rows whose index entry mentions the label when rendered as text.
    print_rows = [row_key for row_key in cmp_df.index if label in str(row_key)]

    # Select the label's columns and swap their names for the display headers.
    print_df = cmp_df.loc[print_rows, print_cols_label].rename(columns=header_by_column)
    display_full(print_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,Real RMSE,MAE,MAPE,Bias,R^2,R^2_group
label,algo_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
['chl'],MLP,0.213 ± 0.001,330 ± 12,1.38 ± 0.00,21.5 ± 0.1,1.00 ± 0.00,0.913 ± 0.001,0.705 ± 0.003
['chl'],MTL,0.199 ± 0.001,204 ± 11,1.33 ± 0.00,17.9 ± 0.1,1.00 ± 0.00,0.924 ± 0.001,0.741 ± 0.003


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,Real RMSE,MAE,MAPE,Bias,R^2,R^2_group
label,algo_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
['tss'],MLP,0.332 ± 0.001,89.3 ± 2.2,1.69 ± 0.00,34.2 ± 0.2,0.98 ± 0.01,0.632 ± 0.002,0.025 ± 0.006
['tss'],MTL,0.263 ± 0.001,88.3 ± 2.6,1.47 ± 0.00,22.5 ± 0.1,1.00 ± 0.00,0.769 ± 0.003,0.388 ± 0.005


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,Real RMSE,MAE,MAPE,Bias,R^2,R^2_group
label,algo_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
['cdom'],MLP,0.277 ± 0.001,1.44 ± 0.02,1.53 ± 0.00,26.8 ± 0.1,0.99 ± 0.01,0.699 ± 0.002,-0.779 ± 0.013
['cdom'],MTL,0.188 ± 0.001,1.12 ± 0.01,1.31 ± 0.00,15.8 ± 0.1,1.00 ± 0.00,0.861 ± 0.001,0.181 ± 0.005
