In [None]:
import os
import datetime

import numpy as np
import pandas as pd

from src.utility.utility_data import *
from src.utility.utility_misc import *
from src.MTL.train_model import *

In [None]:
def save_results(results):
    """Append one run's results to the global ``results_df`` and checkpoint it to CSV.

    The function communicates through notebook globals: it reads ``param_loader``,
    ``params``, ``dataset_names``, ``exp_name`` and ``t0``, and rebinds
    ``results_df``.

    A CSV is written only when every seed seen so far has the same number of rows.
    The file name encodes the lowest and highest seed present in ``results_df``;
    once the new file is written, the file named for ``highest seed - 1`` is
    removed if it exists.

    Note: only the results table is persisted here; no numpy arrays are saved.

    Args:
        results: Mapping (or Series) of column name -> value for a single run.
            Must contain a ``"seed"`` entry.
    """
    global results_df

    # Swept hyperparameters (everything except the seed) are encoded in the file name.
    tune_cols = [col for col in param_loader.iter_param_keys if col not in ["seed"]]

    # Results dataframe: add the new run as one row.
    # DataFrame.append was removed in pandas 2.0; this reproduces its dict handling
    # (dict -> Series -> single-row frame) so list/dict values stay in one cell.
    new_row = pd.Series(results).to_frame().T.infer_objects()
    if results_df.empty:
        # Skip concatenating with an empty frame (deprecated behaviour in recent pandas).
        results_df = new_row.reset_index(drop=True)
    else:
        results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Only checkpoint when all seeds have the same number of entries.
    seed_counts = results_df["seed"].value_counts()
    if not (seed_counts == seed_counts.iloc[0]).all():
        return

    first_seed = int(results_df["seed"].min())
    last_seed = int(results_df["seed"].max())

    out_dir = r"results/raw/"
    name_template = r"{}_{}_{}__{}___seeds{:d}-{:d}.{}.csv"
    name_fields = (params["algo_name"], dataset_names, exp_name, t0.strftime("%y.%m.%d_%H.%M.%S"))
    tuned = ".".join(tune_cols)

    filename_results = name_template.format(*name_fields, first_seed, last_seed, tuned)
    filename_results_old = name_template.format(*name_fields, first_seed, last_seed - 1, tuned)
    print("SAVING...\n{}".format(filename_results))

    # Make sure the output directory exists so the first save does not fail.
    os.makedirs(out_dir, exist_ok=True)

    # Write the new csv before removing the old one, so a failed write never
    # leaves the run without a checkpoint on disk.
    results_df.to_csv(out_dir + filename_results, index=False)

    # Delete the csv left by the previous seed.
    if os.path.exists(out_dir + filename_results_old):
        os.remove(out_dir + filename_results_old)

    print(datetime.datetime.now() - t0)

### New Zealand

In [None]:
# OVERALL
# These names are read as globals by save_results() when it builds the csv file name.
exp_name = "exps_demo"
dataset_names = [["nz", "gloria"]]              # Must be nested list
data_labels = [["chl"]]                   # Must be nested list
algo_name = "MLP_STL"

assert isinstance(data_labels[0], list), "ERROR: data_labels must be a nested list."
assert isinstance(dataset_names[0], list), "ERROR: dataset_names must be a nested list."
assert len(dataset_names) == 1, "ERROR: Dataset names must be length 1: exps on multiple datasets should be performed separately."

# DATA
# Key order is kept as written; presumably param_iterator treats list values as
# sweep axes — TODO confirm against its implementation.
data_params = {
    "datasets": dataset_names,
    "labels": data_labels,
    "invalid_label_action": "drop_if_all",
    "task_selection_criterion": "site",
    "split_method": "test_on_LOO_dataset",
    # The 0th dataset in the list (NZ) is the test dataset, while the other
    # datasets (gloria) are used for transfer learning
    "LOO_dataset": 0,
    "vali_frac": 0 if algo_name == "Naive" else 0.1,
    "test_frac": 0.5,
}

# MODEL
params = {"is_network_model": "MLP" in algo_name or "MDN" in algo_name}
params.update({
    "verbose": False,
    "plot": False,
    "seed": list(range(50)),
    "num_ensembles": [1, 10] if params["is_network_model"] else [1],
    "agg_ensembles": True,
    "algo_name": algo_name,
})

# Settings that only apply to network models (MLP / MDN).
if params["is_network_model"]:
    params.update(
        lr=[10**exponent for exponent in (-3, -3.5, -4)],
        netsize=100,
        vali_epoch_freq=5,
        vali_epoch_delay=20,
        batch_shuffle=True,
        max_epochs=2000,
    )

    if "MDN" in algo_name:
        params.update(num_gaussians=[5])
    if "MTL" in algo_name:
        params.update(
            netsize_lastlayer_small=[False],
            min_site_size=[5],  # , 10, 20, 50]
        )

# Grids for the non-network baselines; only the one matching algo_name is added.
if "RF" in algo_name:
    # NOTE(review): 'auto' is no longer accepted by recent scikit-learn random
    # forests — confirm how train_model forwards rf_max_features.
    params.update(
        rf_max_depth=[None, 5, 10],
        rf_min_samples_split=[5],
        rf_max_features=['auto', 'sqrt'],
    )
if "XGBoost" in algo_name:
    params.update(
        xgb_lr=[0.01, 0.03, 0.1, 0.3],
        xgb_max_depth=[4, 5, 6],
    )
if "SVM" in algo_name:
    params.update(
        svm_kernel=['linear', 'rbf'],
        svm_gamma=[0.1, 1, 'scale'],
        svm_C=[0.1, 1, 10],
    )

# Start time: used by save_results() for the file name and the elapsed-time print.
t0 = datetime.datetime.now()

# Iterate through hyperparameter combinations to find best performing model
results_df = pd.DataFrame()
param_loader = param_iterator(data_params, params)
for i in range(param_loader.num_combinations):
    d_p, p = param_loader.next()

    # Output width: 3 for MDN models, otherwise one output per label.
    if "MDN" in p["algo_name"]:
        out_dim = 3
    else:
        out_dim = len(d_p["labels"])

    if p["is_network_model"]:
        p["batch_size"] = 16 if d_p["datasets"] == ["nz"] else 64

    # arch is a pair of layer-size lists; presumably [shared layers, per-task
    # layers] — TODO confirm against train_model.
    if "STL" in p["algo_name"]:
        p["arch"] = [[p["netsize"]] * 3 + [out_dim], []]
    if "MTL" in p["algo_name"]:
        p["arch"] = [
            [p["netsize"], p["netsize"], 10 if p["netsize_lastlayer_small"] else p["netsize"]],
            [out_dim],
        ]

    MTL = train_model(p, d_p)

    # One results row = the model's metrics plus the data settings of this run.
    results = MTL.metrics.copy()
    for key, value in d_p.items():
        results[key] = value

    # Prediction / ground-truth matrices per label and partition. They are
    # collected here but not passed to save_results(), which receives only `results`.
    mats_to_save_names = [
        f"{kind}_{label_name}_{split}"
        for kind in ["pred", "true"]
        for label_name in d_p["labels"]
        for split in ["train", "vali", "test"]
    ]
    mats_to_save = {name: getattr(MTL, name) for name in mats_to_save_names}

    save_results(results)
    

### GLORIA

In [None]:
# OVERALL
# These names are read as globals by save_results() when it builds the csv file name.
exp_name = "exps_demo"
dataset_names = [["gloria"]]                   # Must be nested list
data_labels = [["chl"], ["tss"], ["cdom"]]     # Must be nested list
algo_name = "MLP_STL"

assert isinstance(data_labels[0], list), "ERROR: data_labels must be a nested list."
assert isinstance(dataset_names[0], list), "ERROR: dataset_names must be a nested list."
assert len(dataset_names) == 1, "ERROR: Dataset names must be length 1: exps on multiple datasets should be performed separately."

# DATA
# Key order is kept as written; presumably param_iterator treats list values as
# sweep axes — TODO confirm against its implementation.
data_params = {
    "datasets": dataset_names,
    "labels": data_labels,
    "invalid_label_action": "drop_if_all",
    "task_selection_criterion": "site",
    "split_method": "random_equal_tasks",
    "vali_frac": 0 if algo_name in ["Naive"] else 0.1,
    "test_frac": 0.5,
    "label_ln_coefs": {"chl": 1, "tss": 1, "cdom": 0.1},
}

# MODEL
params = {"is_network_model": "MLP" in algo_name or "MDN" in algo_name}
params.update({
    "verbose": False,
    "plot": False,
    "seed": [*range(0, 50)],
    "num_ensembles": [1, 10] if params["is_network_model"] else [1],
    "agg_ensembles": True,
    "algo_name": algo_name,
})

# Settings that only apply to network models (MLP / MDN).
if params["is_network_model"]:
    params.update(
        lr=[10**exponent for exponent in [-3, -3.5, -4]],
        netsize=100,
        vali_epoch_freq=5,
        vali_epoch_delay=20,
        batch_shuffle=True,
        max_epochs=2000,
    )

    if "MDN" in algo_name:
        params["num_gaussians"] = [5]
    if "MTL" in algo_name:
        params["netsize_lastlayer_small"] = [False]
        params["min_site_size"] = [5]#, 10, 20, 50]

# Grids for the non-network baselines; only the one matching algo_name is added.
if "RF" in algo_name:
    # NOTE(review): 'auto' is no longer accepted by recent scikit-learn random
    # forests — confirm how train_model forwards rf_max_features.
    params["rf_max_depth"] = [None, 5, 10]
    params["rf_min_samples_split"] = [5]
    params["rf_max_features"] = ['auto', 'sqrt']
if "XGBoost" in algo_name:
    params["xgb_lr"] = [0.01, 0.03, 0.1, 0.3]
    params["xgb_max_depth"] = [4, 5, 6]
if "SVM" in algo_name:
    params["svm_kernel"] = ['linear', 'rbf']
    params["svm_gamma"] = [0.1, 1, 'scale']
    params["svm_C"] = [0.1, 1, 10]

# Start time: used by save_results() for the file name and the elapsed-time print.
t0 = datetime.datetime.now()

# Iterate through hyperparameter combinations to find best performing model
results_df = pd.DataFrame()
param_loader = param_iterator(data_params, params)
for i in range(param_loader.num_combinations):
    d_p, p = param_loader.next()

    # Output width: 3 for MDN models, otherwise one output per label.
    out_dim = 3 if "MDN" in p["algo_name"] else len(d_p["labels"])

    if p["is_network_model"]:
        p["batch_size"] = 16 if d_p["datasets"] == ["nz"] else 64

    # arch is a pair of layer-size lists; presumably [shared layers, per-task
    # layers] — TODO confirm against train_model.
    if "STL" in p["algo_name"]:
        p["arch"] = [[p["netsize"], p["netsize"], p["netsize"], out_dim], []]
    if "MTL" in p["algo_name"]:
        if p["netsize_lastlayer_small"]:
            p["arch"] = [[p["netsize"], p["netsize"], 10], [out_dim]]
        else:
            p["arch"] = [[p["netsize"], p["netsize"], p["netsize"]], [out_dim]]

    MTL = train_model(p, d_p)

    # One results row = the model's metrics plus the data settings of this run.
    results = MTL.metrics.copy()
    for key in d_p:
        results[key] = d_p[key]

    # Prediction / ground-truth matrices per label and partition. They are
    # collected here but not passed to save_results(), which receives only `results`.
    mats_to_save_names = [f"{y}_{label}_{partition}" for y in ["pred", "true"] for label in d_p["labels"]
                                                        for partition in ["train", "vali", "test"]]
    mats_to_save = {m: getattr(MTL, m) for m in mats_to_save_names}

    save_results(results)