In [3]:
import json
import logging
import sys
import os
import random
from itertools import product
from multiprocessing import Pool

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from math import floor, ceil
from torch.nn.functional import conv1d
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import matplotlib
from torch.utils.tensorboard import SummaryWriter

sys.path.append("..")
from fit_sup_utils import *
from networks import Network, VariableNet

from tqdm import tqdm


# Select the non-interactive Agg backend before any figure is created.
matplotlib.use("Agg")

# Global figure defaults, applied in one call (same keys/values as before).
plt.rcParams.update(
    {
        "figure.figsize": (7, 5),  # matplotlib default is (6.4, 4.8)
        "text.usetex": True,  # render text through LaTeX
        "figure.dpi": 140,  # matplotlib default is 100
        "font.family": "serif",
    }
)
# plt.rcParams["text.latex.preamble"] = [r"\usepackage{amsmath}"]

# The style sheet is applied after the rcParams above, as in the original order.
plt.style.use("ggplot")
title_font_size = "10"

%load_ext autoreload
%autoreload 2



# Globals

In [4]:
# NOTE(review): THRESH is not read by any cell visible in this notebook —
# its meaning should be confirmed against fit_sup_utils / other notebooks.
THRESH = 1.1

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# On GPU, make newly created float tensors default to CUDA tensors.
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

# Notes

## What affects overfitting here ?
* Optimizer - Adam fits faster and more than SGD
* Learning rate - Lower (0.001) seems better than default (1e-2)
* Batch size - Lower batch size seems to lead to more overfitting. Larger ones seem to average out extremes in the input dataset during backprop
* Net width - Obviously
* Hidden layers - Obviously

## What helps fitting "outliers" ?
Fitting is meant in a broad sense here and just means "Not predicting the mean"

* Lower LR (?) 0.001 is better than 0.01
* Label dist. smoothing, but seems to affect validation perf for the "common" cluster (not outlier)
* Quantile loss with a quantile of 0.75 seems to help over-estimate values in a useful way (i.e. the low-risk cluster is over-estimated to a lesser extent than the high-risk cluster, which could be good for the bandit algorithm). HOWEVER, fitting a quantile other than 0.5 feels like it could interfere with NeuralTS: the output of the NN usually goes into a Normal distribution as the mean parameter, and here we would be predicting a quantile, not a mean.
* Label dist smoothing with a bigger batch size seems to help 
* More data. You need to have at least a couple of high risk observations in order to get a good idea of what makes a high risk combination
* for 1000 rx, small batches, sqrt_inv LDS, smaller models seems to have less bias in validation
* Low dim, MSE is sufficient, high dim, quantile seems better in early 
* 3 layers of 128 width seem to be sufficient to overfit in training
* An lr of 0.001 seems to help that overfitting (nice); an lr of 0.01 seems to fail to overfit in some cases (datasets 100 and 50)
* mse is better than rmse
* If doing LDS, sqrt_inv is better than just True
* For low dim data: MSE seems equal or better than quantile in more situations (learning rate for example)
* Use a validation set that is weighted more evenly (e.g. bins and extrema)

* **LDS WORKS, IMPORTANT, USE IT**
* **EXTREMA VALIDATION IS BETTER THAN BINS**
* **WITH VAL EXTREMA, MSE PERFORMS SIMILARLY TO QUANTILE, BUT IS SIMPLER**
* **IF WE USE WEIGHT DECAY THAT DECREASES WHEN EPOCHS GO UP, FOCUS ON TRAINING LOSS, WE CAN GET VERY GOOD RESULTS**

# Actual training

In [30]:
# Load the tuner results; hyper-parameter columns are those whose name contains "config".
df = pd.read_csv("tuner_top10.csv")
config_cols = [name for name in df.columns if "config" in name]

# Map each hyper-parameter column to the part of its name after the first "/".
mapper = {name: name.split("/")[1] for name in config_cols}

# Keep only the hyper-parameter columns, rename them, and turn NaN into None.
config_df = df[config_cols].rename(columns=mapper).replace({np.nan: None})

In [31]:
# Display the hyper-parameter columns of the tuner results next to their test loss.
df[config_cols + ["test_loss"]]

Unnamed: 0,config/batch_norm,config/batch_size,config/dataset,config/decay,config/hidden,config/lds,config/lr,config/n_obs,config/optim,config/patience,config/validation,config/width,test_loss
0,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,10,,128,0.0049
1,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,50,,128,0.0049
2,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,25,,128,0.0049
3,True,1024,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,10,,128,0.005309
4,True,512,500_rx_100000_combis_10_patterns_23,0.001,1,,plateau,20000,adam,50,,128,0.00535
5,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,10,,128,0.005436
6,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,25,,128,0.005436
7,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,50,,128,0.005436
8,True,256,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,10,,128,0.005438
9,True,256,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,25,,128,0.005438


In [32]:
# Display the hyper-parameter table (renamed columns, NaN replaced by None).
config_df

Unnamed: 0,batch_norm,batch_size,dataset,decay,hidden,lds,lr,n_obs,optim,patience,validation,width
0,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,10,,128
1,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,50,,128
2,True,1024,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,25,,128
3,True,1024,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,10,,128
4,True,512,500_rx_100000_combis_10_patterns_23,0.001,1,,plateau,20000,adam,50,,128
5,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,10,,128
6,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,25,,128
7,True,512,500_rx_100000_combis_10_patterns_23,0.0001,1,,0.1,20000,adam,50,,128
8,True,256,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,10,,128
9,True,256,500_rx_100000_combis_10_patterns_23,1e-05,1,,0.1,20000,adam,25,,128


In [33]:
# One config dict per tuner row, completed with the keys that run_config reads
# but the tuner table does not provide (all set to None).
configs = [
    {
        **row.to_dict(),
        "custom_layers": None,
        "dropout_rate": None,
        "classif_thresh": None,
    }
    for _, row in config_df.iterrows()
]


In [42]:
# Number of training epochs per run and the seeds each configuration is repeated with.
n_epochs = 100
seeds = [0, 1, 2]  # alternative: list(range(25))

# Hyper-parameter grid: every key maps to the list of values to explore.
# Key order matters: it is the order of the "key=value" components of the run path.
# Trailing "sweep:" comments record the alternative value lists that were explored.
param_values = dict(
    # other datasets: "50_rx_100000_combis_4_patterns_3",
    #                 "100_rx_100000_combis_10_patterns_35"
    dataset=["500_rx_100000_combis_10_patterns_23"],
    width=[128],
    hidden=[1],
    n_obs=[20000],  # sweep: [100, 1000, 10000, 20000]
    decay=[0.0001],  # sweep: [0.00001, 0.0001, 0.001, 0.01, 0.1, "epoch"]
    lr=["plateau"],  # sweep: ["plateau", 0.001, 0.01, 0.1]
    custom_layers=[None],
    lds=[True],  # sweep: [True, None]
    batch_size=[128],  # sweep: [32, 64, 128, 256, 512, 1024]
    dropout_rate=[None],
    loss=[["mse"]],
    classif_thresh=[None],
    batch_norm=[False],  # sweep: [False, True]
    patience=[50],  # sweep: [100, 50, 25, 10]
    validation=[None],  # sweep: [None, "bins", "extrema"]
    optim=["adam"],  # sweep: ["adam", "sgd"]
    modif=["first_tune_res"],
)

# METHODOLOGY: take a model that works well and decompose it.

# TODO
# Decay = 0.1, 0.01, 0.001, 0.0001 DONE
# lr = 0.001, 0.01, 0.1, "plateau", optim = "adam", "sgd" with the lrs DONE
# lds = True, False DONE
# batch_size = 32, 64, 128, 256, 512, 1024 DONE
# batch_norm = False, True  DONE
# early_stop = patience = 100, 50, 25, 10 validation = None, "bins", "extrema" DONE
# n_obs = 100, 1000, 10000, 20000 DONE

# Cartesian product of the grid: one dict per combination of values.
configs = [
    {key: value for key, value in zip(param_values.keys(), combo)}
    for combo in product(*param_values.values())
]
print(len(configs))


1


In [39]:
def run_config(
    config,
    exp_dir="memoire",
    dataset_path="/home/quo/Documents/Maitrise/optimneuralbandits/testing/datasets",
):
    """Train and evaluate one hyper-parameter configuration for every seed.

    For each seed in the module-level ``seeds`` list, a fresh network is
    trained for ``n_epochs`` epochs (module-level global) with an MSE
    criterion. After every epoch the train / validation / test losses and R2
    scores are logged to TensorBoard under ``runs/<exp_dir>/seed=<seed>``.
    Prediction-vs-ground-truth figures are logged for three snapshots: the
    epoch at which early stopping first triggers, the epoch of minimum
    validation loss and the epoch of minimum training loss. Training is NOT
    interrupted when early stopping triggers; only the snapshot is recorded.
    Per-epoch metrics of all seeds are finally written with ``save_metrics``
    under ``metrics/<exp_dir>/``.

    Helpers such as ``setup_data``, ``make_deterministic``,
    ``EarlyStoppingActiv``, ``get_losses_and_activ``, ``update_minimums``,
    ``plot_pred_vs_gt`` and ``save_metrics`` are not defined in this notebook;
    they are presumably provided by the ``fit_sup_utils`` star import.

    Args:
        config: Mapping that must contain the keys ``"hidden"``, ``"width"``,
            ``"n_obs"``, ``"decay"``, ``"dataset"``, ``"lr"``,
            ``"custom_layers"``, ``"lds"``, ``"batch_size"``,
            ``"dropout_rate"``, ``"classif_thresh"``, ``"batch_norm"``,
            ``"patience"``, ``"validation"`` and ``"optim"``. Every
            ``key=value`` pair of ``config`` (including keys that are not read
            here, such as ``"loss"`` or ``"modif"``) becomes one directory
            level of the run path, in iteration order.
        exp_dir: Prefix of the run path. ``None`` is treated like ``""``.
        dataset_path: Directory forwarded to ``setup_data`` as
            ``dataset_path``. Defaults to the location previously hard-coded
            in this function.

    Returns:
        None. Results are written to TensorBoard logs and metric files.
    """
    n_layers = config["hidden"]
    width = config["width"]
    n_obs = config["n_obs"]
    decay = config["decay"]
    dataset = config["dataset"]
    lr = config["lr"]
    custom_layers = config["custom_layers"]
    lds = config["lds"]
    batch_size = config["batch_size"]
    dropout_rate = config["dropout_rate"]
    classif_thresh = config["classif_thresh"]
    batch_norm = config["batch_norm"]
    patience = config["patience"]
    validation = config["validation"]
    optim_name = config["optim"]
    noval = validation is None

    n_outputs = 1
    pred_idx = 0

    # NOTE(review): config["loss"] is never read; the criterion is always MSE.
    criterion = nn.MSELoss()

    # One entry (a list of per-epoch values) per seed.
    train_losses = []
    test_losses = []
    val_losses = []
    train_r2s = []
    val_r2s = []
    test_r2s = []

    # Bug fix: the original wrote `exp_dir == ""` (a comparison whose result
    # was discarded), so exp_dir=None crashed on the concatenation below.
    if exp_dir is None:
        exp_dir = ""

    # The run path encodes the whole configuration: prefix/k1=v1/k2=v2/...
    exp_dir += "/" + "/".join(f"{k}={v}" for k, v in config.items())

    # Train once per seed in the global `seeds` list
    for seed in seeds:
        logdir = f"runs/{exp_dir}/{seed=}"
        writer = SummaryWriter(log_dir=logdir)
        min_val_loss = float("inf")
        min_train_loss = float("inf")
        # Activations captured at the epoch of minimum validation loss ...
        val_activ_min_loss = None
        train_activ_min_loss = None
        test_activ_min_loss = None
        # ... and at the epoch of minimum training loss.
        val_activ_mintrain_loss = None
        train_activ_mintrain_loss = None
        test_activ_mintrain_loss = None

        mintrain_epoch = 0
        minval_epoch = 0

        # Pre-filled with NaN so that epochs without a value stay NaN
        # (the test series are only written when a test set exists).
        seed_train_losses = [np.nan] * n_epochs
        seed_val_losses = [np.nan] * n_epochs
        seed_test_losses = [np.nan] * n_epochs
        seed_train_r2s = [np.nan] * n_epochs
        seed_val_r2s = [np.nan] * n_epochs
        seed_test_r2s = [np.nan] * n_epochs
        early_stopping = EarlyStoppingActiv(patience=patience)

        make_deterministic(seed=seed)

        trainloader, training_data, X_val, y_val, n_dim, X_test, y_test = setup_data(
            dataset,
            batch_size,
            n_obs,
            lds,
            classif_thresh,
            validation,
            dataset_path=dataset_path,
        )

        X_train, y_train = training_data.combis, training_data.labels

        # Both network kinds are moved to `device` (the original only moved
        # `Network`; `.to(device)` is a no-op when already on that device).
        if custom_layers is not None:
            net = VariableNet(n_dim, custom_layers).to(device)
        else:
            net = Network(
                n_dim, n_layers, n_outputs, width, dropout_rate, batch_norm
            ).to(device)

        # With decay == "epoch" the weight decay is overwritten at the start of
        # every epoch (see the training loop); 1 is only the initial value.
        if decay == "epoch":
            decay_val = 1
        else:
            decay_val = decay

        # lr == "plateau": start at 0.01 and let ReduceLROnPlateau lower it;
        # otherwise use the fixed learning rate given in the config.
        use_plateau = lr == "plateau"
        base_lr = 0.01 if use_plateau else float(lr)
        # Any optimizer name other than "adam" selects SGD, as before.
        optim_cls = torch.optim.Adam if optim_name == "adam" else torch.optim.SGD
        optim = optim_cls(net.parameters(), lr=base_lr, weight_decay=decay_val)
        sched = (
            ReduceLROnPlateau(optim, "min", patience=patience // 2)
            if use_plateau
            else None
        )

        ### RECORD MODEL ###
        writer.add_graph(net, X_train)

        for e in range(n_epochs):
            if decay == "epoch":
                # Weight decay shrinks as 1, 1/2, 1/3, ... over the epochs.
                optim.param_groups[0]["weight_decay"] = 1 / (e + 1)

            ### TRAIN ###
            for X, y in trainloader:
                optim.zero_grad()
                train_activ = net(X)
                train_loss = criterion(train_activ, y)
                train_loss.backward()
                optim.step()

            ### EVAL ###
            with torch.no_grad():
                net.eval()
                # Compute losses
                (
                    train_activ,
                    train_loss,
                    val_activ,
                    val_loss,
                    test_activ,
                    test_loss,
                ) = get_losses_and_activ(
                    net,
                    criterion,
                    X_train,
                    y_train,
                    X_val,
                    y_val,
                    X_test,
                    y_test,
                )
                net.train()

                # Get R2 metric
                train_r2 = r2_score(
                    y_train.cpu().numpy(), train_activ.cpu().numpy()[:, pred_idx]
                )
                val_r2 = r2_score(
                    y_val.cpu().numpy(), val_activ.cpu().numpy()[:, pred_idx]
                )

                # Save
                seed_train_losses[e] = train_loss
                seed_val_losses[e] = val_loss
                seed_train_r2s[e] = train_r2
                seed_val_r2s[e] = val_r2

                writer.add_scalar("Loss/train", train_loss, e)
                writer.add_scalar("Loss/val", val_loss, e)
                writer.add_scalar("R2/train", train_r2, e)
                writer.add_scalar("R2/val", val_r2, e)

                # Test metrics are optional: only computed when a test set exists.
                if X_test is not None and y_test is not None:
                    test_r2 = r2_score(
                        y_test.cpu().numpy(), test_activ.cpu().numpy()[:, pred_idx]
                    )
                    seed_test_losses[e] = test_loss
                    seed_test_r2s[e] = test_r2
                    writer.add_scalar("Loss/test", test_loss, e)
                    writer.add_scalar("R2/test", test_r2, e)

                # Update LR scheduler
                if sched is not None:
                    sched.step(val_loss)

                # Refresh the best-so-far losses, their epochs and the
                # activations captured at those epochs.
                (
                    train_activ_min_loss,
                    val_activ_min_loss,
                    test_activ_min_loss,
                    min_val_loss,
                    train_activ_mintrain_loss,
                    val_activ_mintrain_loss,
                    test_activ_mintrain_loss,
                    min_train_loss,
                    mintrain_epoch,
                    minval_epoch,
                ) = update_minimums(
                    train_loss,
                    min_train_loss,
                    train_activ,
                    val_loss,
                    min_val_loss,
                    val_activ,
                    test_loss,
                    test_activ,
                    pred_idx,
                    val_activ_min_loss,
                    train_activ_min_loss,
                    test_activ_min_loss,
                    val_activ_mintrain_loss,
                    train_activ_mintrain_loss,
                    test_activ_mintrain_loss,
                    e,
                    mintrain_epoch,
                    minval_epoch,
                )
            ### VERIFY EARLY STOP ###
            # Only the activations of the FIRST early stop are recorded. The
            # loop deliberately does not break, because the activations at the
            # lowest validation error are still wanted from later epochs.
            if not early_stopping.early_stop:
                early_stopping(val_loss, train_activ, val_activ, test_activ)
                # early_stopping(train_loss, train_activ, val_activ, test_activ)
                if early_stopping.early_stop:
                    ### PLOT EARLY STOP REPRESENTATION ###
                    fig_pgt_es = plot_pred_vs_gt(
                        y_train,
                        early_stopping.train_activ,
                        y_val,
                        early_stopping.val_activ,
                        y_test,
                        early_stopping.test_activ,
                        pred_idx=pred_idx,
                        noval=noval,
                    )
                    writer.add_figure("pred_vs_gt_final", fig_pgt_es)

                    fig_pgt_es = plot_pred_vs_gt(
                        y_train,
                        early_stopping.train_activ,
                        y_val,
                        early_stopping.val_activ,
                        y_test,
                        early_stopping.test_activ,
                        pred_idx=pred_idx,
                        invert=True,
                        noval=noval,
                    )
                    writer.add_figure("zinvert_pred_vs_gt_final", fig_pgt_es)
                    writer.flush()

        ### PLOT PRED VS TRUE FOR THIS SEED  (min val loss) ###
        fig_pgt_minval = plot_pred_vs_gt(
            y_train,
            train_activ_min_loss,
            y_val,
            val_activ_min_loss,
            y_test,
            test_activ_min_loss,
            pred_idx=pred_idx,
            noval=noval,
        )

        writer.add_figure("pred_vs_gt_minval", fig_pgt_minval)

        fig_pgt_minval = plot_pred_vs_gt(
            y_train,
            train_activ_min_loss,
            y_val,
            val_activ_min_loss,
            y_test,
            test_activ_min_loss,
            pred_idx=pred_idx,
            invert=True,
            noval=noval,
        )

        writer.add_figure("zinvert_pred_vs_gt_minval", fig_pgt_minval)

        ### PLOT PRED VS TRUE FOR THIS SEED  (min train loss) ###
        fig_pgt_mintrain = plot_pred_vs_gt(
            y_train,
            train_activ_mintrain_loss,
            y_val,
            val_activ_mintrain_loss,
            y_test,
            test_activ_mintrain_loss,
            pred_idx=pred_idx,
            noval=noval,
        )

        writer.add_figure("pred_vs_gt_mintrain", fig_pgt_mintrain)

        fig_pgt_mintrain = plot_pred_vs_gt(
            y_train,
            train_activ_mintrain_loss,
            y_val,
            val_activ_mintrain_loss,
            y_test,
            test_activ_mintrain_loss,
            pred_idx=pred_idx,
            invert=True,
            noval=noval,
        )

        writer.add_figure("zinvert_pred_vs_gt_mintrain", fig_pgt_mintrain)

        ###

        # Early stopping never triggered: use the last epoch's activations for
        # the "final" figure instead of an early-stop snapshot.
        if not early_stopping.early_stop:
            train_activ = train_activ.cpu().numpy()
            val_activ = val_activ.cpu().numpy()
            if X_test is not None and y_test is not None:
                test_activ = test_activ.cpu().numpy()

            fig_pgt_final = plot_pred_vs_gt(
                y_train,
                train_activ,
                y_val,
                val_activ,
                y_test,
                test_activ,
                pred_idx=pred_idx,
                noval=noval,
            )

            writer.add_figure("pred_vs_gt_final", fig_pgt_final)

        writer.flush()
        writer.close()
        # Release the figures created for this seed.
        plt.close("all")

        train_losses.append(seed_train_losses)
        val_losses.append(seed_val_losses)
        test_losses.append(seed_test_losses)
        train_r2s.append(seed_train_r2s)
        val_r2s.append(seed_val_r2s)
        test_r2s.append(seed_test_r2s)
    ### PLOT AGGREGATE DATA FOR ALL SEEDS ###
    # (currently disabled)
    # logdir = f"runs/{exp_dir}/aggregate"
    # writer = SummaryWriter(log_dir=logdir)

    # ### PLOTS ###
    # train_losses = np.array(train_losses)
    # val_losses = np.array(val_losses)
    # test_losses = np.array(test_losses)
    # fig_loss =  plot_metric(n_epochs, train_losses, val_losses, test_losses)

    # train_r2s = np.array(train_r2s)
    # val_r2s = np.array(val_r2s)
    # fig_r2 = plot_metric(n_epochs, train_r2s, val_r2s, test_r2s)

    # writer.add_figure("losses", fig_loss)
    # writer.add_figure("r2s", fig_r2)

    # writer.flush()
    # writer.close()
    plt.close("all")

    # Persist the per-seed, per-epoch metrics next to the run path.
    save_metrics(train_losses, f"metrics/{exp_dir}/train_losses")
    save_metrics(val_losses, f"metrics/{exp_dir}/val_losses")
    save_metrics(test_losses, f"metrics/{exp_dir}/test_losses")
    save_metrics(train_r2s, f"metrics/{exp_dir}/train_r2s")
    save_metrics(val_r2s, f"metrics/{exp_dir}/val_r2s")
    save_metrics(test_r2s, f"metrics/{exp_dir}/test_r2s")

    # print(f"saved at runs/{exp_dir}")


In [44]:
from copy import deepcopy

# Run every configuration currently in `configs`. Iterating over a deep copy
# means edits made to a config inside the loop (see the disabled variants
# below) never modify the original `configs` list.
for config in deepcopy(configs):
    # config["modif"] = "tune_res"
    run_config(config)

# --- Disabled experiment variants, kept for reference ---

# Variant: batch normalisation enabled.
# for config in deepcopy(configs):
#     config["batch_norm"] = True
#     config["modif"] = "with_bn"
#     run_config(config)

# Variant: weight decay set to "epoch".
# for config in deepcopy(configs):
#     config["decay"] = "epoch"
#     config["modif"] = "epoch_decay"
#     run_config(config)

# Variant: "epoch" weight decay combined with batch normalisation.
# for config in deepcopy(configs):
#     config["decay"] = "epoch"
#     config["batch_norm"] = True
#     config["modif"] = "epoch_decay_bn"
#     run_config(config)

# Variant: patience set to 25.
# for config in deepcopy(configs):
#     config["modif"] = "tune_res"
#     config["patience"] = 25
#     run_config(config)

# for config in deepcopy(configs):
#     run_config(config)

# Variant: run the configurations in parallel with 4 worker processes.
# with Pool(4) as p:
#     p.map(run_config, configs)

# Remarque sur le weight decay

Si on passe par le weight decay avec les époques et que l'on n'utilise aucun set de validation, alors LDS = True semble mieux que sqrt_inv, car ça force davantage le modèle à se concentrer sur les observations à faible fréquence. Le learning rate en plateau semble également mieux. Une batch size plus petite semble également meilleure en termes de représentation finale.

### Observations 3 aout 2022
* 1 layer, 128 est suffisant pour overfit 99999 observations. Super.
* On veut utiliser le decay dans notre cas pour régulariser le réseau tôt dans son apprentissage, afin d'éviter d'overfit uniquement le set d'entraînement (moins il y a d'observations, plus le réseau devrait être régularisé dans un bandit, parce que plus le set d'entraînement est petit, plus il risque d'overfit ce set)

In [19]:
# Display the list of configuration dicts currently held in `configs`.
configs

[{'dataset': '500_rx_100000_combis_10_patterns_23',
  'width': 128,
  'hidden': 1,
  'n_obs': 20000,
  'decay': 0,
  'lr': 'plateau',
  'custom_layers': None,
  'lds': True,
  'batch_size': 128,
  'dropout_rate': None,
  'loss': ['mse'],
  'classif_thresh': None,
  'batch_norm': False,
  'patience': 50,
  'validation': None,
  'optim': 'adam',
  'modif': 'lds_nodecay'},
 {'dataset': '500_rx_100000_combis_10_patterns_23',
  'width': 128,
  'hidden': 1,
  'n_obs': 20000,
  'decay': 0,
  'lr': 'plateau',
  'custom_layers': None,
  'lds': None,
  'batch_size': 128,
  'dropout_rate': None,
  'loss': ['mse'],
  'classif_thresh': None,
  'batch_norm': False,
  'patience': 50,
  'validation': None,
  'optim': 'adam',
  'modif': 'lds_nodecay'}]