In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import json
import os
import copy

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.dates as mdates
from matplotlib.dates import SA

from utils.fitting.util import get_ensemble_params
from scripts.seir.combine_multiple_runs import combine_multiple_runs
from utils.fitting.util import create_output

In [None]:
# Directory holding one sub-directory per run; every later cell reads from it.
# NOTE(review): this is a machine-specific absolute path — point it at your own
# outputs folder before running the notebook elsewhere.
outputs_dir = '/scratche/users/sansiddh/covid-modelling/phparams_2021_0317_102628/comb/'

In [None]:
def create_run_ledger(outputs_dir):
    """Build a ledger of the runs stored under ``outputs_dir``.

    Every sub-directory of ``outputs_dir`` is treated as one run; its
    ``config.json`` is read to obtain the fitting start date and the
    optimiser seed.

    Args:
        outputs_dir: Path of the directory containing one sub-directory per run.

    Returns:
        pandas.DataFrame with columns ``run``, ``start_date`` and ``seed``,
        one row per run directory, indexed 0..n-1 in sorted run-name order.
        The frame has the three columns and no rows when no run directory is
        found.

    Raises:
        FileNotFoundError: if a run directory does not contain ``config.json``.
        KeyError: if a config lacks ``fitting.split.start_date`` or
            ``fitting.optimiser_params.seed``.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the ledger
    # (and anything derived from its row order) reproducible across machines.
    for run in sorted(os.listdir(outputs_dir)):
        run_dir = os.path.join(outputs_dir, run)
        if not os.path.isdir(run_dir):
            # Stray files next to the run folders are not runs.
            continue

        with open(os.path.join(run_dir, 'config.json')) as f:
            config = json.load(f)

        fitting = config['fitting']
        records.append({
            'run': run,
            'start_date': fitting['split']['start_date'],
            'seed': fitting['optimiser_params']['seed'],
        })

    # Build the frame once from the collected records instead of growing it
    # row by row; passing `columns` keeps the schema even when there are no runs.
    return pd.DataFrame(records, columns=['run', 'start_date', 'seed'])

In [None]:
def ensemble_params_df(outputs_dir):
    """Collect the ensemble mean and deviation of every parameter for each run.

    The run ledger from ``create_run_ledger`` is extended with two columns per
    parameter, holding the values returned by ``get_ensemble_params`` for that
    run's ``beta.npy``, ``trials_params.npy`` and ``trials_losses.npy``.

    Args:
        outputs_dir: Path of the directory containing one sub-directory per run.

    Returns:
        pandas.DataFrame with one row per ledger entry. Columns are flattened
        by joining the two header levels with ``'_'``, so the ledger columns
        come out as ``run_``, ``start_date_`` and ``seed_`` and every parameter
        contributes ``<param>_mean`` and ``<param>_std``. ``start_date_`` is
        converted to datetime (parsed as ``%Y-%m-%d``).
    """
    run_ledger = create_run_ledger(outputs_dir)
    df_params_master = copy.deepcopy(run_ledger)
    # Two-level header: ledger columns get an empty second level, parameter
    # columns use 'mean' / 'std' as the second level.
    df_params_master.columns = pd.MultiIndex.from_arrays(
        [df_params_master.columns, [''] * len(df_params_master.columns)])

    # Walk the ledger rows instead of listing the directory a second time: the
    # row label is then known directly, and an entry that is absent from the
    # ledger can no longer trigger an IndexError on lookup.
    for idx, run in run_ledger['run'].items():
        run_dir = os.path.join(outputs_dir, run)

        beta = np.load(os.path.join(run_dir, 'beta.npy'))
        # NOTE: allow_pickle=True unpickles arbitrary objects — only load
        # arrays produced by trusted runs.
        trials_params = np.load(os.path.join(run_dir, 'trials_params.npy'), allow_pickle=True)
        trials_losses = np.load(os.path.join(run_dir, 'trials_losses.npy'), allow_pickle=True)

        em_params, em_params_dev = get_ensemble_params(trials_params, trials_losses, beta, return_dev=True)

        for param in list(em_params.keys()):
            df_params_master.loc[idx, (param, 'mean')] = em_params[param]
            df_params_master.loc[idx, (param, 'std')] = em_params_dev[param]

    df_params_master[('start_date', '')] = pd.to_datetime(df_params_master[('start_date', '')], format='%Y-%m-%d')
    # Flatten ('beta', 'mean') -> 'beta_mean'; ledger columns gain a trailing
    # underscore ('run' -> 'run_') because their second level is empty.
    df_params_master.columns = df_params_master.columns.map('_'.join)

    return df_params_master

In [None]:
# Build the flattened per-run table of ensemble parameter means/stds for every
# run under outputs_dir, and display it as the cell output.
df_params_master = ensemble_params_df(outputs_dir)
df_params_master

In [None]:
# Load the raw trial arrays of a single run (the second directory entry) so the
# next cells can inspect its loss weights and parameter distributions.
#
# df_params_master is deliberately left untouched here: the plotting cells
# further down read the flattened table returned by ensemble_params_df
# ('start_date_', 'seed_', '<param>_mean', ...), and overwriting it with the
# bare ledger makes them fail on a top-to-bottom run.
run_ledger = create_run_ledger(outputs_dir)

for run in os.listdir(outputs_dir)[1:2]:
    beta = np.load(f'{outputs_dir}/{run}/beta.npy')
    # NOTE: allow_pickle=True unpickles arbitrary objects — only load arrays
    # produced by trusted runs.
    trials_params = np.load(f'{outputs_dir}/{run}/trials_params.npy', allow_pickle=True)
    trials_losses = np.load(f'{outputs_dir}/{run}/trials_losses.npy', allow_pickle=True)

In [None]:
# Per-trial weights proportional to exp(-beta * loss), normalised to sum to 1.
# The exponent is shifted by its maximum before exponentiating: the shift
# cancels in the normalisation, but it stops every weight underflowing to 0
# (and the division producing NaN) when beta * loss is large.
log_wt = -beta * np.asarray(trials_losses, dtype=float)
if log_wt.size:
    log_wt = log_wt - np.max(log_wt)
loss_wt = np.exp(log_wt)
loss_wt = loss_wt / np.sum(loss_wt)
loss_wt

In [None]:
# One row per trial: the sampled parameter values plus that trial's loss and
# its normalised loss weight.
df_params = pd.DataFrame.from_dict(trials_params.tolist()).assign(
    loss=trials_losses,
    loss_wt=loss_wt,
)

In [None]:
# Loss-weighted kernel density estimate of every parameter over the loaded
# run's trials, one panel per parameter on a 4x2 grid.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(16, 21))
params = list(trials_params[0].keys())
for panel_idx in range(len(params)):
    sns.kdeplot(
        data=df_params,
        y=params[panel_idx],
        weights='loss_wt',
        ax=axs.flat[panel_idx],
    )

In [None]:
# Ensemble mean of every parameter against the run's start date, coloured by
# optimiser seed; one panel per parameter on a 4x2 grid.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(16, 21))
params = list(trials_params[0].keys())
for panel_idx, name in enumerate(params):
    panel = axs.flat[panel_idx]
    sns.scatterplot(
        data=df_params_master,
        x='start_date_',
        y=f'{name}_mean',
        hue='seed_',
        ax=panel,
    )

    # Labels are set after plotting so they replace the column names seaborn
    # puts on the axes.
    panel.set_title(f'Ensemble Mean of {name} for different seeds')
    panel.set_xlabel('Starting Date')
    panel.set_ylabel(name)

    # Date axis: labelled major tick every 14 days, minor tick every 7 days.
    date_axis = panel.xaxis
    date_axis.set_major_locator(mdates.DayLocator(interval=14))
    date_axis.set_minor_locator(mdates.DayLocator(interval=7))
    date_axis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    panel.tick_params(axis='x', labelrotation=45)
    panel.grid(alpha=0.5)

    # Widen the right-hand limit by 7 (axis units) beyond the automatic one.
    x_left, x_right = panel.get_xlim()
    panel.set_xlim(x_left, x_right + 7)

fig.suptitle('Ensemble Mean of all params vs time for different seeds (2k trials each)')
fig.subplots_adjust(top=0.95, hspace=0.3)

In [None]:
# Write the figure currently bound to `fig` to a PNG in the working directory.
fig.savefig('em-mean-params-diff-seeds.png')

In [None]:
# Ensemble mean of every parameter against the run's start date, with error
# bars of std / sqrt(10000); one panel per parameter on a 4x2 grid.
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(16, 21))
params = list(trials_params[0].keys())
for panel_idx, name in enumerate(params):
    panel = axs.flat[panel_idx]

    start_dates = df_params_master['start_date_']
    means = df_params_master[f'{name}_mean']
    # NOTE(review): 10000 is presumably the total trial count (5 seeds x 2k
    # trials) used to turn the std into a standard error — confirm.
    std_errors = df_params_master[f'{name}_std'] / np.sqrt(10000)
    panel.errorbar(x=start_dates, y=means, yerr=std_errors)

    panel.set_title(f'Ensemble Mean of {name} +- std error')
    panel.set_xlabel('Starting Date')
    panel.set_ylabel(name)

    # Date axis: labelled major tick every 14 days, minor tick every 7 days.
    date_axis = panel.xaxis
    date_axis.set_major_locator(mdates.DayLocator(interval=14))
    date_axis.set_minor_locator(mdates.DayLocator(interval=7))
    date_axis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    panel.tick_params(axis='x', labelrotation=45)
    panel.grid(alpha=0.5)

    # Widen the right-hand limit by 7 (axis units) beyond the automatic one.
    x_left, x_right = panel.get_xlim()
    panel.set_xlim(x_left, x_right + 7)

fig.suptitle('Ensemble Mean of all params vs time (5 seeds combined)')
fig.subplots_adjust(top=0.95, hspace=0.3)

In [None]:
# Reload the raw trial arrays of the first listed run into beta, trials_params
# and trials_losses for ad-hoc inspection.
# The leftover pdb breakpoint was removed: it halts "Restart & Run All" waiting
# for debugger input. The unused row lookup in df_params_master was dropped as
# well, so this cell no longer depends on that frame's column layout.
for run in os.listdir(outputs_dir)[:1]:
    beta = np.load(f'{outputs_dir}/{run}/beta.npy')
    # NOTE: allow_pickle=True unpickles arbitrary objects — only load arrays
    # produced by trusted runs.
    trials_params = np.load(f'{outputs_dir}/{run}/trials_params.npy', allow_pickle=True)
    trials_losses = np.load(f'{outputs_dir}/{run}/trials_losses.npy', allow_pickle=True)

In [None]:
# Write the figure currently bound to `fig` to a PNG in the working directory.
# NOTE(review): `fig` is whichever figure was created most recently — make sure
# the std-error plot cell was the last plotting cell run before saving.
fig.savefig('em-mean-params-std-error.png')

In [None]:
# Write the ledger's run names to a text file as integers, five per line.
# NOTE(review): this requires run names that parse as integers and a run count
# divisible by 5 (presumably one row per group of 5 seeds — confirm).
run_id_grid = run_ledger['run'].to_numpy().reshape((-1, 5)).astype(int)
np.savetxt('../../configs/exper/runs.txt', run_id_grid, fmt='%d')

In [None]:
# Read the space-delimited run-id file back as integers and display it as a
# plain Python list.
np.loadtxt('../../configs/exper/runs.txt', dtype='int', delimiter=' ').tolist()