# SEIR with synthetic data

Experiments:

- Replace ground truth data from various compartments with synthetic data
- Time intervals
- Functions: log_erf, log_derf

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

import datetime
from datetime import timedelta, datetime
from copy import deepcopy
import os
import json
import sys
sys.path.append('../../')

from utils.enums import Columns
from utils.util import read_config, train_test_split
from utils.data import cities
from utils.loss import Loss_Calculator

from data.dataloader import Covid19IndiaLoader
from data.processing import get_dataframes_cached, get_data

from main.ihme.fitting import single_cycle, create_output_folder
from main.seir.fitting import get_regional_data, data_setup, run_cycle

from models.seir.seir_testing import SEIR_Testing
from main.seir.forecast import get_forecast

from viz.forecast import plot_forecast_agnostic
from viz.fit import plot_fit
from viz.synthetic_data import plot_fit_uncertainty

## Functions to create custom dataset

In [None]:
def get_custom_dataset(df_actual, df_synthetic, use_actual=True, use_synthetic=True,
                       start_date=None, allowance=5, split1=15, split2=15, split3=15):
    """Assemble a train/test dataset from actual and synthetic series.

    The series is laid out from ``start_date`` as
    ``allowance`` + ``split1`` (s1) + ``split2`` (s2) days of training data,
    followed by ``split3`` (s3) days of test data taken from ``df_actual``.

    Training data selection:
      * ``use_synthetic=False``: actual data only (regardless of ``use_actual``).
      * ``use_actual=False`` (and ``use_synthetic=True``): synthetic data only.
      * both ``True``: actual data up to the end of s1, synthetic data for s2.

    Args:
        df_actual: frame with a ``date`` column holding the observed data.
        df_synthetic: frame with a ``date`` column holding the synthetic data.
        use_actual: whether actual data may be used for training.
        use_synthetic: whether synthetic data may be used for training.
        start_date: first date of the series; defaults to the earliest date
            in ``df_actual``. Parsed with ``pd.to_datetime(dayfirst=False)``.
        allowance: days of data kept before s1.
        split1: length of s1 in days.
        split2: length of s2 in days.
        split3: length of the test set in days.

    Returns:
        Tuple ``(df, df_train, df_test, properties)`` where ``df`` is the
        concatenation of train and test and ``properties`` describes the
        split lengths.

    Raises:
        ValueError: if fewer than ``split3`` rows are available for testing.
    """
    if start_date is None:
        start_date = df_actual['date'].min()
    else:
        start_date = pd.to_datetime(start_date, dayfirst=False)

    # Split one day before start_date and keep the second part of each frame.
    # NOTE(review): presumably train_test_split puts rows after the threshold
    # date in the second frame - confirm against utils.util.
    threshold = start_date - timedelta(days=1)
    _, df_actual = train_test_split(df_actual, threshold)
    _, df_synthetic = train_test_split(df_synthetic, threshold)

    test_size = split3
    end_of_train = start_date + timedelta(allowance + split1 + split2 - 1)

    properties = {
        "allowance_before_train": allowance,
        "total_length": allowance + split1 + split2 + split3,
        "train_length (s1+s2)": split1 + split2,
        "test_length (s3)": test_size,
        "s1_length": split1,
        "s2_length": split2,
        "use_synthetic_s2": (use_actual and use_synthetic)
    }

    # Default: train on actual data; the test set always comes from actual data.
    df_train, df_test = train_test_split(df_actual, end_of_train)

    if use_synthetic and not use_actual:
        df_train, _ = train_test_split(df_synthetic, end_of_train)
    elif use_synthetic:
        # Actual data through the end of s1, synthetic data for s2.
        end_of_actual = start_date + timedelta(split1 + allowance - 1)
        df_train1, _ = train_test_split(df_actual, end_of_actual)
        df_train_temp, _ = train_test_split(df_synthetic, end_of_train)
        _, df_train2 = train_test_split(df_train_temp, end_of_actual)
        df_train = pd.concat([df_train1, df_train2], axis=0)

    # Fail only when there is genuinely not enough test data; exactly
    # ``test_size`` available rows is a valid case.
    if len(df_test) < test_size:
        raise ValueError(
            "Test set size {} greater than size available {}.".format(test_size, len(df_test)))
    df_test = df_test.head(test_size)

    # Non in-place reset so a frame owned by someone else is never mutated.
    df_train = df_train.reset_index(drop=True)

    df = pd.concat([df_train, df_test], axis=0)

    return df, df_train, df_test, properties

In [None]:
def format_custom_dataset(dataset, state=None, district=None, compartments=Columns.curve_fit_compartments()):
    """Select the columns the SEIR model expects and optionally tag the region.

    Keeps ``date`` plus one column per entry of ``compartments`` (by ``.name``).
    A truthy ``state`` is inserted as column 1 and a truthy ``district`` as
    column 2.
    """
    wanted = ['date']
    wanted.extend(compartment.name for compartment in compartments)
    formatted = dataset[wanted]

    # Fixed insert positions: "state" at 1, "district" at 2.
    region_columns = (("state", state), ("district", district))
    for position, (label, value) in enumerate(region_columns, start=1):
        if value:
            formatted.insert(position, label, value)
    return formatted

In [None]:
def insert_custom_dataset_into_dataframes(dataframes, dataset, start_date=None, compartments=None):
    """Replace columns of the district frame with values from a custom dataset.

    Args:
        dataframes: pair ``(df_district, df_raw)``.
        dataset: custom dataset whose columns are copied in, row by row.
        start_date: first date to keep in both frames; defaults to the
            earliest date in ``df_district``.
        compartments: list of column names to replace. When ``None`` the
            names of ``Columns.curve_fit_compartments()`` are used.

    Returns:
        Tuple ``(df_district, df_raw)``; ``df_district`` is truncated to the
        number of rows in ``dataset`` and carries the replaced columns.

    Raises:
        ValueError: if fewer rows than ``len(dataset)`` remain in
            ``df_district`` from ``start_date`` onwards.
    """
    if compartments is None:
        col_names = [col.name for col in Columns.curve_fit_compartments()]
    else:
        col_names = compartments

    df_district, df_raw = dataframes

    if start_date is None:
        start_date = df_district['date'].min()
    else:
        start_date = pd.to_datetime(start_date, dayfirst=False)

    # Split one day before start_date and keep the second part of each frame.
    threshold = start_date - timedelta(days=1)
    _, df_district = train_test_split(df_district, threshold)
    _, df_raw = train_test_split(df_raw, threshold)

    num_rows = dataset.shape[0]
    # Explicit copy: the column assignments below must not write into a slice
    # of the caller's frame (avoids pandas chained-assignment problems).
    df_district = df_district.head(num_rows).copy()

    if len(col_names) and len(df_district) != num_rows:
        raise ValueError(
            "Custom dataset has {} rows but only {} rows of district data are "
            "available from {}.".format(num_rows, len(df_district), start_date))

    for col in col_names:
        df_district[col] = dataset[col].values
    return (df_district, df_raw)

## Check data

In [None]:
# Region under study for this notebook run.
district = 'Pune'
state = 'Maharashtra'
# Forwarded to get_data and to the IHME config; its negation is passed to get_regional_data.
disable_tracker = True

In [None]:
# Load the covid19india API dataframes and extract the series for the chosen region.
loader = Covid19IndiaLoader()
dataframes = loader.get_covid19india_api_data()

data = get_data(dataframes, state, district, disable_tracker=disable_tracker)
# Bare expression: displays the frame as the notebook cell output.
data

## Setup

In [None]:
# Lengths (in days) of the series segments and the per-model train/val periods.
allowance = 5 # number of days of actuals to have before s1 starts for rolling average
s1 = 10  # passed as split1 to get_custom_dataset
s2 = 5  # passed as split2 to get_custom_dataset; also the IHME test_size
s3 = 7  # passed as split3 (test length) to get_custom_dataset
delay = 15 # number of days by which to shift series forward from the first available date of data
ihme_val_size = 3  # IHME config 'val_size'
seir_c1_train_period = s1
seir_c1_val_period = s2
seir_c2_train_period = 7
seir_c2_val_period = s3
smooth_window_ihme = 0  # IHME config 'smooth'

In [None]:
# Record of the split settings; written to params.json at the end of the notebook.
series_properties = {
    'allowance': allowance,
    's1': s1,
    's2': s2,
    's3': s3,
    'shift': delay,  # stored under 'shift', sourced from `delay`
    'ihme_val_size': ihme_val_size,
    'seir_c1_train_period': seir_c1_train_period,
    'seir_c1_val_period': seir_c1_val_period,
    'seir_c2_train_period': seir_c2_train_period,
    'seir_c2_val_period': seir_c2_val_period
}

In [None]:
# Series starts `delay` days after the first available date; IHME starts after the allowance.
start_date = data['date'].min() + timedelta(days=delay)
ihme_start_date = start_date + timedelta(days=allowance)
dataset_length = s1 + s2
# Jump smoothing is only enabled for Mumbai.
smooth_jump = (district == "Mumbai")
# buckets for which synthetic data is used
replace = ['hospitalised', 'total_infected', 'deceased', 'recovered']
which_compartments = ['hospitalised', 'total_infected', 'deceased', 'recovered']

In [None]:
# One timestamped output folder per run: synth/<district>/<YYYYmmdd-HHMMSS>/
now = datetime.now().strftime("%Y%m%d-%H%M%S")
folder = f'{district}/{now}'
output_folder = create_output_folder(f'synth/{folder}/')

## IHME model (I1)

### Setup

In [None]:
# Look up the region entry and load the default IHME configuration.
dist, st, area_names = cities[district.lower()]
config, model_params = read_config('../../scripts/ihme/config/default.yaml')

# Override the defaults with this notebook's settings.
config.update({
    'start_date': ihme_start_date,
    'dataset_length': dataset_length,
    'disable_tracker': disable_tracker,
    'max_evals': 1,
    'test_size': s2,
    'val_size': ihme_val_size,
    'min_days': 7,
    'n_days_optimize': False,
    'smooth': smooth_window_ihme,
})

### Train

In [None]:
# Run one IHME fitting cycle; later cells read df_train, df_val, df_prediction, draws and df_loss from the result.
ihme_res = single_cycle(dist, st, area_names, model_params, **config)

### Outputs

In [None]:
%matplotlib inline
# Unpack the IHME outputs used by the plots below.
ihme_df_train, ihme_df_val = ihme_res['df_train'], ihme_res['df_val']
ihme_df_train_nora, ihme_df_val_nora = ihme_res['df_train_nora'], ihme_res['df_val_nora']
ihme_df_true = ihme_res['df_district']
ihme_df_pred = ihme_res['df_prediction']

# Work on a copy so the prediction frame is not modified here. Only `deepcopy`
# is imported (`from copy import deepcopy`), so `copy.deepcopy` would raise
# NameError.
makesum = deepcopy(ihme_df_pred)
# total_infected is not predicted directly; derive it as the sum of the other compartments.
makesum['total_infected'] = ihme_df_pred['recovered'] + ihme_df_pred['deceased'] + ihme_df_pred['hospitalised']

plot_fit(
    makesum.reset_index(), ihme_df_train, ihme_df_val, ihme_df_train_nora, ihme_df_val_nora, 
    s1, st, dist, which_compartments=[c.name for c in Columns.curve_fit_compartments()],
    description = 'Train and test',
    savepath=os.path.join(output_folder, 'ihme.png'))

plot_forecast_agnostic(ihme_df_true, makesum.reset_index(), model_name='IHME M1', 
                       dist=dist, state=st, filename=os.path.join(output_folder, 'ihme-forecast.png'))


# One uncertainty plot per compartment, each saved to its own file.
for plot_col in Columns.which_compartments():
    plot_fit_uncertainty(makesum.reset_index(), ihme_df_train, ihme_df_val, ihme_df_train_nora, ihme_df_val_nora, 
                         s1, s2, st, dist, draws=ihme_res['draws'],
                         which_compartments=[plot_col.name], 
                         description = 'Train and test',
                         savepath=os.path.join(output_folder, f'ihme_{plot_col.name}.png'))

In [None]:
# Store the derived total_infected on the IHME prediction frame, which is later used as synthetic data.
ihme_res['df_prediction']['total_infected'] = makesum['total_infected']

### Uncertainty

In [None]:
# Average, per compartment, the difference between draw rows 1 and 0 over the s2 window.
# NOTE(review): rows 0/1 presumably hold the lower/upper bounds - confirm against the IHME output.
draws = ihme_res['draws']
average_uncertainty_s2 = {}
for compartment in (c.name for c in Columns.which_compartments()):
    s2_window = draws[compartment]['draws'][:, s1:s1 + s2]
    average_uncertainty_s2[compartment] = np.mean(s2_window[1] - s2_window[0])

uncertainty = pd.DataFrame.from_dict(
    average_uncertainty_s2, orient='index', columns=['average s2 uncertainty'])
# Append the uncertainty as an extra column of the loss table.
ihme_res['df_loss'] = pd.concat([ihme_res['df_loss'], uncertainty], axis=1)

### Save

In [None]:
# Transpose the loss table so compartments become columns, then save it.
i1 = ihme_res['df_loss'].T[['hospitalised', 'deceased', 'recovered', 'total_infected']]
# os.path.join (as used for the plots) is correct whether or not
# output_folder ends with a path separator.
i1.to_csv(os.path.join(output_folder, "ihme.csv"))
ihme_res['df_loss']

## SEIR model (C1)

### Setup

In [None]:
# Regional data for the SEIR C1 run; unpacked later as (df_district, df_raw).
c1_input = get_regional_data(dataframes, st, dist, (not disable_tracker), None, None, granular_data=False, 
                             smooth_jump=smooth_jump, smoothing_length=33, smoothing_method='weighted', t_recov=14,
                             return_extra=False, which_compartments=which_compartments)
# Bare expression: displays the result as the notebook cell output.
c1_input

In [None]:
# Settings for the SEIR C1 run. Only some of these names are passed to
# data_setup / run_cycle in the cells below.
model=SEIR_Testing
variable_param_ranges=None
data_from_tracker=False
granular_data=False
filename=None
data_format='new'
train_period=seir_c1_train_period
val_period=seir_c1_val_period
num_evals=1500
N=1e7
initialisation='intermediate'
which_compartments=['hospitalised', 'total_infected', 'deceased', 'recovered']
smooth_jump=smooth_jump  # self-assignment; value is unchanged
smoothing_length=33
smoothing_method='weighted'

In [None]:
# Truncate the district frame to delay + train + val + allowance rows,
# then build the train/val frames for the C1 run.
c1_df_district, c1_df_raw = c1_input
c1_df_district = c1_df_district.head(delay + train_period + val_period + allowance)

print(c1_df_district)

predictions_dict_c1 = {}

observed_dataframes = data_setup(c1_df_district, c1_df_raw, val_period)

for split_name in ('train', 'val'):
    print(f'{split_name}\n', observed_dataframes[f'df_{split_name}'])

### Train

In [None]:
# Fit SEIR (C1) on the observed data; the result is stored under key 'm1'.
predictions_dict_c1['m1'] = run_cycle(
        st, dist, observed_dataframes, 
        model=model, variable_param_ranges=variable_param_ranges,
        data_from_tracker=data_from_tracker, train_period=train_period, 
        which_compartments=which_compartments, N=N,
        num_evals=num_evals, initialisation=initialisation
    )

### Outputs

In [None]:
# Save the C1 fit figure. os.path.join is correct whether or not
# output_folder ends with a path separator.
predictions_dict_c1['m1']['ax'].savefig(os.path.join(output_folder, 'seir-c1.png'))

In [None]:
# Transpose the C1 loss table so compartments become columns, then save it.
c1 = predictions_dict_c1['m1']['df_loss'].T[['hospitalised', 'deceased', 'recovered', 'total_infected']]
# os.path.join is correct whether or not output_folder ends with a path separator.
c1.to_csv(os.path.join(output_folder, "seir.csv"))
predictions_dict_c1['m1']['df_loss']

## Create custom datasets

In [None]:
# Dataset 1: use_synthetic=False, so training data comes from the actual series only.
df1, train1, test1, prop1 = get_custom_dataset(ihme_res['df_district_nora'], ihme_res['df_prediction'], 
                                               use_synthetic=False, start_date=start_date,
                                               allowance=allowance, split1=s1, split2=s2, split3=s3)
# Keep only the model columns and tag the rows with state/district.
train1 = format_custom_dataset(train1, st, dist)
df1 = format_custom_dataset(df1, st, dist)
train1

In [None]:
# Dataset 2: actual data for s1, IHME predictions as synthetic data for s2.
df2, train2, test2, prop2 = get_custom_dataset(ihme_res['df_district_nora'], ihme_res['df_prediction'],
                                               start_date=start_date,
                                               allowance=allowance, split1=s1, split2=s2, split3=s3)
# Keep only the model columns and tag the rows with state/district.
train2 = format_custom_dataset(train2, st, dist)
df2 = format_custom_dataset(df2, st, dist)
train2

In [None]:
# Dataset 3: actual data for s1, SEIR C1 predictions as synthetic data for s2.
df3, train3, test3, prop3 = get_custom_dataset(ihme_res['df_district_nora'], predictions_dict_c1['m1']['df_prediction'], 
                                               start_date=start_date,
                                               allowance=allowance, split1=s1, split2=s2, split3=s3)
# Keep only the model columns and tag the rows with state/district.
train3 = format_custom_dataset(train3, st, dist)
df3 = format_custom_dataset(df3, st, dist)

### Save custom datasets

In [None]:
# Persist the three custom datasets. os.path.join is correct whether or not
# output_folder ends with a path separator.
df1.to_csv(os.path.join(output_folder, "Dataset1.csv"))
df2.to_csv(os.path.join(output_folder, "Dataset2.csv"))
df3.to_csv(os.path.join(output_folder, "Dataset3.csv"))

## SEIR Model using custom datasets

### Data setup

In [None]:
# Regional data used as the base for the three experiments (same arguments as the C1 input).
input_df = get_regional_data(dataframes, st, dist, (not disable_tracker), None, None, granular_data=False, 
                             smooth_jump=smooth_jump, smoothing_length=33, smoothing_method='weighted', t_recov=14,
                             return_extra=False, which_compartments=which_compartments)

In [None]:
# Experiment 1 input: the `replace` columns of the district frame take their values from dataset 1.
input_1 = insert_custom_dataset_into_dataframes(input_df, df1, 
                                                start_date=start_date, compartments=replace)
input_1

In [None]:
# Experiment 2 input: the `replace` columns of the district frame take their values from dataset 2.
input_2 = insert_custom_dataset_into_dataframes(input_df, df2, 
                                                start_date=start_date, compartments=replace)
input_2

In [None]:
# Experiment 3 input: the `replace` columns of the district frame take their values from dataset 3.
input_3 = insert_custom_dataset_into_dataframes(input_df, df3, 
                                                start_date=start_date, compartments=replace)
input_3

### Setup

In [None]:
# Settings for the SEIR C2 runs (experiments 1-3). These rebind the names used
# for the C1 run, with the C2 train/val periods.
model=SEIR_Testing
variable_param_ranges=None
data_from_tracker=(not disable_tracker)
granular_data=False
filename=None
data_format='new'
train_period=seir_c2_train_period
val_period=seir_c2_val_period
num_evals=1500
N=1e7
initialisation='intermediate'
which_compartments=['hospitalised', 'total_infected', 'deceased', 'recovered']
smooth_jump=smooth_jump  # self-assignment; value is unchanged
smoothing_length=33
smoothing_method='weighted'

### Experiment 1 - Ground truth data

In [None]:
# Experiment 1: fit SEIR on the frames built from dataset 1.
predictions_dict_1 = {}

observed_dataframes = data_setup(*input_1[:2], val_period)

for split_name in ('train', 'val'):
    print(f'{split_name}\n', observed_dataframes[f'df_{split_name}'])

predictions_dict_1['m1'] = run_cycle(
    st, dist, observed_dataframes,
    model=model,
    variable_param_ranges=variable_param_ranges,
    data_from_tracker=data_from_tracker,
    train_period=train_period,
    which_compartments=which_compartments,
    N=N,
    num_evals=num_evals,
    initialisation=initialisation,
)

In [None]:
# Loss table for experiment 1, compartments as columns, tagged with the experiment number.
t1 = (
    predictions_dict_1['m1']['df_loss']
    .T[['hospitalised', 'deceased', 'recovered', 'total_infected']]
    .assign(exp=1)
)
predictions_dict_1['m1']['df_loss']

In [None]:
# Save the experiment 1 fit figure. os.path.join is correct whether or not
# output_folder ends with a path separator.
predictions_dict_1['m1']['ax'].savefig(os.path.join(output_folder, 'seir-exp1.png'))

### Experiment 2 - Ground truth + IHME forecast

In [None]:
# Experiment 2: fit SEIR on the frames built from dataset 2.
predictions_dict_2 = {}

observed_dataframes = data_setup(*input_2[:2], val_period)

for split_name in ('train', 'val'):
    print(f'{split_name}\n', observed_dataframes[f'df_{split_name}'])

predictions_dict_2['m1'] = run_cycle(
    st, dist, observed_dataframes,
    model=model,
    variable_param_ranges=variable_param_ranges,
    data_from_tracker=data_from_tracker,
    train_period=train_period,
    which_compartments=which_compartments,
    N=N,
    num_evals=num_evals,
    initialisation=initialisation,
)

In [None]:
# Loss table for experiment 2, compartments as columns, tagged with the experiment number.
t2 = (
    predictions_dict_2['m1']['df_loss']
    .T[['hospitalised', 'deceased', 'recovered', 'total_infected']]
    .assign(exp=2)
)
predictions_dict_2['m1']['df_loss']

In [None]:
# Save the experiment 2 fit figure. os.path.join is correct whether or not
# output_folder ends with a path separator.
predictions_dict_2['m1']['ax'].savefig(os.path.join(output_folder, 'seir-exp2.png'))

### Experiment 3 - Ground truth + SEIR forecast

In [None]:
# Experiment 3: fit SEIR on the frames built from dataset 3.
predictions_dict_3 = {}

observed_dataframes = data_setup(*input_3[:2], val_period)

for split_name in ('train', 'val'):
    print(f'{split_name}\n', observed_dataframes[f'df_{split_name}'])

predictions_dict_3['m1'] = run_cycle(
    st, dist, observed_dataframes,
    model=model,
    variable_param_ranges=variable_param_ranges,
    data_from_tracker=data_from_tracker,
    train_period=train_period,
    which_compartments=which_compartments,
    N=N,
    num_evals=num_evals,
    initialisation=initialisation,
)

In [None]:
# Loss table for experiment 3, compartments as columns, tagged with the experiment number.
t3 = (
    predictions_dict_3['m1']['df_loss']
    .T[['hospitalised', 'deceased', 'recovered', 'total_infected']]
    .assign(exp=3)
)
predictions_dict_3['m1']['df_loss']

In [None]:
# Save the experiment 3 fit figure. os.path.join is correct whether or not
# output_folder ends with a path separator.
predictions_dict_3['m1']['ax'].savefig(os.path.join(output_folder, 'seir-exp3.png'))

In [None]:
# Stack the three experiment loss tables and group equal index labels together.
t = pd.concat([t1, t2, t3], axis=0)
t.index.name="index"
t.sort_values(by='index', inplace=True)
# File name lists the replaced compartments. os.path.join avoids hand-written
# separators, consistent with the other outputs.
t.to_csv(os.path.join(output_folder, "exp_"+"_".join(replace)+".csv"))

## Plotting

In [None]:
# Experiment results in order (experiments 1, 2, 3); indexed by subplot row in the plotting cell.
predictions_dicts = [predictions_dict_1, predictions_dict_2, predictions_dict_3]
# Despite the name, these are the formatted training DataFrames of datasets 1-3.
train_dicts = [train1, train2, train3]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

In [None]:
def plot_across_datasets(exp_no, fig, ax, df_true, df_prediction, df_train, dist, state, 
                         s1_start, train_start, s2_start, test_start, s3_end, graph_start, graph_end,
                         log_scale=False, filename=None,
                         model_name='M2', which_compartments=Columns.which_compartments()):
    """Plot observed, training and forecast series for one experiment on ``ax``.

    Args:
        exp_no: 0, 1 or 2; selects the data description used in the title.
            Other values get a generic ``Experiment <exp_no>`` description.
        fig: unused; kept for interface compatibility.
        ax: matplotlib axes to draw on.
        df_true: frame with ``date`` and compartment columns (observed data).
        df_prediction: frame with ``date`` and compartment columns (forecast).
        df_train: frame with ``date`` and compartment columns (training data).
        dist: district name used in the title.
        state: unused; kept for interface compatibility.
        s1_start, train_start, s2_start, test_start, s3_end: dates (anything
            ``pd.to_datetime`` accepts) delimiting the shaded spans and the
            dashed marker lines.
        graph_start, graph_end: x-axis limits.
        log_scale: unused; kept for interface compatibility.
        filename: unused; kept for interface compatibility.
        model_name: label inserted into the forecast legend entry.
        which_compartments: compartments (members of
            ``Columns.which_compartments()``) to draw.

    Returns:
        The axes that was passed in. It is left untouched when none of the
        compartments is selected.
    """
    plotted_any = False
    for col in Columns.which_compartments():
        if col not in which_compartments:
            continue
        plotted_any = True
        ax.plot(df_true['date'], df_true[col.name],
                '-o', color=col.color, label=f'{col.label} (Observed)')
        ax.plot(df_train['date'], df_train[col.name],
                'x', color=col.color, label=f'{col.label} (Train data)')
        ax.plot(df_prediction["date"], df_prediction[col.name],
                '-', color=col.color, label=f'{col.label} ({model_name} Forecast)')

    if not plotted_any:
        return ax

    # Decorate once, after all series are drawn, so marker lines, spans and
    # legend entries are not duplicated per compartment.
    s1_start = pd.to_datetime(s1_start)
    train_start = pd.to_datetime(train_start)
    s2_start = pd.to_datetime(s2_start)
    test_start = pd.to_datetime(test_start)
    s3_end = pd.to_datetime(s3_end)
    graph_start = pd.to_datetime(graph_start)
    graph_end = pd.to_datetime(graph_end)

    # Read the limit from the axes being drawn on, not from pyplot's current axes.
    line_height = ax.get_ylim()[1]
    ax.plot([train_start, train_start], [0, line_height], '--', color='black', label='Train starts')
    ax.plot([test_start, test_start], [0, line_height], '--', color='black', label='Test starts')

    ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
    ax.xaxis.set_minor_locator(mdates.DayLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    # Shade s1 (red), s2 (yellow) and the test window (green).
    ax.axvspan(s1_start, s2_start, alpha=0.1, color='red')
    ax.axvspan(s2_start, test_start, alpha=0.1, color='yellow')
    ax.axvspan(test_start, s3_end, alpha=0.1, color='green')

    ax.legend(loc="upper left")
    ax.tick_params(labelrotation=45)
    ax.grid()

    ax.set_xlim(graph_start, graph_end)

    ax.set_xlabel('Time', fontsize=10)
    ax.set_ylabel('No of People', fontsize=10)

    descriptions = {
        0: 'Using ground truth data',
        1: 'Using data from IHME forecast',
        2: 'Using data from SEIR forecast',
    }
    # Fallback avoids an unbound variable for unexpected experiment numbers.
    data = descriptions.get(exp_no, 'Experiment {}'.format(exp_no))

    ax.title.set_text('Forecast - {} - {}'.format(dist, data))

    return ax

In [None]:
# Date markers for the plots, all relative to the IHME start date (end of the allowance).
graph_start = (ihme_start_date - timedelta(allowance+1)).strftime("%m-%d-%Y")
s1_start = (ihme_start_date).strftime("%m-%d-%Y")
s2_start = (ihme_start_date + timedelta(s1)).strftime("%m-%d-%Y")
test_start = (ihme_start_date + timedelta(s1+s2)).strftime("%m-%d-%Y")
train_start = (ihme_start_date + timedelta(s1+s2-seir_c2_train_period)).strftime("%m-%d-%Y")
s3_end = (ihme_start_date + timedelta(s1+s2+s3)).strftime("%m-%d-%Y")
graph_end = (ihme_start_date + timedelta(s1+s2+s3+1)).strftime("%m-%d-%Y")

# Observed window shown in every panel; it does not depend on the loop variables.
observed_window = input_df[0].iloc[delay:,:].head(allowance+s1+s2+s3)

# One figure per compartment, with one panel per experiment.
for col in range(4):
    fig, ax = plt.subplots(3, sharex=True, sharey=True, figsize=(15, 15))
    for row in range(len(ax)):
        ax[row] = plot_across_datasets(row, fig, ax[row], observed_window, 
                                       predictions_dicts[row]['m1']['df_prediction'], 
                                       predictions_dicts[row]['m1']['df_district'], 
                                       district, state, s1_start, train_start, s2_start, test_start, s3_end, graph_start, graph_end,
                                       model_name='', which_compartments=[Columns.which_compartments()[col]],)

    # Save once per figure, after all panels are drawn (previously saved on
    # every inner iteration). os.path.join is correct whether or not
    # output_folder ends with a path separator.
    filename = os.path.join(output_folder, Columns.which_compartments()[col].name)
    fig.savefig(filename)

## Params

In [None]:
# Collect the run parameters so they are stored next to the outputs.
params_dict = {
    'compartments_replaced': replace,
    'dataset_properties': {
        'exp1': prop1,
        'exp2': prop2,
        'exp3': prop3
    },
    'series_properties': series_properties
}

# os.path.join is correct whether or not output_folder ends with a path separator.
with open(os.path.join(output_folder, 'params.json'), 'w') as outfile:
    json.dump(params_dict, outfile, indent=4)

# The start date must be a string to be JSON-serialisable. Skip the conversion
# if it already is one, so the cell can be re-run without an AttributeError.
if not isinstance(config['start_date'], str):
    config['start_date'] = config['start_date'].strftime("%Y-%m-%d")
with open(os.path.join(output_folder, 'ihme-config.json'), 'w') as outfile:
    json.dump(config, outfile, indent=4)

# model_params is stored as its repr() string.
with open(os.path.join(output_folder, 'ihme-model-params.json'), 'w') as outfile:
    json.dump(repr(model_params), outfile, indent=4)