In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import matplotlib
import numpy as np
import pandas as pd
from tqdm import tqdm
import emcee
from multiprocessing import Pool
import time 
import matplotlib.pyplot as plt
import corner

matplotlib.rcParams.update({'font.size': 18})
from collections import defaultdict, OrderedDict

from utils.generic import init_params
from main.seir.optimiser import Optimiser
from models.seir.seir_testing import SEIR_Testing
from data.processing import get_district_time_series
from data.dataloader import get_covid19india_api_data

## Load covid19 data

In [None]:
dataframes = get_covid19india_api_data()

In [None]:
dataframes.keys()

In [None]:
df_district = get_district_time_series(dataframes)

In [None]:
## TODO: Make splits
df_train = df_district

## Loss Calculation Functions

In [None]:
def _calc_rmse(y_pred, y_true, log=True):
    if log:
        y_true = np.log(y_true)
        y_pred = np.log(y_pred)
    loss = np.sqrt(np.mean((y_true - y_pred)**2))
    return loss

def _calc_mape(y_pred, y_true):
    y_pred = y_pred[y_true > 0]
    y_true = y_true[y_true > 0]

    ape = np.abs((y_true - y_pred + 0) / y_true) *  100
    loss = np.mean(ape)
    return loss

def calc_loss_dict(states_time_matrix, df, method='rmse', rmse_log=False):
    """Compute the loss between model output and observed totals.

    Args:
        states_time_matrix: model output indexable by integer row. Rows 2, 4
            and 6-10 are read; the local names below suggest which
            compartments they hold (presumably the SEIR_Testing state order
            -- confirm against the model).
        df: observations; only ``df['total_infected']`` is used.
        method: ``'rmse'`` or ``'mape'``.
        rmse_log: with ``method='rmse'``, compare log-values instead of raw
            values. Ignored for ``'mape'``.

    Returns:
        dict with a single key ``'total'``: the loss between the predicted
        total (rows 6 + 7 + 8 + 9 + 10) and ``df['total_infected']``.

    Raises:
        ValueError: if ``method`` is not ``'rmse'`` or ``'mape'``. Previously
            this surfaced later as an unbound-variable error on ``calculate``.
    """
    # Pick the metric first so an unsupported method fails fast and clearly.
    if method == 'rmse':
        use_log = bool(rmse_log)

        def calculate(pred, true):
            return _calc_rmse(pred, true, log=use_log)
    elif method == 'mape':
        calculate = _calc_mape
    else:
        raise ValueError(
            "Unknown loss method {!r}; expected 'rmse' or 'mape'".format(method)
        )

    pred_hospitalisations = states_time_matrix[6] + states_time_matrix[7] + states_time_matrix[8]
    pred_recoveries = states_time_matrix[9]
    pred_fatalities = states_time_matrix[10]
    # Only needed by the disabled 'active_infections' loss below.
    pred_infectious_unknown = states_time_matrix[2] + states_time_matrix[4]
    pred_total_cases = pred_hospitalisations + pred_recoveries + pred_fatalities

    losses = {}
    # Per-compartment losses are currently disabled; only the total is fitted.
#     losses['hospitalised'] = calculate(pred_hospitalisations, df['Hospitalised'])
#     losses['recovered'] = calculate(pred_recoveries, df['Recovered'])
#     losses['fatalities'] = calculate(pred_fatalities, df['Fatalities'])
#     losses['active_infections'] = calculate(pred_infectious_unknown, df['Active Infections (Unknown)'])
    losses['total'] = calculate(pred_total_cases, df['total_infected'])

    return losses

def calc_loss(states_time_matrix, df, method='rmse', rmse_log=False):
    """Scalar loss: the ``'total'`` entry of ``calc_loss_dict``.

    Arguments are forwarded unchanged to ``calc_loss_dict``. A weighted sum
    over several per-compartment losses was used here at some point and is
    currently disabled, so only the total-cases loss is returned.
    """
    return calc_loss_dict(states_time_matrix, df, method, rmse_log)['total']

## Initialize params and state values

In [None]:
vanilla_params, testing_params, state_init_values = init_params()

In [None]:
vanilla_params

In [None]:
# Second value returned by init_params(); the previous cell already shows vanilla_params.
testing_params

In [None]:
state_init_values

## Set priors for parameters of interest

In [None]:
## Uniform priors: each parameter name maps to its (lower, upper) bound.
## Insertion order defines the parameter order used further down.
prior_ranges = OrderedDict([
    ('R0', (1, 3)),                    # tighter alternative: (1.6, 3)
    ('T_inc', (1, 5)),                 # tighter alternative: (4, 5)
    ('T_inf', (1, 4)),                 # tighter alternative: (3, 4)
    ('T_recov_severe', (9, 20)),
    ('P_severe', (0.3, 0.99)),
    ('intervention_amount', (0.3, 1)),
    ('c_sigma', (0.001, 5)),
])

def param_init():
    """Draw a random starting point from the uniform prior ranges.

    One ``np.random.uniform`` draw is made per entry of ``prior_ranges``, in
    that dictionary's iteration order.

    Returns:
        A ``defaultdict`` (created without a default factory) mapping each
        parameter name to its sampled value.
    """
    theta = defaultdict()
    theta.update(
        (name, np.random.uniform(bounds[0], bounds[1]))
        for name, bounds in prior_ranges.items()
    )
    return theta

## Proposal function to sample theta_new given theta_old

In [None]:
# Standard deviation of the Gaussian proposal for each parameter, in the same
# order as prior_ranges. Every parameter currently uses 1; a range-scaled
# alternative, 0.025 * (upper - lower), is disabled.
proposal_sigmas = OrderedDict.fromkeys(prior_ranges, 1)

def proposal(theta_old):
    """Gaussian random-walk proposal around ``theta_old``.

    Args:
        theta_old: mapping of parameter name to current value. Its values are
            paired positionally with ``proposal_sigmas.values()``, so both
            mappings are expected to share the same ordering.

    Returns:
        A plain ``dict`` with the keys of ``theta_old`` and values drawn from
        ``np.random.normal`` centred on the old values.
    """
    centres = list(theta_old.values())
    widths = list(proposal_sigmas.values())
    perturbed = np.random.normal(loc=centres, scale=widths)
    return {name: value for name, value in zip(theta_old.keys(), perturbed)}


## Log Likelihood and Prior

In [None]:
type(OrderedDict())

In [None]:
import collections
def log_likelihood(theta):
    """Gaussian log-likelihood of the observed totals given parameters ``theta``.

    The model is solved with ``Optimiser.solve`` on the module-level
    ``df_train`` and the predicted ``'total_infected'`` column is compared
    with the observed one, using independent Gaussian noise with standard
    deviation ``theta['c_sigma']``.

    Args:
        theta: mapping of parameter name to value; must contain ``'c_sigma'``.

    Returns:
        The log-likelihood, or ``-np.inf`` when any parameter is negative,
        when ``c_sigma`` is not strictly positive (including NaN), or when the
        result is NaN.
    """
    if (np.array([*theta.values()]) < 0).any():
        return -np.inf

    sigma = theta['c_sigma']
    # sigma == 0 passes the negativity check above but would give log(0) and a
    # division by zero below. `not sigma > 0` also rejects NaN. Checking here
    # avoids a model solve for a point that is rejected anyway.
    if not sigma > 0:
        return -np.inf

    optimiser = Optimiser()
    default_params = optimiser.init_default_params(df_train)
    df_prediction = optimiser.solve(theta, default_params, df_train)
    pred = np.array(df_prediction['total_infected'])
    true = np.array(df_train['total_infected'])

    N = len(true)
    ll = - (N * np.log(np.sqrt(2*np.pi) * sigma)) - (np.sum(((true - pred) ** 2) / (2 * sigma ** 2)))
    # emcee aborts the whole run when the log-probability is NaN; treat such a
    # point as having zero likelihood instead.
    if np.isnan(ll):
        return -np.inf
    return ll

def log_prior(theta):
    """Log of a flat (improper) prior over non-negative parameter values.

    Args:
        theta: mapping of parameter name to value.

    Returns:
        ``0.0`` when every value is >= 0, ``-np.inf`` otherwise.

    Note:
        The bounds in ``prior_ranges`` are not enforced here; a bounded
        uniform prior built on ``in_valid_range`` was tried and is disabled.
        The values are returned directly rather than via ``np.log(0)``, which
        emitted a divide-by-zero RuntimeWarning for every rejected point.
    """
    if (np.array([*theta.values()]) < 0).any():
        return -np.inf
    return 0.0

def in_valid_range(key, value):
    """Tell whether ``value`` lies in the closed interval ``prior_ranges[key]``.

    Args:
        key: parameter name present in ``prior_ranges``.
        value: candidate value for that parameter.

    Returns:
        Truthy when ``lower <= value <= upper`` for the stored bounds.
    """
    bounds = prior_ranges[key]
    upper, lower = bounds[1], bounds[0]
    return (value <= upper) and (value >= lower)

## Acceptance function

In [None]:
# NOTE(review): T is not referenced anywhere else in the visible notebook.
T = 1
# Parameter names in the order used for flat (array-like) parameter vectors.
key_list = ['R0','T_inc','T_inf','T_recov_severe','P_severe','intervention_amount', 'c_sigma']

def convert_to_dict(theta):
    """Return ``theta`` as a name -> value mapping.

    Args:
        theta: either a mapping (any ``dict`` subclass, e.g. ``OrderedDict``
            or ``defaultdict``), returned unchanged, or an integer-indexable
            sequence (list, tuple, 1-D NumPy array) whose entries follow the
            order of ``key_list``.

    Returns:
        The original mapping, or a new ``dict`` built from the sequence.

    Raises:
        IndexError: if a sequence has fewer entries than ``key_list``.
    """
    # isinstance (not an exact type comparison) so that dict subclasses are
    # recognised as mappings instead of being indexed by position.
    if isinstance(theta, dict):
        return theta
    return {key: theta[index] for index, key in enumerate(key_list)}

def log_probability(theta):
    """Unnormalised log-posterior: log-likelihood plus log-prior.

    Args:
        theta: parameter mapping, or a flat vector that ``convert_to_dict``
            turns into one.

    Returns:
        ``log_likelihood(theta) + log_prior(theta)``.
    """
    params = convert_to_dict(theta)
    likelihood_term = log_likelihood(params)
    prior_term = log_prior(params)
    return likelihood_term + prior_term

def accept(theta_old, theta_new, boltzmann=False):
    """Metropolis acceptance test for a proposed move.

    Args:
        theta_old: current parameters.
        theta_new: proposed parameters.
        boltzmann: accepted for compatibility; not used by the function.

    Returns:
        ``True`` when the proposal has a higher log-probability; otherwise
        the outcome of comparing a uniform draw on [0, 1) with
        ``exp(log_p_new - log_p_old)``.
    """
    candidate_lp = log_probability(theta_new)
    current_lp = log_probability(theta_old)

    if candidate_lp > current_lp:
        return True

    # Downhill move: accept with probability equal to the probability ratio.
    draw = np.random.uniform(0, 1)
    return draw < np.exp(candidate_lp - current_lp)
    
def anneal_accept(iter):
    """Random unconditional-acceptance test whose probability decays with ``iter``.

    The acceptance probability is ``1 - exp(-1 / (iter + 1e-10))``: equal to
    1 at ``iter == 0`` and shrinking as ``iter`` grows.

    Args:
        iter: iteration number (non-negative).

    Returns:
        Whether a uniform draw on [0, 1) fell below that probability.
    """
    threshold = 1 - np.exp(-(1 / (iter + 1e-10)))
    return np.random.uniform(0, 1) < threshold

In [None]:
# Reference parameter values; they seed the emcee walkers further down.
optimum_params = dict(
    R0=2.1039262514239443,
    T_inc=4.29031222687138,
    T_inf=3.0377562096514046,
    T_recov_severe=9.594716552601186,
    P_severe=0.961325014139492,
    intervention_amount=0.4101211254804955,
    c_sigma=1,  # Sigma unknown, need to fit to likelihood
)

In [None]:
optimum_params.values()

## Metropolis loop

In [None]:
def metropolis(iter=1000):
    """Run the hand-written Metropolis sampler with an annealed acceptance step.

    Each iteration proposes a new point with ``proposal``. The proposal is
    taken unconditionally when ``anneal_accept`` fires, otherwise it goes
    through the regular ``accept`` test.

    Args:
        iter: number of iterations.

    Returns:
        Tuple ``(accepted, rejected)``. ``accepted`` has ``iter + 1`` entries:
        the starting point followed by the chain state after every iteration
        (repeated when a proposal is turned down). ``rejected`` lists the
        proposals that were turned down.
    """
    current = param_init()
    chain = [current]
    discarded = []

    for step in tqdm(range(iter)):
        candidate = proposal(current)
        # `or` short-circuits, so accept() is evaluated only when the
        # annealing test did not already take the proposal.
        if anneal_accept(step) or accept(current, candidate):
            current = candidate
        else:
            discarded.append(candidate)
        chain.append(current)

    return chain, discarded

`proposal_function` (from the emcee `MHMove` documentation) – The proposal function. It should take 2 arguments: a numpy-compatible random number generator and a (K, ndim) list of coordinate vectors. This function should return the proposed position and the log-ratio of the proposal probabilities, $\ln q(x; x') - \ln q(x'; x)$, where $x'$ is the proposed coordinate.

In [None]:

def proposal_emcee(rng, theta_list):
    """Symmetric Gaussian proposal in the form emcee's ``MHMove`` expects.

    The earlier version read an undefined loop counter ``i``, called the
    dict-based ``proposal`` on an array row, and returned a single position
    instead of one per walker.

    Args:
        rng: NumPy-compatible random number generator exposing ``normal``.
        theta_list: (K, ndim) array-like of walker coordinates. Columns are
            assumed to follow the ordering of ``proposal_sigmas`` (the same
            order as ``key_list`` in this notebook).

    Returns:
        Tuple ``(proposed, log_ratio)``: a (K, ndim) array of proposed
        positions and a length-K array of zeros. The log-ratio is zero
        because a Gaussian random walk is symmetric.
    """
    coords = np.atleast_2d(np.asarray(theta_list, dtype=float))
    scale = np.array([*proposal_sigmas.values()], dtype=float)
    # scale has shape (ndim,) and broadcasts across the K walkers.
    proposed = rng.normal(loc=coords, scale=scale)
    return proposed, np.zeros(coords.shape[0])

In [None]:
# Flat starting vector: optimum_params values arranged in prior_ranges order.
init_param = [optimum_params[key] for key in prior_ranges]

## Set up Emcee

In [None]:
# Ensemble size and chain length; nsteps is consumed by sampler.run_mcmc below.
nwalkers = 30
nsteps = 20000
ndim = len(init_param)
# One starting row per walker: init_param plus Gaussian noise scaled by 1e-2.
pos = init_param + 1e-2 * np.random.randn(nwalkers, ndim)
# Chain is stored in an HDF5 file in the working directory.
filename = "emcee.h5"
backend = emcee.backends.HDFBackend(filename)
# NOTE(review): per the emcee docs, reset() clears any chain already stored in
# the backend, so re-running this cell discards earlier results -- confirm
# that this is intended.
backend.reset(nwalkers, ndim)
# NOTE(review): `a` looks like the stretch-move scale; recent emcee versions
# appear to deprecate this keyword in favour of `moves` -- confirm against the
# installed version.
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_probability,
                                #moves=[(emcee.moves.DEMove(), 0.8), (emcee.moves.DESnookerMove(), 0.2),],
                                backend = backend,
                                   a=0.2)

In [None]:
"""start = time.time()
sampler.run_mcmc(pos, nsteps, progress=True);
end = time.time()
serial_time = end - start

   
    start = time.time()
    sampler.run_mcmc(pos, nsteps, progress=True);
    end = time.time()
    multi_time = end - start
    print("Multiprocessing took {0:.1f} seconds".format(multi_time))
    print("{0:.1f} times faster than serial".format(serial_time / multi_time))"""

In [None]:
# Run EMcee sampler
sampler.run_mcmc(pos, nsteps, progress=True);

In [None]:
#acc, rej = metropolis(iter=20000)

In [None]:
#df_samples = pd.DataFrame(acc)

In [None]:
samples = sampler.get_chain()
#samples[:,sampler.acceptance_fraction > 0.1,:].shape

In [None]:
sampler.acceptance_fraction

In [None]:
# Trace plots: one panel per parameter with every walker's chain overlaid.

samples = sampler.get_chain()
#samples = samples[:,sampler.acceptance_fraction > 0.1,:]
labels = key_list
fig, axes = plt.subplots(ndim, figsize=(10, 20), sharex=True)
for i, (ax, label) in enumerate(zip(axes, labels)):
    ax.plot(samples[:, :, i], "k", alpha=0.3)
    ax.set_xlim(0, len(samples))
    ax.set_ylabel(label)
    ax.yaxis.set_label_coords(-0.1, 0.5)

axes[-1].set_xlabel("step number");

In [None]:
#Get autocorrelation time of the chain
tau = sampler.get_autocorr_time()
print(tau)

In [None]:
flat_samples = sampler.get_chain(discard=200, thin=20, flat=True)#[:,sampler.acceptance_fraction > 0.1,:]
#num_samples, num_chains, num_params = flat_samples.shape
#flat_samples = flat_samples.reshape((num_samples*num_chains, num_params))


In [None]:
flat_samples[:,0].shape

## Results

In [None]:
# View corner plot to check pairwise correlations
corner.corner(flat_samples)

In [None]:
plt.hist(flat_samples[:,0], bins=20)

## Use samples to estimate confidence intervals

In [None]:
pred_dfs = list()
optimiser = Optimiser()
default_params = optimiser.init_default_params(df_train)

In [None]:
# Solve the model for 1000 parameter vectors drawn (with replacement) from the
# flattened chain.
sample_indices = np.random.randint(len(flat_samples), size=1000)
posterior_samples = flat_samples
# Build the list from scratch instead of appending to it, so that re-running
# this cell does not pile duplicate predictions onto an earlier run's results.
pred_dfs = [
    optimiser.solve(convert_to_dict(posterior_samples[int(i)]), default_params, df_train)
    for i in tqdm(sample_indices)
]

In [None]:
# Index every prediction frame by date. Frames without a 'date' column
# (already indexed by an earlier run of this cell) are skipped, so re-running
# the cell no longer raises KeyError.
for df in pred_dfs:
    if 'date' in df.columns:
        df.set_index('date', inplace=True)

In [None]:
# Summary frame: a copy of the first prediction plus a pair of bound columns
# per original column. Bounds start as NaN rather than '' so the new columns
# hold floats instead of generic Python objects once they are filled in.
result = pred_dfs[0].copy()
for col in pred_dfs[0].columns:
    result["{}_low".format(col)] = np.nan
    result["{}_high".format(col)] = np.nan

In [None]:
def get_PI(date, key, multiplier=1.96):
    """Mean and symmetric interval of one quantity across all prediction frames.

    The value at ``.loc[date, key]`` is read from every frame in the
    module-level ``pred_dfs``; the interval is the mean plus or minus
    ``multiplier`` standard deviations (NumPy's default ``std``).

    Args:
        date: index label of the row to read.
        key: column to read.
        multiplier: half-width of the interval in standard deviations.

    Returns:
        Tuple ``(mu, low, high)``.
    """
    scaling_factor = 1
    values = np.array([frame.loc[date, key] for frame in pred_dfs])
    mu = values.mean()
    sigma = scaling_factor * values.std()
    half_width = multiplier * sigma
    return mu, mu - half_width, mu + half_width

In [None]:
 pred_dfs[0]

In [None]:
# Fill the summary frame cell by cell with the mean and interval bounds.
for date in tqdm(pred_dfs[0].index):
    for key in pred_dfs[0]:
        mean_value, lower, upper = get_PI(date, key)
        result.loc[date, key] = mean_value
        result.loc[date, "{}_low".format(key)] = lower
        result.loc[date, "{}_high".format(key)] = upper

In [None]:
result

## Visualize the intervals

In [None]:
# Observed totals (green) against the estimated mean and its bounds (red).
plt.figure(figsize=(10, 10))
plt.plot(df_train['total_infected'], c='g', label='Actual')
plt.plot(result['total_infected'].tolist(), c='r', label='Estimated')
for bound_column in ('total_infected_low', 'total_infected_high'):
    plt.plot(result[bound_column].tolist(), c='r', linestyle='dashdot')
plt.xlabel("Day")
plt.ylabel("Total infected")
plt.legend()