In [None]:
import pandas as pd
import numpy as np
import requests
import datetime
import os
import sys

sys.path.append('../..')
from data.dataloader import Covid19IndiaLoader
from utils.age_standardisation import *

In [None]:
def standardise_age(district, state, area_name):
    """Build observed and age-standardised mortality series for one district.

    The observed mortality (``total_deaths`` divided by the district's census
    population) is compared with the mortality implied by a reference age
    structure and reference per-age-band mortality rates. The resulting ratio
    rescales the reference rates, which are then re-weighted by the district's
    own age-band population shares.

    Relies on the notebook-level globals ``district_timeseries`` and
    ``age_data`` (both are created in later cells of this notebook, so those
    cells must be run before calling this function) and on ``clean`` /
    ``get_age_band_population`` from ``utils.age_standardisation``.

    Args:
        district: key into ``district_timeseries``.
        state: key into ``age_data`` (census sheet for the state).
        area_name: value of the census ``'Area Name'`` column to select.

    Returns:
        pandas.DataFrame indexed like the district time series (by ``date``)
        with columns ``age_std`` (age-standardised mortality) and ``non_std``
        (observed deaths / district population).

    Raises:
        ValueError: if no census row matches ``area_name``.
        AssertionError: if the age-band populations do not sum to the total.
    """
    data = district_timeseries[district].set_index('date')
    all_district_age_data = clean(age_data[state])

    # Get relevant district(s) data
    district_age_data = all_district_age_data[all_district_age_data['Area Name'] == area_name]
    if len(district_age_data) == 0:
        raise ValueError(f"no census rows with 'Area Name' == {area_name!r} for state {state!r}")

    district_age_band_pops, district_total_pop = get_age_band_population(district_age_data)
    assert district_total_pop == sum(district_age_band_pops.values()), \
        'age-band populations do not add up to the district total'

    district_age_band_ratios = {k: v / district_total_pop for k, v in district_age_band_pops.items()}

    # Reference population share per age band.
    ref_age_band_ratios = {
        '0-9': 0.1144545912,
        '10-19': 0.1096780507,
        '20-29': 0.1387701325,
        '30-39': 0.1481915984,
        '40-49': 0.1548679659,
        '50-59': 0.1428622446,
        '60-69': 0.1092853481,
        '70-79': 0.05542319854,
        '80+': 0.02646687006,
    }

    # calculated here: https://docs.google.com/spreadsheets/d/1APX7XwoJPIbUXOgXa2vZNreDn6UseBucSGq9fh-N5jE/edit#gid=1859974541
    # CFR based
    ref_mortality_rate = {
        '0-9': 0.0002447933091,
        '10-19': 0.000377136454,
        '20-29': 0.0009389093188,
        '30-39': 0.001769966645,
        '40-49': 0.003645033792,
        '50-59': 0.01343325563,
        '60-69': 0.03884219978,
        '70-79': 0.08406020728,
        '80+': 0.1421208672,
    }

    # Alternative reference tables, kept for comparison.
    # # Deaths/Population based (excl SK)
    # ref_mortality_rate = {
    #     '0-9': 0.00000001922452747,
    #     '10-19': 0.00000001996437158,
    #     '20-29': 0.0000001540307269,
    #     '30-39': 0.0000004222770726,
    #     '40-49': 0.00000128438119,
    #     '50-59': 0.000005125162691,
    #     '60-69': 0.00001948507013,
    #     '70-79': 0.00009988464688,
    #     '80+': 0.0003878624777,
    # }

    # Deaths/Population based (excl SK/China)
    # ref_mortality_rate = {
    #     '0-9': 0.00000008160306469,
    #     '10-19': 0.00000005586337197,
    #     '20-29': 0.0000005590581798,
    #     '30-39': 0.000001683722774,
    #     '40-49': 0.000005820174078,
    #     '50-59': 0.00002088458866,
    #     '60-69': 0.00007149402853,
    #     '70-79': 0.0003066451643,
    #     '80+': 0.0009211554807,
    # }

    # age weighted mortality based on other countries
    implied_mortality = sum(
        ref_age_band_ratios[band] * rate for band, rate in ref_mortality_rate.items()
    )

    # Observed mortality: the `total_deaths` column divided by the district's
    # census population (note: population, not known cases).
    daily_observed_mortality = data['total_deaths'] / district_total_pop

    # how much different is it observed than 'implied'
    daily_mortality_ratio = daily_observed_mortality / implied_mortality

    # mortality rate accounting for observed/implied discrepancy
    age_stratified_daily_mortality = {
        band: daily_mortality_ratio * rate for band, rate in ref_mortality_rate.items()
    }

    # mortality rate accounting for observed/implied discrepancy and weighted by model location age
    age_std_mortality = sum(
        age_stratified_daily_mortality[band] * share
        for band, share in district_age_band_ratios.items()
    )

    # Both series share the time-series index, so the frame keeps it.
    return pd.DataFrame({
        'age_std': age_std_mortality,
        'non_std': daily_observed_mortality,
    })

In [None]:
# Fetch the API tables through the project loader.
loader = Covid19IndiaLoader()
dataframes = loader.get_covid19india_api_data()

# Each district is listed next to the state it belongs to; the two parallel
# lists passed to get_district_time_series are derived from these pairs.
district_state_pairs = [
    ('Mumbai', 'Maharashtra'),
    ('Bengaluru', 'Karnataka'),
    ('Ahmadabad', 'Gujarat'),
    ('Jaipur', 'Rajasthan'),
    ('Pune', 'Maharashtra'),
    ('New Delhi', 'Delhi'),
]
districts = [district_name for district_name, _ in district_state_pairs]
states = [state_name for _, state_name in district_state_pairs]

district_timeseries = get_district_time_series(
    dataframes,
    state=states,
    district=districts,
)

In [None]:
# get age data in bands
# Census workbooks are named 'DDW-<code>00C-13.<ext>'; map each file stem to its state.
filenames = 'DDW-{}00C-13'
state_file_mapping = {
    filenames.format('24'): 'Gujarat',
    filenames.format('29'): 'Karnataka',
    filenames.format('27'): 'Maharashtra',
    filenames.format('07'): 'Delhi',
    filenames.format('08'): 'Rajasthan',
}

age_data = {}
directory = '../../data/data/census/'
# sorted() makes the load order deterministic across platforms.
for filename in sorted(os.listdir(directory)):
    state_name = state_file_mapping.get(filename.split('.')[0])
    if state_name is None:
        # Not one of the mapped census workbooks (e.g. hidden files or
        # spreadsheet lock files); skipping avoids a KeyError on the lookup.
        continue
    census_df = pd.read_excel(os.path.join(directory, filename))
    # Drop rows that are entirely empty.
    age_data[state_name] = census_df.dropna(how='all')

In [None]:
# Census 'Area Name' labels for the districts of interest. One of these is
# passed as `area_name` to standardise_age, which matches it against the
# 'Area Name' column of the cleaned census sheet.
amd = 'District - Ahmadabad (07)'
bengaluru = 'District - Bangalore (18)'
delhi = 'District - New Delhi (05)'
jaipur = 'District - Jaipur (12)'
mumbai = 'District - Mumbai (23)'
mumbai2 = 'District - Mumbai Suburban (22)'
pune = 'District - Pune (25)'

In [None]:

# Pick the district to model. Other (district, state, area label) choices:
#   'New Delhi', 'Delhi',       delhi
#   'Bengaluru', 'Karnataka',   bengaluru
#   'Pune',      'Maharashtra', pune
#   'Ahmadabad', 'Gujarat',     amd
#   'Jaipur',    'Rajasthan',   jaipur
# reset_index turns the index of the returned frame back into a column.
age_std = (
    standardise_age('Mumbai', 'Maharashtra', mumbai)
    .reset_index(col_fill='date')
)
age_std

In [None]:
from datetime import timedelta, datetime

import curvefit
from curvefit.core.utils import data_translator
from curvefit.pipelines.basic_model import BasicModel

sys.path.append('../..')
from utils.data import Params
from models.ihme.plotting import Plotter

In [None]:
# load params
daily, smoothing_window = False, False
# params = Params(args.params)
# NOTE(review): with the line above commented out, `params` is not defined in
# this cell, yet fit_predict_plot passes `params` to Plotter -- confirm where
# it is meant to come from.

# All rows belong to a single group. This adds the column to `age_std` in
# place; `df` and `agg_df` are aliases of the same frame, not copies.
age_std['group'] = 1
df = age_std
agg_df = age_std

from curvefit.core.functions import *
func = erf
# set vars
date, groupcol = 'date', 'group'
# NOTE(review): xcol is the 'date' column while predictx below is an integer
# day index -- confirm the model is meant to be fitted against dates.
xcol, ycol = 'date', 'age_std'
daysforward, daysback = 90, -10
pipeline_run_args = {
    "n_draws": 20,
    "cv_threshold": 1e-2,
    "smoothed_radius": [7, 7],
    "num_smooths": 3,
    "exclude_groups": [],
    "exclude_below": 0,
    "exp_smoothing": None,
    "max_last": None,
}

priors = {
    "fe_init": [0.33, 0.5, 0.66],
    "fe_bounds": [[0, 1], [1, 100], [1, 10000]],
}
# output
fname = 'age_std_mumbai'
output_folder = f'output/pipeline/{fname}'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(output_folder, exist_ok=True)

# Prediction grid: day offsets -daysback .. daysforward-1 counted from the
# first observed date; predictx is the same grid shifted by one.
first_date = df[date].iloc[0]
prediction_offsets = range(-daysback, daysforward)
predictdate = pd.to_datetime(pd.Series([timedelta(days=x) + first_date for x in prediction_offsets]))
predictx = np.array([x + 1 for x in prediction_offsets])

# link functions
identity_fun = lambda x: x
exp_fun = lambda x : np.exp(x)

In [None]:
def fit_predict_plot(curve_model, xcol, ycol, data, test, func, pargs=None, orig_ycol=None):
    """Run the curve-fitting pipeline, plot its draws/predictions, and summarise the draws.

    Besides its arguments, this function reads several notebook-level globals:
    ``predictx``, ``predictdate``, ``fname``, ``output_folder``, ``df``,
    ``daysback``, ``smoothing_window`` and ``daily`` (set in the configuration
    cell), plus ``params``, ``seed``, ``agg_data``, ``agg_test``,
    ``multigroup`` and ``dailycol`` (only read when ``daily`` is truthy).
    NOTE(review): the latter six are not defined in the visible cells of this
    notebook and must exist before calling, otherwise a NameError is raised.

    Args:
        curve_model: pipeline object exposing ``setup_pipeline``, ``run``,
            ``mod``, ``groups``, ``draws``, ``mean_predictions`` and
            ``predict_space`` (a ``BasicModel`` in this notebook).
        xcol: unused here; kept for interface compatibility.
        ycol: name of the fitted observation column (forwarded to Plotter).
        data: unused here; kept for interface compatibility.
        test: forwarded to ``Plotter.plot_predictions``.
        func: curve function forwarded to Plotter.
        pargs: optional overrides for the ``run`` arguments defaulted below.
        orig_ycol: forwarded to ``Plotter.plot_predictions``.

    Returns:
        Tuple ``(mean_fit, lower, mean, upper)`` for the LAST group in
        ``curve_model.groups``: the mean prediction, the 2.5% quantile, the
        mean and the 97.5% quantile of the draws. All four are ``None`` when
        the pipeline has no groups.
    """
    p_args = {
        "n_draws": 5,
        "cv_threshold": 1e-2,
        "smoothed_radius": [3, 3],
        "num_smooths": 3,
        "exclude_groups": [],
        "exclude_below": 0,
        "exp_smoothing": None,
        "max_last": None,
    }
    if pargs:
        p_args.update(pargs)

    # pipeline -- operate on the model that was passed in, not on a global.
    curve_model.setup_pipeline()
    curve_model.run(
        n_draws=p_args['n_draws'],
        prediction_times=predictx,
        cv_threshold=p_args['cv_threshold'],
        smoothed_radius=p_args['smoothed_radius'],
        num_smooths=p_args['num_smooths'],
        exclude_groups=p_args['exclude_groups'],
        exclude_below=p_args['exclude_below'],
        exp_smoothing=p_args['exp_smoothing'],
        max_last=p_args['max_last'],
    )
    params_estimate = curve_model.mod.params
    print(params_estimate)
    dailycolname = dailycol.format(ycol=ycol) if daily else None

    plotter = Plotter(curve_model, params, predictdate, predictx, f'{fname}_{seed}', output_folder, ycol, func)
    plotter.plot_draws(dailycolname=dailycolname)

    # plot_prediction calls these functions:
        # group_predictions, predictions = predict(func, multigroup)
        # calc_error(test, predictions, agg_data, daysback)
    plotter.plot_predictions(df, agg_data, agg_test, orig_ycol, test, daysback, smoothing_window, multigroup, dailycolname)

    # Now, all plotting is complete. Re-acquire detailed draws information for output (csv)
    # Reliability of these numbers is questionable. Uncertainty metric evaluation ongoing.
    mean_fit = lower = mean = upper = None
    for group in curve_model.groups:
        # x = prediction_times = predictx
        # Input and output space are both predict_space in these translations.
        draws = data_translator(
            data=curve_model.draws[group].copy(),
            input_space=curve_model.predict_space,
            output_space=curve_model.predict_space
        )
        mean_fit = data_translator(
            data=curve_model.mean_predictions[group].copy(),  # predictions
            input_space=curve_model.predict_space,
            output_space=curve_model.predict_space
        )
        mean = draws.mean(axis=0)
        # uncertainty
        lower = np.quantile(draws, axis=0, q=0.025)
        upper = np.quantile(draws, axis=0, q=0.975)

    return mean_fit, lower, mean, upper

In [None]:
# Add the model's helper columns to `df`, build the BasicModel pipeline and run it.
# Constant covariate: 1.0 on every row.
df.loc[:,'covs'] = len(df) * [ 1.0 ]
# Indicator: 1.0 for dates on/after 2020-03-24, else 0.0 (each value is wrapped in a
# one-element list before .tolist()). `datetime` must be the class imported via
# `from datetime import timedelta, datetime`, not the `datetime` module.
# NOTE(review): 'sd' is not referenced elsewhere in the visible notebook -- confirm it is needed.
df.loc[:,'sd'] = df[date].apply(lambda x: [1.0 if x >= datetime(2020, 3, 24) else 0.0]).tolist()
# Observation column scaled by its maximum.
# NOTE(review): this column is not referenced elsewhere in the visible notebook.
df.loc[:,f'{ycol}_normalized'] = df[ycol]/df[ycol].max()

# Three curve parameters, each tied to the same constant covariate column.
param_names  = [ 'alpha', 'beta', 'p' ]
covs = ['covs', 'covs', 'covs']
# link_fun = [ identity_fun, exp_fun, exp_fun ]
link_fun = [ exp_fun, identity_fun, exp_fun ] # According to their methods should be
var_link_fun = [ identity_fun, identity_fun, identity_fun ]
# var_link_fun = link_fun


pipeline = BasicModel(
    all_data=df, #: (pd.DataFrame) of *all* the data that will go into this modeling pipeline
    col_t=xcol, #: (str) name of the column with time
    col_group=groupcol, #: (str) name of the column with the group in it
    col_obs=ycol, #: (str) the name of the column with observations for fitting the model
    col_obs_compare=ycol,
    all_cov_names=covs, #: covariate column names (here the constant 'covs' column, once per parameter)
    fun=func, #: (callable) the space to fit in, one of curvefit.functions
    predict_space=func,
    obs_se_func=None,
    fit_dict=priors, #: keyword arguments to CurveModel.fit_params()
    basic_model_dict= { #: additional keyword arguments to the CurveModel class
        'col_obs_se': None,#(str) of observation standard error
        'col_covs': [[cov] for cov in covs],
        'param_names': param_names,#(list{str}):
        'link_fun': link_fun,#(list{function}):
        'var_link_fun': var_link_fun,#(list{function}):
    },
)
# NOTE(review): `data`, `test` and `orig_ycol` are not defined in the visible cells of this
# notebook; they must exist in the kernel before this line runs -- confirm their source.
fit_predict_plot(pipeline, xcol, ycol, data, test, func, pargs=pipeline_run_args, orig_ycol=orig_ycol)