In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
import os
import sys

sys.path.append('../..')
from data.dataloader import get_covid19india_api_data, get_rootnet_api_data
from utils.age_standardisation import *

In [22]:
def standardise_age(district, state, area_name):
    """Print observed vs. age-standardised deaths-per-population for a district.

    Reads two notebook-level globals: ``district_timeseries`` (keyed by
    district) and ``age_data`` (keyed by state). The calculation is:

    1. ``implied``  = sum_k ref_age_band_ratios[k] * ref_mortality_rate[k]
    2. ``observed`` = total_deaths / district population (one value per date)
    3. ``ratio``    = observed / implied
    4. ``age_std``  = sum_k (ratio * ref_mortality_rate[k]) * district_ratio[k]

    Parameters
    ----------
    district : key into ``district_timeseries``; the value must have a
        'date' column (used as the index) and a 'total_deaths' column.
    state : key into ``age_data`` selecting the raw census table to clean.
    area_name : value matched exactly against the 'Area Name' column of the
        cleaned census table.

    Returns
    -------
    None. The district population and a frame with columns 'age_std' and
    'non_std' (indexed by date) are printed.

    Raises
    ------
    ValueError
        If no census row has 'Area Name' equal to ``area_name``.
    AssertionError
        If the age-band populations do not add up to the reported total.
    """
    data = district_timeseries[district].set_index('date')
    raw_all_district_age_data = age_data[state]

    all_district_age_data = clean(raw_all_district_age_data)

    # Get relevant district(s) data
    district_age_data = all_district_age_data[all_district_age_data['Area Name'] == area_name]
    if len(district_age_data) == 0:
        # Without this guard a mistyped area name leads to a zero population
        # and a frame full of NaN/inf instead of an error.
        raise ValueError(
            'no census rows with Area Name {!r} found for state {!r}'.format(area_name, state)
        )

    district_age_band_pops, district_total_pop = get_age_band_population(district_age_data)
    assert district_total_pop == sum(district_age_band_pops.values()), \
        'age band populations do not sum to the district total'

    district_age_band_ratios = {k: v / district_total_pop for k, v in district_age_band_pops.items()}

    # Share of the reference population in each age band
    # NOTE(review): the source of these ratios is not recorded here -- confirm.
    ref_age_band_ratios = {
        '0-9': 0.1144545912,
        '10-19': 0.1096780507,
        '20-29': 0.1387701325,
        '30-39': 0.1481915984,
        '40-49': 0.1548679659,
        '50-59': 0.1428622446,
        '60-69': 0.1092853481,
        '70-79': 0.05542319854,
        '80+': 0.02646687006,
    }

    # calculated here: https://docs.google.com/spreadsheets/d/1APX7XwoJPIbUXOgXa2vZNreDn6UseBucSGq9fh-N5jE/edit#gid=1859974541
    # Alternative reference tables are kept below (commented out) for comparison.
    # CFR based
    # ref_mortality_rate = {
    #     '0-9': 0.0002447933091,
    #     '10-19': 0.000377136454,
    #     '20-29': 0.0009389093188,
    #     '30-39': 0.001769966645,
    #     '40-49': 0.003645033792,
    #     '50-59': 0.01343325563,
    #     '60-69': 0.03884219978,
    #     '70-79': 0.08406020728,
    #     '80+': 0.1421208672,
    # }

    # # Deaths/Population based (excl SK)
    # ref_mortality_rate = {
    #     '0-9': 0.00000001922452747,
    #     '10-19': 0.00000001996437158,
    #     '20-29': 0.0000001540307269,
    #     '30-39': 0.0000004222770726,
    #     '40-49': 0.00000128438119,
    #     '50-59': 0.000005125162691,
    #     '60-69': 0.00001948507013,
    #     '70-79': 0.00009988464688,
    #     '80+': 0.0003878624777,
    # }

    # Deaths/Population based (excl SK/China)
    ref_mortality_rate = {
        '0-9': 0.00000008160306469,
        '10-19': 0.00000005586337197,
        '20-29': 0.0000005590581798,
        '30-39': 0.000001683722774,
        '40-49': 0.000005820174078,
        '50-59': 0.00002088458866,
        '60-69': 0.00007149402853,
        '70-79': 0.0003066451643,
        '80+': 0.0009211554807,
    }

    # age weighted mortality based on other countries
    implied_mortality = sum(ref_age_band_ratios[k] * ref_mortality_rate[k] for k in ref_mortality_rate)

    # observed deaths / district population, one value per date.
    # Dividing by population (not by known cases) keeps case-testing bias out
    # of the denominator.
    daily_observed_mortality = data['total_deaths'] / district_total_pop

    # how much different is it observed than 'implied'
    daily_mortality_ratio = daily_observed_mortality / implied_mortality

    # mortality rate accounting for observed/implied discrepancy
    age_stratified_daily_mortality = {
        k: daily_mortality_ratio * rate for k, rate in ref_mortality_rate.items()
    }

    # mortality rate accounting for observed/implied discrepancy and weighted by model location age
    age_std_mortality = sum(
        age_stratified_daily_mortality[k] * ratio for k, ratio in district_age_band_ratios.items()
    )

    # Side-by-side comparison; both series share the date index of `data`.
    checker = pd.DataFrame({
        'age_std': age_std_mortality,
        'non_std': daily_observed_mortality,
    })
    print(district_total_pop)
    print(checker)

In [19]:
# Pull the raw covid19india tables once; they feed the per-district series below.
dataframes = get_covid19india_api_data()

# Each district is listed next to the state it belongs to, then split into the
# two parallel lists that get_district_time_series is called with.
districts, states = (
    list(column)
    for column in zip(
        ('Mumbai', 'Maharashtra'),
        ('Bengaluru', 'Karnataka'),
        ('Ahmadabad', 'Gujarat'),
        ('Jaipur', 'Rajasthan'),
        ('Pune', 'Maharashtra'),
        ('New Delhi', 'Delhi'),
    )
)
district_timeseries = get_district_time_series(dataframes, state=states, district=districts)

42 deaths in Maharashtra with unknown district
adding 14 deaths to Mumbai/Pune each
164
178
4 deaths in Karnataka with unknown district
6 deaths in Gujarat with unknown district
adding 6 deaths to Ahmadabad count
20 deaths in Rajasthan with unknown district
42 deaths in Maharashtra with unknown district
adding 14 deaths to Mumbai/Pune each
51
65
53 deaths in Delhi with unknown district


In [20]:
# get age data in bands
# Census workbooks are identified by their file stem, 'DDW-<code>00C-13';
# the mapping below translates each expected stem to its state name.
# NOTE(review): <code> looks like the two-digit census state code -- confirm.
filenames = 'DDW-{}00C-13'
state_file_mapping = {
    filenames.format('24'): 'Gujarat',
    filenames.format('29'): 'Karnataka',
    filenames.format('27'): 'Maharashtra',
    filenames.format('07'): 'Delhi',
    filenames.format('08'): 'Rajasthan',
}

age_data = {}
directory = '../../data/data/census/'
# os.listdir returns every entry in arbitrary order, so sort for a
# deterministic load and ignore anything that is not a mapped census workbook
# (hidden files, Excel lock files, other states, ...).
for filename in sorted(os.listdir(directory)):
    file_stem = filename.split('.')[0]
    if file_stem not in state_file_mapping:
        continue
    df = pd.read_excel(os.path.join(directory, filename))
    # Drop rows that are empty in every column.
    age_data[state_file_mapping[file_stem]] = df.dropna(how='all')

In [5]:
# Census 'Area Name' labels, passed as `area_name` to standardise_age.
# NOTE(review): the number in parentheses is presumably the census district
# code within the state -- confirm against the census workbooks.

# Delhi
delhi = 'District - New Delhi (05)'

# Gujarat
amd = 'District - Ahmadabad (07)'

# Karnataka
bengaluru = 'District - Bangalore (18)'

# Maharashtra
mumbai = 'District - Mumbai (23)'
mumbai2 = 'District - Mumbai Suburban (22)'
pune = 'District - Pune (25)'

# Rajasthan
jaipur = 'District - Jaipur (12)'

In [23]:

# Run the comparison for one district at a time. To inspect another district,
# replace the active call with one of the alternatives listed here.
# NOTE(review): only the 'Mumbai' census district supplies the population here;
# `mumbai2` (Mumbai Suburban) is defined but never used -- confirm the death
# counts for 'Mumbai' cover the same area as this denominator.
#
# standardise_age('New Delhi', 'Delhi', delhi)
# standardise_age('Bengaluru', 'Karnataka', bengaluru)
# standardise_age('Pune', 'Maharashtra', pune)
# standardise_age('Ahmadabad', 'Gujarat', amd)
# standardise_age('Jaipur', 'Rajasthan', jaipur)
standardise_age(district='Mumbai', state='Maharashtra', area_name=mumbai)

3069834
                 age_std   non_std
date                              
2020-03-11  0.000000e+00  0.000000
2020-03-12  0.000000e+00  0.000000
2020-03-13  0.000000e+00  0.000000
2020-03-14  0.000000e+00  0.000000
2020-03-15  0.000000e+00  0.000000
2020-03-16  0.000000e+00  0.000000
2020-03-17  0.000000e+00  0.000000
2020-03-18  0.000000e+00  0.000000
2020-03-19  0.000000e+00  0.000000
2020-03-20  0.000000e+00  0.000000
2020-03-21  0.000000e+00  0.000000
2020-03-22  0.000000e+00  0.000000
2020-03-23  0.000000e+00  0.000000
2020-03-24  0.000000e+00  0.000000
2020-03-25  0.000000e+00  0.000000
2020-03-26  0.000000e+00  0.000000
2020-03-27  0.000000e+00  0.000000
2020-03-28  0.000000e+00  0.000000
2020-03-29  8.238735e-07  0.000002
2020-03-30  9.886482e-07  0.000002
2020-03-31  9.886482e-07  0.000002
2020-04-01  1.812522e-06  0.000004
2020-04-02  1.812522e-06  0.000004
2020-04-03  1.812522e-06  0.000004
2020-04-04  2.471621e-06  0.000005
2020-04-05  3.789818e-06  0.000007
2020-04-06  