In [None]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from covid_model_deaths.data import get_input_data

# Input CSVs read in the final cell of this notebook; both are expected to
# carry a `dcr_lag8` column (the ratio applied to the smoothed case rate).
# Earlier location-specific input, kept for reference only:
# DCR_FILE = '/home/j/temp/reed/misc/data_dcr_lag8.csv'
# Location-specific ratios (also needs `location_id` and `Country.Region`).
DCR_FILE = '/home/j/temp/reed/misc/data_dcr_lag8_all_locs_deaths_gte10_v3.csv'
# Overall mean ratio, used for any location missing from DCR_FILE.
DCR_MEAN_FILE = '/home/j/temp/reed/misc/data_dcr_lag8_overall_mean_v3.csv'


## Load case data and attach `Days` column for averaging

In [None]:
# Load the full input dataset and drop every row whose country is 'Georgia'.
# The explicit .copy() makes the filtered frame independent, so the in-place
# assignments below never write into a slice of the original frame.
df = get_input_data('full_data')
df = df.loc[df['Country/Region'] != 'Georgia'].copy()

# Rows without a Province/State take their Country/Region value as the
# location label.
df.loc[df['Province/State'].isnull(), 'Province/State'] = df['Country/Region']
df['location_id'] = df['location_id'].astype(int)
df = df.rename(columns={'Province/State': 'Location'})

# `Days` = whole days elapsed since each location's earliest date.
df['day0'] = df.groupby('location_id')['Date'].transform('min')
df['Days'] = (df['Date'] - df['day0']).apply(lambda delta: delta.days)

# Replace exact zeros with a tiny positive value so the log stays finite.
df.loc[df['Confirmed case rate'] == 0, 'Confirmed case rate'] = 1e-16
df['ln(confirmed case rate)'] = np.log(df['Confirmed case rate'])

# Keep only the columns used downstream, ordered by location and date.
df = df[['location_id', 'Location', 'Country/Region', 'Date', 'Days', 'ln(confirmed case rate)', 'population']]
df = df.sort_values(['location_id', 'Date']).reset_index(drop=True)
df.head()


## Do averaging

In [None]:
def _moving_average_cases(df):
    if df.location_id.unique().size != 1:
        raise ValueError('Multiple locations in dataset.')
    if df['Days'].min() != 0:
        raise ValueError('Not starting at 0')
    df = df.merge(pd.DataFrame({'Days': np.arange(df['Days'].min(), df['Days'].max()+1)}), how='outer')
    df = df.sort_values('Days').reset_index(drop=True)
    df.loc[df['Date'].isnull(), 'Date'] = (df.loc[df['Date'].isnull(), 'Days']
                                           .apply(lambda x: df['Date'].min() + timedelta(days=x)))
    # TODO: Document.
    df = df.fillna(method='pad')
    df['location_id'] = df['location_id'].astype(int)

    # FIXME: Shadowing variable from outer scope.  Make a separate
    #  function.
    def moving_3day_avg(day, df, measure):
        # determine difference
        days = np.array([day-1, day, day+1])
        days = days[days >= 0]
        days = days[days <= df['Days'].max()]
        avg = df.loc[df['Days'].isin(days), measure].mean()

        return avg

    # get diffs
    avgs = [moving_3day_avg(i, df, 'ln(confirmed case rate)') for i in df['Days']]
    df['Observed ln(confirmed case rate)'] = df['ln(confirmed case rate)']
    df['ln(confirmed case rate)'] = avgs

    # replace last point w/ daily value over 3->2 and 2->1 and the first
    # with 1->2, 2->3; use observed if 3 data points or less
    if len(df) > 3:
        last_step = np.mean(np.array(avgs[-3:-1]) - np.array(avgs[-4:-2]))
        df['ln(confirmed case rate)'][len(df)-1] = (df['ln(confirmed case rate)'][len(df)-2]
                                                            + last_step)
        first_step = np.mean(np.array(avgs[2:4]) - np.array(avgs[1:3]))
        df['ln(confirmed case rate)'][0] = df['ln(confirmed case rate)'][1] - first_step
    else:
        df['ln(confirmed case rate)'] = df['Observed ln(confirmed case rate)']

    return df
# Split the prepared frame into one frame per location (each with a fresh
# 0..n-1 index), smooth each one independently, then stack the results.
loc_dfs = []
for location in df['location_id'].unique():
    single_location = df.loc[df['location_id'] == location]
    loc_dfs.append(single_location.reset_index(drop=True))

smoothed_dfs = [_moving_average_cases(single_location) for single_location in loc_dfs]
loc_df = pd.concat(smoothed_dfs)

# Discard rows whose smoothed value is still missing.
has_value = ~loc_df['ln(confirmed case rate)'].isnull()
loc_df = loc_df.loc[has_value]
loc_df.head()


## Get Reed's death-to-case ratio (DCR) data and use it to produce deaths from smoothed cases
#### Call it `ln(age-standardized death rate)` just to make it compatible with data we will have in death model

In [None]:
# location-specific dataset
dcr_df = pd.read_csv(DCR_FILE)
dcr_df = dcr_df.loc[dcr_df['Country.Region'] != 'Georgia']
dcr_df = dcr_df[['location_id', 'dcr_lag8']]

# use average for all locations that don't have specific
dcr_mean_df = pd.read_csv(DCR_MEAN_FILE)
for location_id in [i for i in loc_df['location_id'].unique() if i not in dcr_df['location_id'].to_list()]:
    _dcr_df = dcr_mean_df.copy()
    _dcr_df['location_id'] = location_id
    dcr_df = dcr_df.append(_dcr_df[['location_id', 'dcr_lag8']])
dcr_df = dcr_df.reset_index(drop=True)

# manually fix Iceland (this is their ratio on 04/16)
dcr_df.loc[dcr_df['location_id'] == 83, 'dcr_lag8'] = 8.0 / 1616.0

death_df = loc_df.merge(dcr_df[['location_id', 'dcr_lag8']])
death_df['Confirmed case rate'] = np.exp(death_df['ln(confirmed case rate)'])
death_df['Death rate'] = death_df['Confirmed case rate'] * death_df['dcr_lag8']
death_df['ln(age-standardized death rate)'] = np.log(death_df['Death rate'])

# shift data forward 8 days
death_df['Date'] = death_df['Date'].apply(lambda x: x + timedelta(days=8))
death_df = death_df[['location_id', 'Location', 'Country/Region', 'Date', 'ln(age-standardized death rate)']]

# save data
death_df.to_csv(f"/ihme/covid-19/deaths/mobility_inputs/{datetime.now().strftime('%Y_%m_%d')}/deaths_from_cases.csv", 
                index=False)
death_df.head()
