In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from hyperopt import hp, tpe, fmin, Trials
from tqdm import tqdm
# from tqdm.notebook import tqdm

from collections import OrderedDict
import itertools
from functools import partial
import datetime
from joblib import Parallel, delayed

from models.seir.seir_testing import SEIR_Testing
from data.dataloader import get_global_data, get_indian_data
from data.bbmp_dataloader import loadbbmpdata
from utils.plotting import create_plots
from utils.generic import init_params

## Load BBMP Data

In [None]:
# Load the raw BBMP export; the loader hands back a per-patient frame and an
# aggregate frame.
df, df_agg = loadbbmpdata('bbmp-raw.csv')

# Drop the first nine and the last three rows of the aggregate table and
# renumber what is left from zero.
# NOTE(review): the reason for trimming exactly these rows is not visible here.
df_agg = df_agg.iloc[9:-3, :].reset_index(drop=True)

In [None]:
# Read the patient line list and cut it down to the columns used below.
filename = 'bbmp-linelist-13-04-20.csv'
df = pd.read_csv('../../data/data/' + filename)

# Give two headers their canonical names, then Title-Case every header so the
# following cells can rely on a single spelling.
df = df.rename(columns={
    'Result declared on': 'Result Declaration Date',
    'Status Date': 'Release Date',
})
df.columns = df.columns.str.title()

# Keep the identifier, the two status fields and every date field.
columns = [
    name for name in df.columns
    if name in ('Sl. No.', 'Current Status', 'Status') or 'Date' in name
]
df = df[columns]

# Every patient whose status is 'Death' gets the same fixed death date.
# NOTE(review): presumably the snapshot date of this line list — confirm.
is_death = df['Status'] == 'Death'
df.loc[is_death, 'Death Date'] = '13/04/2020'

In [None]:
# Missing dates show up either as NaN or as a literal '-'. Stamp both with the
# placeholder 01.01.2000 so that no date cell is left empty.
date_columns = [name for name in df.columns if 'Date' in name]
for column in date_columns:
    is_blank = df[column].isna() | (df[column] == '-')
    df.loc[is_blank, column] = '01.01.2000'

In [None]:
# Normalise the date separator: rewrite dotted dates (e.g. 01.01.2000) with
# slashes so every date column uses the same day/month/year spelling.
date_columns = [x for x in df.columns if 'Date' in x]
for column in date_columns:
    # regex=False: '.' must be a literal dot, not the regex "any character".
    # The vectorised .str accessor also passes non-string cells through as NaN
    # instead of raising AttributeError like a per-element x.replace() would.
    df[column] = df[column].str.replace('.', '/', regex=False)

In [None]:
# Inspect the line list after placeholder filling and separator normalisation.
df

In [None]:
# # Convert to pd.datetime
# date_columns = [x for x in df.columns if 'Date' in x]
# for column in date_columns:
#     if column != 'Release Date':
#         df[column] = df[column].apply(lambda x : x.strip())
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%d/%m/%Y', errors='ignore')
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%d.%m.%Y', errors='ignore')
#     else:
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], errors='ignore')
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%m.%d.%Y', errors='ignore')

In [None]:
# Parse every date column (day/month/year strings) into real timestamps.
date_columns = [x for x in df.columns if 'Date' in x]
for column in date_columns:
    # errors='coerce' turns an unparseable entry into NaT. The previous
    # errors='ignore' is deprecated and, on a single bad value, silently handed
    # the whole column back as strings, which only failed later on `.year`.
    # Plain column assignment (rather than df.loc[:, column] = ...) lets the
    # column take the datetime64 dtype instead of keeping its object dtype.
    df[column] = pd.to_datetime(
        df[column].str.strip(), format='%d/%m/%Y', errors='coerce'
    )

In [None]:
# Inspect the line list after the datetime conversion.
df

In [None]:
# Undo the placeholder: any date that falls in the year 2000 is the
# 01/01/2000 stand-in for "missing", so blank it out again.
date_columns = [name for name in df.columns if 'Date' in name]
for column in date_columns:
    is_placeholder = df[column].map(lambda stamp: stamp.year == 2000)
    df.loc[is_placeholder, column] = np.nan

In [None]:
# Inspect the line list with the 01/01/2000 placeholders turned back into missing values.
df

In [None]:
# Number of patients without a hospitalisation date.
int(df['Date Of Hospitalization'].isna().sum())

In [None]:
# Create ICU and Ventilator variables from the free-text current status.
# Missing statuses are treated as an empty string, i.e. neither flag is set.
status_text = df['Current Status'].fillna('').astype(str).str.lower()
df['On Ventilator'] = status_text.str.contains('ventilator', regex=False)
# A ventilated patient is counted as an ICU patient as well.
df['On ICU'] = status_text.str.contains('icu', regex=False) | df['On Ventilator']


def _days_between(later, earlier):
    """Whole days from `earlier` to `later`, floored at 0.

    Rows where either date is missing yield NaN. `.dt.days` is used instead of
    `.astype('timedelta64[D]')`, which pandas >= 2.0 rejects with a TypeError.
    """
    return (later - earlier).dt.days.clip(lower=0)


# Create Exposed, Infectious, Hospitalisation Time variables (in days)
df['Exposed Time'] = _days_between(df['Date Of Onset Of Symptoms'], df['Date Of Arrival To India'])
df['Infectious Time'] = _days_between(df['Date Of Hospitalization'], df['Date Of Onset Of Symptoms'])
df['Hospitalisation Time'] = _days_between(df['Release Date'], df['Result Declaration Date'])

In [None]:
# Average onset-to-hospitalisation delay in days (missing values are skipped).
df['Infectious Time'].mean()

In [None]:
# Fill in missing values
#
# Two typical delays, both rounded to whole days and measured on the rows
# where the dates are actually known (i.e. before anything is imputed):
#   * onset of symptoms -> hospitalisation
#   * hospitalisation   -> result declaration (negative gaps count as 0)
infectious_time = int(round(df['Infectious Time'].mean()))
# `.dt.days` replaces `.astype('timedelta64[D]')`, which pandas >= 2.0 rejects.
result_delay_days = (
    df['Result Declaration Date'] - df['Date Of Hospitalization']
).dt.days.clip(lower=0)
time_delay_int_result_declaration = int(round(result_delay_days.mean()))

onset_to_admission = pd.Timedelta(days=infectious_time)
admission_to_result = pd.Timedelta(days=time_delay_int_result_declaration)

# Hospitalisation date: prefer onset + typical delay; otherwise reuse the
# (already filled) date of the previous patient in the list. The forward fill
# reproduces the old `df.loc[i - 1, ...]` lookup without assuming a 0..n-1
# index. The trailing back fill covers leading rows with neither date, which
# used to raise KeyError on the first row.
onset_known = df['Date Of Onset Of Symptoms']
admission = df['Date Of Hospitalization'].fillna(onset_known + onset_to_admission)
admission = admission.ffill().bfill()
df['Date Of Hospitalization'] = admission

# Everything else is anchored on the (now complete) hospitalisation date.
df['Date Of Onset Of Symptoms'] = onset_known.fillna(admission - onset_to_admission)
df['Date Of Sample Collection'] = df['Date Of Sample Collection'].fillna(admission)
df['Result Declaration Date'] = df['Result Declaration Date'].fillna(
    admission + admission_to_result
)

In [None]:
# Inspect the line list after imputing the missing dates.
df

In [None]:
# Create processed dataframe from bbmp data: one row per calendar day, one
# running count per compartment, built by adding each patient to every day
# on which they belong to that compartment.
date_columns = [x for x in df.columns if 'Date' in x]

# The calendar spans the earliest to the latest date found in any date column.
# Collecting the per-column extrema in a Series lets pandas skip NaT, rather
# than relying on np.nanmin/np.nanmax over an object array of Timestamps.
start_date = pd.Series([df[column].min() for column in date_columns]).min()
end_date = pd.Series([df[column].max() for column in date_columns]).max()
daterange = pd.date_range(start=start_date, end=end_date)

# Integer zeros from the start (an empty frame assigned 0 stays object dtype).
df_agg = pd.DataFrame(
    0,
    index=daterange,
    columns=['Active Infections (Unknown)', 'Hospitalised', 'On ICU', 'On Ventilator', 'Fatalities',
             'Total Infected', 'Total Infected (Unknown)', 'Recovered'],
)


def _add_case(frame, column, start=None, end=None):
    """Add one case to `column` on every day from `start` to `end`, inclusive.

    `None` leaves that side of the window open, so `end=None` means
    "from `start` until the last day of the table".
    """
    frame.loc[start:end, column] += 1


one_day = datetime.timedelta(days=1)

for _, row in df.iterrows():
    onset = row['Date Of Onset Of Symptoms']
    admitted = row['Date Of Hospitalization']
    released = row['Release Date']
    died = row.get('Death Date')

    # Symptomatic but not yet hospitalised.
    _add_case(df_agg, 'Active Infections (Unknown)', onset, admitted)

    # Outcome. A death date takes precedence over a release date: previously a
    # row with both was counted as Recovered and never reached the fatality
    # branch.
    # NOTE(review): 'Release Date' is the renamed 'Status Date', so it is
    # presumably populated for deceased patients too — confirm against the data.
    if not pd.isna(died):
        _add_case(df_agg, 'Fatalities', died)
        last_day_in_hospital = died - one_day
    elif not pd.isna(released):
        _add_case(df_agg, 'Recovered', released)
        last_day_in_hospital = released - one_day
    else:
        # Still admitted: occupies a bed until the end of the table.
        last_day_in_hospital = None

    # Bed occupancy from admission up to the day before the outcome.
    _add_case(df_agg, 'Hospitalised', admitted, last_day_in_hospital)
    if row['On ICU']:
        _add_case(df_agg, 'On ICU', admitted, last_day_in_hospital)
    if row['On Ventilator']:
        _add_case(df_agg, 'On Ventilator', admitted, last_day_in_hospital)

    # Cumulative counts: known from admission, unknown from symptom onset.
    _add_case(df_agg, 'Total Infected', admitted)
    _add_case(df_agg, 'Total Infected (Unknown)', onset)

# Turn the date index into a regular 'Date' column and persist the table.
df_agg = df_agg.rename_axis('Date').reset_index()

df_agg.to_csv('../../data/data/bbmp-processed-13-04.csv', index=False)

In [None]:
# Display the aggregated daily table that was just written to CSV.
df_agg