In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from hyperopt import hp, tpe, fmin, Trials
from tqdm import tqdm
# from tqdm.notebook import tqdm

from collections import OrderedDict
import itertools
from functools import partial
import datetime
from joblib import Parallel, delayed

from models.seir.seir_testing import SEIR_Testing
from data.dataloader import get_global_data, get_indian_data
from data.bbmp_dataloader import loadbbmpdata
from utils.plotting import create_plots
from utils.generic import init_params

## Load BBMP Data

In [70]:
# Load the raw BBMP export: the project loader returns a per-case frame and an aggregate frame.
df, df_agg = loadbbmpdata('bbmp-raw.csv')
# Drop the first 9 and the last 3 aggregate rows, then renumber the remaining rows from 0.
df_agg = df_agg.iloc[9:-3, :].reset_index(drop=True)

In [120]:
# Line-list snapshot to process; it is read from ../../data/data/.
filename = 'bbmp-linelist-13-04-20.csv'
df = pd.read_csv('../../data/data/{}'.format(filename))

# Harmonise two header names, then Title-Case every header.
header_aliases = {
    'Result declared on': 'Result Declaration Date',
    'Status Date': 'Release Date',
}
df = df.rename(columns=header_aliases).rename(columns=str.title)

# Keep the serial number, the two status fields and every column whose name mentions 'Date'.
non_date_fields = ['Sl. No.', 'Current Status', 'Status']
columns = [name for name in df.columns if ('Date' in name) or (name in non_date_fields)]
df = df[columns]

# Rows whose Status is 'Death' get a fixed death date (same day as the snapshot in the file name).
is_deceased = df['Status'] == 'Death'
df.loc[is_deceased, 'Death Date'] = '13/04/2020'

In [121]:
# Mark every missing date (NaN or a lone '-') with the placeholder 01.01.2000
# so that all date cells are strings before parsing.
date_columns = [name for name in df.columns if 'Date' in name]
for column in date_columns:
    is_missing = df[column].isna() | (df[column] == '-')
    df.loc[is_missing, column] = '01.01.2000'

In [122]:
# Normalise the date separator: turn every '.' into '/' (e.g. 01.01.2000 -> 01/01/2000)
# so that a single dd/mm/YYYY format can be used when parsing.
date_columns = [name for name in df.columns if 'Date' in name]
for column in date_columns:
    df[column] = [text.replace('.', '/') for text in df[column]]

In [123]:
# Display the line list: date columns are still text here, with 01/01/2000 standing in for missing entries.
df

Unnamed: 0,Sl. No.,Date Of Arrival To India,Date Of Onset Of Symptoms,Date Of Hospitalization,Date Of Sample Collection,Result Declaration Date,Current Status,Status,Release Date,Death Date
0,1,01/03/2020,04/03/2020,08/03/2020,05/03/2020,08/03/2020,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,26/03/2020,01/01/2000
1,2,01/01/2000,08/03/2020,08/03/2020,08/03/2020,09/03/2020,Second sample NEGATIVE on 19.03.2020 ; Discharged,Recovered,23/03/2020,01/01/2000
2,3,01/01/2000,08/03/2020,08/03/2020,08/03/2020,09/03/2020,Asymtomatic; Sample Negative on 21.03.2020 ; D...,Recovered,26/03/2020,01/01/2000
3,4,08/03/2020,04/03/2020,09/03/2020,08/03/2020,09/03/2020,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,24/03/2020,01/01/2000
4,5,06/03/2020,09/03/2020,11/03/2020,10/03/2020,11/03/2020,Second sample NEGATIVE on 19.03.2020; Discharg...,Recovered,19/03/2020,01/01/2000
...,...,...,...,...,...,...,...,...,...,...
70,71,01/01/2000,09/04/2020,10/04/2020,01/01/2000,10/04/2020,Isolated in the hospital,Active,01/01/2000,01/01/2000
71,72,01/01/2000,06/04/2020,10/04/2020,08/04/2020,10/04/2020,Isolated in the hospital,Active,01/01/2000,01/01/2000
72,73,01/01/2000,01/01/2000,11/04/2020,01/01/2000,01/01/2000,Isolated in the hospital,Active,01/01/2000,01/01/2000
73,74,01/01/2000,01/01/2000,11/04/2020,01/01/2000,01/01/2000,Isolated in the hospital,Active,01/01/2000,01/01/2000


In [124]:
# # Convert to pd.datetime
# date_columns = [x for x in df.columns if 'Date' in x]
# for column in date_columns:
#     if column != 'Release Date':
#         df[column] = df[column].apply(lambda x : x.strip())
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%d/%m/%Y', errors='ignore')
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%d.%m.%Y', errors='ignore')
#     else:
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], errors='ignore')
#         df.loc[:, column] = pd.to_datetime(df.loc[:, column], format='%m.%d.%Y', errors='ignore')

In [125]:
# Parse every date column from 'dd/mm/YYYY' text into pandas datetimes.
date_columns = [x for x in df.columns if 'Date' in x]
for column in date_columns:
    # Remove stray surrounding whitespace so values like ' 08/03/2020' still match the format.
    stripped = df[column].str.strip()
    # Deliberately no errors='ignore': that option hands the column back as untouched strings
    # when a single value fails to parse, which only surfaces later as an obscure failure on
    # `.year`. Letting the parse error raise here points straight at the offending value.
    # Assigning with df[column] (not df.loc[:, column]) replaces the column, so it takes the
    # datetime64 dtype instead of keeping the old object dtype.
    df[column] = pd.to_datetime(stripped, format='%d/%m/%Y')

In [126]:
# Display the line list after date parsing; the 2000-01-01 placeholder still marks missing dates.
df

Unnamed: 0,Sl. No.,Date Of Arrival To India,Date Of Onset Of Symptoms,Date Of Hospitalization,Date Of Sample Collection,Result Declaration Date,Current Status,Status,Release Date,Death Date
0,1,2020-03-01,2020-03-04,2020-03-08,2020-03-05,2020-03-08,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-26,2000-01-01
1,2,2000-01-01,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Second sample NEGATIVE on 19.03.2020 ; Discharged,Recovered,2020-03-23,2000-01-01
2,3,2000-01-01,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020 ; D...,Recovered,2020-03-26,2000-01-01
3,4,2020-03-08,2020-03-04,2020-03-09,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-24,2000-01-01
4,5,2020-03-06,2020-03-09,2020-03-11,2020-03-10,2020-03-11,Second sample NEGATIVE on 19.03.2020; Discharg...,Recovered,2020-03-19,2000-01-01
...,...,...,...,...,...,...,...,...,...,...
70,71,2000-01-01,2020-04-09,2020-04-10,2000-01-01,2020-04-10,Isolated in the hospital,Active,2000-01-01,2000-01-01
71,72,2000-01-01,2020-04-06,2020-04-10,2020-04-08,2020-04-10,Isolated in the hospital,Active,2000-01-01,2000-01-01
72,73,2000-01-01,2000-01-01,2020-04-11,2000-01-01,2000-01-01,Isolated in the hospital,Active,2000-01-01,2000-01-01
73,74,2000-01-01,2000-01-01,2020-04-11,2000-01-01,2000-01-01,Isolated in the hospital,Active,2000-01-01,2000-01-01


In [127]:
# Turn the placeholder dates back into missing values: any date falling in the year 2000 becomes NaN.
date_columns = [name for name in df.columns if 'Date' in name]
for column in date_columns:
    is_placeholder = df[column].map(lambda stamp: stamp.year == 2000)
    df.loc[is_placeholder, column] = np.nan

In [128]:
# Display the line list with the placeholder dates cleared (missing dates now render as NaT).
df

Unnamed: 0,Sl. No.,Date Of Arrival To India,Date Of Onset Of Symptoms,Date Of Hospitalization,Date Of Sample Collection,Result Declaration Date,Current Status,Status,Release Date,Death Date
0,1,2020-03-01,2020-03-04,2020-03-08,2020-03-05,2020-03-08,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-26,NaT
1,2,NaT,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Second sample NEGATIVE on 19.03.2020 ; Discharged,Recovered,2020-03-23,NaT
2,3,NaT,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020 ; D...,Recovered,2020-03-26,NaT
3,4,2020-03-08,2020-03-04,2020-03-09,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-24,NaT
4,5,2020-03-06,2020-03-09,2020-03-11,2020-03-10,2020-03-11,Second sample NEGATIVE on 19.03.2020; Discharg...,Recovered,2020-03-19,NaT
...,...,...,...,...,...,...,...,...,...,...
70,71,NaT,2020-04-09,2020-04-10,NaT,2020-04-10,Isolated in the hospital,Active,NaT,NaT
71,72,NaT,2020-04-06,2020-04-10,2020-04-08,2020-04-10,Isolated in the hospital,Active,NaT,NaT
72,73,NaT,NaT,2020-04-11,NaT,NaT,Isolated in the hospital,Active,NaT,NaT
73,74,NaT,NaT,2020-04-11,NaT,NaT,Isolated in the hospital,Active,NaT,NaT


In [129]:
# Number of cases that have no hospitalisation date.
df['Date Of Hospitalization'].isna().sum()

0

In [130]:
def _mentions(text, keyword):
    """Return True when `text` is a string containing `keyword`, ignoring case."""
    return isinstance(text, str) and keyword in text.lower()


def _elapsed_days(later, earlier):
    """Whole days from `earlier` to `later`, floored at 0; NaN where either date is missing.

    Uses `.dt.days` rather than `.astype('timedelta64[D]')`, which pandas >= 2.0 rejects
    (only s/ms/us/ns resolutions are supported there).
    """
    gap = pd.to_timedelta(later - earlier)
    return gap.dt.days.astype(float).clip(lower=0)


# Create ICU and Ventilator variables from the free-text 'Current Status' column.
# A patient on a ventilator is also counted as being in the ICU.
# NOTE(review): 'icu' is matched as a plain substring, so words such as 'difficulty'
# would also be flagged - confirm the status vocabulary before relying on 'On ICU'.
df['On Ventilator'] = df['Current Status'].map(lambda status: _mentions(status, 'ventilator'))
df['On ICU'] = df['Current Status'].map(lambda status: _mentions(status, 'icu')) | df['On Ventilator']

# Create Exposed, Infectious, Hospitalisation Time variables (in days).
# Negative gaps (e.g. symptom onset recorded before arrival) are clamped to 0.
df['Exposed Time'] = _elapsed_days(df['Date Of Onset Of Symptoms'], df['Date Of Arrival To India'])
df['Infectious Time'] = _elapsed_days(df['Date Of Hospitalization'], df['Date Of Onset Of Symptoms'])
df['Hospitalisation Time'] = _elapsed_days(df['Release Date'], df['Result Declaration Date'])

In [131]:
# Average number of days between symptom onset and hospitalisation (missing values are skipped).
df['Infectious Time'].mean()

3.7241379310344827

In [132]:
# Fill in missing dates using the typical delays observed in the rows that do have them.

# Mean onset -> hospitalisation delay, rounded to whole days.
infectious_time = round(np.mean(df['Infectious Time']))

# Mean hospitalisation -> result-declaration delay (negative gaps clamped to 0), rounded to whole days.
# `.dt.days` is used because `.astype('timedelta64[D]')` is rejected by pandas >= 2.0.
result_delay_days = pd.to_timedelta(
    df['Result Declaration Date'] - df['Date Of Hospitalization']
).dt.days.clip(lower=0)
time_delay_int_result_declaration = round(result_delay_days.mean())

onset_to_hospital = pd.Timedelta(days=infectious_time)
hospital_to_result = pd.Timedelta(days=time_delay_int_result_declaration)

# Hospitalisation date: prefer onset + typical delay; otherwise reuse the previous row's
# (already filled) date. The previous version looked up row i-1 by label, which raised
# KeyError when the very first row lacked both dates; such leading rows now borrow from
# the next known row instead.
hospitalised = df['Date Of Hospitalization'].fillna(df['Date Of Onset Of Symptoms'] + onset_to_hospital)
hospitalised = hospitalised.ffill().bfill()
df['Date Of Hospitalization'] = hospitalised

# Remaining gaps are derived from the (now complete) hospitalisation date.
df['Date Of Onset Of Symptoms'] = df['Date Of Onset Of Symptoms'].fillna(hospitalised - onset_to_hospital)
df['Date Of Sample Collection'] = df['Date Of Sample Collection'].fillna(hospitalised)
df['Result Declaration Date'] = df['Result Declaration Date'].fillna(hospitalised + hospital_to_result)

In [133]:
# Display the line list after filling the missing onset, hospitalisation, sample and result dates.
df

Unnamed: 0,Sl. No.,Date Of Arrival To India,Date Of Onset Of Symptoms,Date Of Hospitalization,Date Of Sample Collection,Result Declaration Date,Current Status,Status,Release Date,Death Date,On Ventilator,On ICU,Exposed Time,Infectious Time,Hospitalisation Time
0,1,2020-03-01,2020-03-04,2020-03-08,2020-03-05,2020-03-08,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-26,NaT,False,False,3.0,4.0,18.0
1,2,NaT,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Second sample NEGATIVE on 19.03.2020 ; Discharged,Recovered,2020-03-23,NaT,False,False,,0.0,14.0
2,3,NaT,2020-03-08,2020-03-08,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020 ; D...,Recovered,2020-03-26,NaT,False,False,,0.0,17.0
3,4,2020-03-08,2020-03-04,2020-03-09,2020-03-08,2020-03-09,Asymtomatic; Sample Negative on 21.03.2020; Di...,Recovered,2020-03-24,NaT,False,False,0.0,5.0,15.0
4,5,2020-03-06,2020-03-09,2020-03-11,2020-03-10,2020-03-11,Second sample NEGATIVE on 19.03.2020; Discharg...,Recovered,2020-03-19,NaT,False,False,3.0,2.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,71,NaT,2020-04-09,2020-04-10,2020-04-10,2020-04-10,Isolated in the hospital,Active,NaT,NaT,False,False,,1.0,
71,72,NaT,2020-04-06,2020-04-10,2020-04-08,2020-04-10,Isolated in the hospital,Active,NaT,NaT,False,False,,4.0,
72,73,NaT,2020-04-07,2020-04-11,2020-04-11,2020-04-12,Isolated in the hospital,Active,NaT,NaT,False,False,,,
73,74,NaT,2020-04-07,2020-04-11,2020-04-11,2020-04-12,Isolated in the hospital,Active,NaT,NaT,False,False,,,


In [134]:
# Create processed dataframe from bbmp data: one row per calendar day, one running count per column.
date_columns = [x for x in df.columns if 'Date' in x]

# The calendar spans the earliest to the latest date found in any date column.
# Series.min()/max() skip NaT, so a column with no dates at all cannot break the range.
all_dates = pd.concat([pd.to_datetime(df[column]) for column in date_columns], ignore_index=True)
start_date = all_dates.min()
end_date = all_dates.max()
daterange = pd.date_range(start=start_date, end=end_date)

count_columns = ['Active Infections (Unknown)', 'Hospitalised', 'On ICU', 'On Ventilator', 'Fatalities',
                 'Total Infected', 'Total Infected (Unknown)', 'Recovered']
# Initialise with integer zeros (a frame created empty and then zero-filled stays object dtype).
df_agg = pd.DataFrame(0, index=daterange, columns=count_columns)

one_day = datetime.timedelta(days=1)

for _, row in df.iterrows():
    onset = row['Date Of Onset Of Symptoms']
    admitted = row['Date Of Hospitalization']

    # Symptomatic from onset up to and including the admission day.
    df_agg.loc[onset:admitted, 'Active Infections (Unknown)'] += 1

    # Death is tested first: 'Release Date' is the renamed status date, so a deceased patient
    # who also carries a status date must not be counted as recovered.
    if not pd.isna(row['Death Date']):
        df_agg.loc[row['Death Date']:, 'Fatalities'] += 1
        last_day_in_hospital = row['Death Date'] - one_day
    elif not pd.isna(row['Release Date']):
        df_agg.loc[row['Release Date']:, 'Recovered'] += 1
        last_day_in_hospital = row['Release Date'] - one_day
    else:
        # Still admitted: leave the slice open-ended so it runs to the last day.
        last_day_in_hospital = None

    # Days spent in hospital, ending the day before release/death.
    stay = slice(admitted, last_day_in_hospital)
    df_agg.loc[stay, 'Hospitalised'] += 1
    if row['On ICU']:
        df_agg.loc[stay, 'On ICU'] += 1
    if row['On Ventilator']:
        df_agg.loc[stay, 'On Ventilator'] += 1

    # Cumulative counts: confirmed from admission onwards, unknown from symptom onset onwards.
    df_agg.loc[admitted:, 'Total Infected'] += 1
    df_agg.loc[onset:, 'Total Infected (Unknown)'] += 1

# Move the calendar index into a regular 'Date' column and persist the result.
df_agg = df_agg.rename_axis('Date').reset_index()

df_agg.to_csv('../../data/data/bbmp-processed-13-04.csv')

In [135]:
# Display the daily aggregate table that was just written to CSV.
df_agg

Unnamed: 0,Date,Active Infections (Unknown),Hospitalised,On ICU,On Ventilator,Fatalities,Total Infected,Total Infected (Unknown),Recovered
0,2020-03-01,0,0,0,0,0,0,0,0
1,2020-03-02,0,0,0,0,0,0,0,0
2,2020-03-03,0,0,0,0,0,0,0,0
3,2020-03-04,4,0,0,0,0,0,4,0
4,2020-03-05,4,0,0,0,0,0,4,0
5,2020-03-06,4,0,0,0,0,0,4,0
6,2020-03-07,5,0,0,0,0,0,5,0
7,2020-03-08,7,3,0,0,0,3,7,0
8,2020-03-09,5,4,0,0,0,4,8,0
9,2020-03-10,6,4,0,0,0,4,10,0
