In [None]:
import pandas as pd
import numpy as np
import xarray as xr

# 20210504_munge_ll_hosp_data

The purpose of this notebook is to reformat the (local-only) hospitalization line list data in the file `data/CV19Hospital_ICU_DeID_20210414.csv`. **Important**: before committing this notebook, be sure to clear all cell outputs, since these data must not be pushed to a public cloud.

Right now, data have the following schema:
```csv
ID,Age,Zip Code of Residence,Hospital,Date of Admission,ICU?,Ventilator?,Date of Discharge,Discharge Status,Occupation
9999,99,99999,hosp1,2020-99-99,False,False,2020-99-99,HOSPICE - MEDICAL FACILITY 51,Retired
9999,99,99999,hosp1,2020-99-99,False,False,2020-99-99,Expired 20,Retired
9999,99,99999,hosp2,2020-99-99,False,False,2020-99-99,HOME,Resident
9999,99,99999,hosp2,2020-99-99,False,False,2020-99-99,HOME,Construction and Extraction Occupations
```

And we want to transform into something like:
```csv
date,zip_code,hosp,deceased
```

----

Convert the source spreadsheet to CSV (using `in2csv` from csvkit):
```
in2csv data/CV19Hospital_ICU_DeID_20210414.xlsx > data/CV19Hospital_ICU_DeID_20210414.csv
```

In [None]:
# Load the de-identified hospitalization line list (local-only file).
# NOTE(review): parse_dates=True only asks pandas to try parsing the index;
# the date columns themselves are converted explicitly in a later cell.
raw_df = pd.read_csv(
    '../data/CV19Hospital_ICU_DeID_20210414.csv',
    parse_dates=True,
)

Is it plausible to pull death data out of these? And fit to `Ih2D`?

In [None]:
# List every distinct discharge outcome recorded in the line list.
raw_df.loc[:, 'Discharge Status'].unique()

Save this for later; there are way too many outcomes documented here that could be categorized as deaths.

In [None]:
# Preview the first rows of the raw line list (clear this output before committing).
raw_df.head(n=5)

In [None]:
# Columns needed downstream, mapped from their raw headers to snake_case names.
column_names = {
    'ID': 'id',
    'Zip Code of Residence': 'zip_code',
    'Date of Admission': 'admission_date',
    'Date of Discharge': 'discharge_date',
}
# Records whose discharge date is the 'No Discharge Date' placeholder are
# excluded entirely, so neither their admission nor a discharge is counted.
has_discharge = raw_df['Date of Discharge'] != 'No Discharge Date'
df = raw_df.loc[has_discharge, list(column_names)].rename(columns=column_names)

Convert to datetimes:

In [None]:
# Parse both date columns from "%Y-%m-%d" strings into datetimes.
for date_col in ('discharge_date', 'admission_date'):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

In [None]:
# Preview the cleaned frame (clear this output before committing).
df.head(n=5)

Next, use groupby objects to catalogue admission and discharge events, indexed by zip code and date:

In [None]:
def _daily_counts(frame, date_col):
    """Count the records in ``frame`` per (zip_code, date), using ``date_col`` as the date."""
    return (
        frame
        .rename(columns={date_col: 'date'})
        .groupby(['zip_code', 'date'])
        ['id']
        .count()
    )


admissions = _daily_counts(df, 'admission_date')
discharges = _daily_counts(df, 'discharge_date')
# Net change per zip code and day: admissions minus discharges. A (zip, date)
# pair present on only one side is treated as zero on the other side.
delta = admissions.sub(discharges, fill_value=0.)
delta.head()

Calculate net change for each zip code. In theory, this should be zero:

In [None]:
# Net admissions-minus-discharges over the whole period, one value per zip code.
delta_per_zip = delta.groupby(level='zip_code').sum()

# Zip codes left with a positive net balance (more admissions than discharges).
positive_net = delta_per_zip[delta_per_zip.gt(0)]
with_unacc_dc = positive_net.count()
unacc_dc = positive_net.sum().astype(int)

# NOTE(review): this sums only the negative daily net changes, so discharges
# offset by same-day admissions in the same zip code are not included — confirm
# this is the intended "accounted for" figure.
acc_dc = -(delta[delta.lt(0)].sum().astype(int))

print(f"There are {with_unacc_dc} zip codes with unaccounted discharges out of {len(delta_per_zip.index)} zip codes.")
print(f"{unacc_dc} unaccounted for discharges and {acc_dc} accounted for discharges.")

What should we do about unaccounted for discharges? Possible solutions:
- Set discharge date as max date
    - Inflates Ih
- Ignore admission and discharge for `id` with unaccounted discharge
    - Deflates Ih
    - Best path forward according to KP
- Calculate the mean hospital duration and use that to extrapolate a discharge date

Will probably need to discuss with KP. ~~Until then, take option #1, which is probably the easiest at this point, since it just means "ignore any possible missed discharge events."~~ Going with the second option, which is already implemented above: rows with no discharge date are dropped when `df` is built.

In [None]:
# Re-inspect the per-(zip_code, date) net change before converting to counts.
delta.head(n=5)

## Convert Events to Incidental Heads in Beds

`delta` contains hospitalization events, but we want the count of the `Ih` compartment at every timepoint. ~~We'll do this by naively looping over every timepoint, and optimize later if this becomes a pain point.~~ `cumsum` is perfect for this.

We reindex over a continuous date axis, padding missing days with zeroes.

In [None]:
# Continuous daily axis spanning the first to the last recorded event date.
event_dates = delta.index.get_level_values(1)
cont_dt = pd.date_range(start=event_dates.min(), end=event_dates.max(), freq='D')
cont_dt

In [None]:
# Full (zip_code x date) grid so that every zip code has a row for every day.
cont_midx = pd.MultiIndex.from_product(
    [delta.index.levels[0], cont_dt],
    names=['zip_code', 'date'])
ih_ser = (
    delta
    # days with no recorded event get a net change of zero
    .reindex(cont_midx, fill_value=0.)
    # remove 'Unknown' zip code; errors='ignore' keeps the cell runnable when
    # the data contain no such label, and nothing is mutated in place
    .drop('Unknown', level='zip_code', errors='ignore')
)
# convert to df
ih = ih_ser.to_frame(name='delta')

In [None]:
# Running total of net admissions, accumulated separately for each zip code.
# A plain cumsum over the stacked (zip_code, date) index would carry any
# non-zero net balance of one zip code into every zip code that follows it.
ih['cumsum'] = ih['delta'].groupby(level='zip_code').cumsum()
# Sanity check: at least one (zip_code, date) entry has a non-zero count.
assert ih['cumsum'].any()

## Write to CSV

In [15]:
# Write the 'cumsum' column (indexed by zip_code and date) to CSV, then report
# the size of the written file on disk.
# NOTE(review): this file is derived from the local-only line list; presumably
# it should not be pushed to a public cloud either — confirm.
ih['cumsum'].to_csv('../data/CV19Hospital_ICU_DeID_20210414_cumsum.csv')
!du -hs '../data/CV19Hospital_ICU_DeID_20210414_cumsum.csv'

5.0M	../data/CV19Hospital_ICU_DeID_20210414_cumsum.csv
