# Calculating state-by-state implied infection numbers

This notebook tries to compute what the full infection numbers in the past and present likely were/are.

It does so in the past by blending variables for "median days from infection to death" and "infection fatality rate" (IFR) with smoothed death rates. In other words, days_to_death days before date D, there must have been roughly (deaths_on_date_D / IFR) infections to end up with a given number of deaths on date D.

When looking at the most recent days_to_death days, it looks up what percentage of infections were confirmed on the last day calculated in the past, and applies that percentage to the new infections found since then. It normalizes a bit by the amount of testing done on each day to try to handle significant ramping up/down of testing during that time, but the recent projections are admittedly sketchy.

As of December 2021, the data is entirely drawn from two CDC data sets:
* Deaths (https://data.cdc.gov/api/views/r8kw-7aab/rows.csv?accessType=DOWNLOAD)
* Hospitalizations (https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD)

#### LOCALIZATION REQUIRED:

Look at the `cdc_common.py` file and modify the `download_path()` function to provide a useful place to store some temporary files created in the process of running. It used to expect downloaded files to be there, but now just stores pickle files for caching data.

In [None]:
%matplotlib inline
import os
import time
from datetime import datetime

import numpy
import pandas
import matplotlib
import matplotlib.pyplot as plt
# import scipy.stats

import cdc_common
from cdc_common import load_data, DOD_META, get_infections_df, download_path, load_hospital_stats

# Change the cell width so wide charts and tables use the whole browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Let pandas show long and wide frames without truncating them.
pandas.set_option('display.max_rows', 1000)
pandas.set_option('display.max_columns', 1000)
pandas.set_option('display.width', 1000)

In [None]:
# First day handed to load_data().
EARLIEST_DATE = pandas.Period('2020-03-01', freq='D')

# Last day handed to load_data(). Set a latest date when the most recent days
# have garbage (like on or after holidays), for example
# pandas.Period('2020-12-23', freq='D') or pandas.Period('2021-12-08', freq='D').
# None is passed through unchanged (presumably "no cut-off" -- confirm in cdc_common).
LATEST_DATE = None

# Set a number of recent days to not display in the graphs for lack of future days to smooth them
NON_DISPLAY_DAYS = 0

In [None]:
# Load everything in the configured date window, then work out the last day to chart.
(latest_date, meta, all_stats,
 cdc_stats, hosp_stats) = load_data(EARLIEST_DATE, LATEST_DATE)
latest_displayed = latest_date - NON_DISPLAY_DAYS
print(f"Latest date = {latest_date!s}; latest displayed = {latest_displayed!s}")

In [None]:
# Same computation as in the load cell: the last date to chart is the latest
# loaded date minus NON_DISPLAY_DAYS. Re-running this cell refreshes the value
# after NON_DISPLAY_DAYS is edited.
latest_displayed = latest_date - NON_DISPLAY_DAYS

## Calc Estimated Infections

In [None]:
# Median days in between exposure and death
DEATH_LAG = 19

# Median number of days between being exposed and developing illness
INCUBATION = 4

# Number of days one is infectious (original note: this isn't actually used yet)
INFECTIOUS = 10

In [None]:
# Here is where you set variables for IFR (infection fatality rate) assumptions.
#
# Note that this IFR represents a country-wide average on any given day, but the IFRs
# are actually adjusted up/down based on median age and nursing home residents per capita.
#
# Each scenario is (initial IFR, final IFR, breaks). `breaks` lists
# [date, IFR] targets in between, by which the IFR linearly scales to that value.
# Scenarios are named so the active one is chosen explicitly, rather than by
# whichever assignment happened to run last.
IFR_SCENARIOS = {
    # Worst case scenario (in my 95% CI)
    'worst_case': (
        0.013, 0.006,
        [['2020-04-30', 0.0095], ['2020-07-31', 0.007], ['2020-09-15', 0.006]],
    ),
    # Optimistic scenario
    'optimistic': (
        0.01, 0.0025,
        [['2020-04-30', 0.0075], ['2020-07-31', 0.0045], ['2020-09-15', 0.0025]],
    ),
    # Highly optimistic scenario that matches the recent CDC data
    'cdc_optimistic': (
        0.009, 0.002,
        [['2020-04-30', 0.007], ['2020-07-31', 0.003], ['2020-09-15', 0.002]],
    ),
    # Successive revisions of the expected scenario, oldest first
    'expected_v1': (
        0.01, 0.005,
        [['2020-04-30', 0.0085], ['2020-07-31', 0.005], ['2020-09-15', 0.004], ['2021-01-15', 0.004],
         ['2021-06-01', 0.0025]],
    ),
    'expected_v2': (
        0.014, 0.004,
        [['2020-04-30', 0.011], ['2020-07-31', 0.009], ['2020-09-15', 0.007], ['2021-01-15', 0.0055],
         ['2021-06-01', 0.0045]],
    ),
    'expected_v3': (
        0.013, 0.003,
        [['2020-04-30', 0.01], ['2020-07-31', 0.0085], ['2020-09-15', 0.007], ['2021-01-15', 0.0055],
         ['2021-06-01', 0.004], ['2021-09-01', 0.003]],
    ),
    'expected_v4': (
        0.013, 0.001,
        [['2020-04-30', 0.01], ['2020-07-31', 0.0085], ['2020-09-15', 0.007], ['2021-01-15', 0.006],
         ['2021-06-01', 0.0045], ['2021-09-01', 0.0035], ['2021-11-15', 0.0025], ['2021-12-10', 0.0015]],
    ),
}

# The scenario used by the rest of the notebook.
ACTIVE_IFR_SCENARIO = 'expected_v4'

IFR_S, IFR_E, _active_breaks = IFR_SCENARIOS[ACTIVE_IFR_SCENARIO]
# Hand out a fresh list of [date, IFR] pairs so the scenario table itself is never mutated.
IFR_BREAKS = [list(pair) for pair in _active_breaks]

In [None]:
# Banner first, then the IFR settings formatted as percentage strings.
print("These are the factors applied against IFR state-by-state, based on median age and nursing home numbers:")
IFR_S_S = f'{100*IFR_S:.1f}%'
IFR_E_S = f'{100*IFR_E:.2f}%'

# Build the per-state infection estimates from the loaded stats and the assumptions above.
infected_states = get_infections_df(
    all_stats, meta, DEATH_LAG, IFR_S, IFR_E, IFR_BREAKS, INCUBATION, INFECTIOUS,
)

# Date string for the vertical "recent estimations" marker: DEATH_LAG - 1 days before the latest date.
EST_LINE = str(latest_date - (DEATH_LAG - 1))
total_new_infections = int(infected_states.NewInf.sum())
print(f"Total infected by {latest_date}: {total_new_infections:,}")
print(f"Vertical line marking recent estimations set at {EST_LINE}")

In [None]:
# Peek at the last few rows of the estimates frame.
infected_states.tail()

## Now for the charts

In [None]:
# Drop every row dated after `latest_displayed`, then restore the (ST, Date) index.
fazzy = (
    infected_states
    .reset_index()
    .loc[lambda frame: frame.Date <= latest_displayed]
    .set_index(['ST', 'Date'])
)
infected_states = fazzy

In [None]:
# National totals per date: new hospitalizations against deaths.
fazzle = infected_states.reset_index()[['Date', 'NewHosp', 'Daily']].groupby('Date').sum()
fazzle.columns = ['New Hospitalizations', 'Deaths']
# Blank the hospitalization series through 2020-07-31 so the line only starts after that date.
fazzle.loc[:'2020-07-31', 'New Hospitalizations'] = numpy.nan
fazzle = fazzle.loc[:str(latest_date), :]

# The whole trimmed frame is charted. To start the chart later, slice here,
# e.g. fazzle.loc['2020-07-31':, :].
fimble = fazzle
fam = fimble.plot(
    title="New Hospitalizations vs. Daily Deaths",
    secondary_y='Deaths', figsize=(25,5), ylim=0)

# Anchor both the primary and the secondary y axis at zero.
axes = fam.get_figure().get_axes()
for ax in axes:
    ax.set_ylim(0)
# To mark a date: fam.axvline(pandas.Period('2021-01-22', freq='D'), color="red", linestyle="--")

In [None]:
# Show the national table, skipping its first 15 rows.
fazzle.iloc[15:, :]

In [None]:
# (max of New Hospitalizations, max of Deaths, sum of Deaths) for the national table.
fazzle['New Hospitalizations'].max(), fazzle.Deaths.max(), fazzle.Deaths.sum()

In [None]:
# List the columns available in the estimates frame.
infected_states.columns

In [None]:
# Total estimated infections, and that total divided by 327,000,000
# (presumably the US population -- confirm).
df = infected_states.reset_index()
# df = df[df.Date < DT]
new_inf_total = df.NewInf.sum()
new_inf_share = new_inf_total / 327_000_000
print(f"{new_inf_total}, {new_inf_share}")

In [None]:
# National totals per date: estimated new infections against deaths, from 2021-11-12 on.
national_inf = infected_states.reset_index()[['Date', 'NewInf', 'Daily']]
fizzle = national_inf.groupby('Date').sum().copy()
fizzle = fizzle.rename(columns={'NewInf': 'New Infections', 'Daily': 'Deaths'})
fizzle = fizzle.loc['2021-11-12':, :]
fam = fizzle.plot(
    title="New Infections vs. Daily Deaths",
    secondary_y='Deaths',
    figsize=(25,5),
    ylim=0,
)

# Anchor every axis of the figure (primary and secondary) at zero.
axes = fam.get_figure().get_axes()
for ax in axes:
    ax.set_ylim(0)

In [None]:
# Show the rows of `fizzle` dated 2020-09-05 or later.
fizzle.loc['2020-09-05':, :]

In [None]:
# National totals per date: current hospitalizations against deaths.
national_hosp = infected_states.reset_index()[['Date', 'CurrHosp', 'Daily']]
fizzle = national_hosp.groupby('Date').sum().copy()
fizzle = fizzle.rename(columns={'CurrHosp': 'Hospitalizations', 'Daily': 'Deaths'})
# Blank the hospitalization series through 2020-07-31.
fizzle.loc[:'2020-07-31', 'Hospitalizations'] = numpy.nan
fam = fizzle.plot(
    title="Current Hospitalizations vs. Daily Deaths",
    secondary_y='Deaths',
    figsize=(25,5),
    ylim=0,
)

# Anchor every axis of the figure (primary and secondary) at zero.
axes = fam.get_figure().get_axes()
for ax in axes:
    ax.set_ylim(0)

In [None]:
# Display the current `fizzle` frame.
fizzle

In [None]:
# This is where I noodle around to investigate particular states of interest

# To chart all 51 (DC included) use:
#   st_names = list(infected_states.index.get_level_values(0).unique())
st_names = ['CA', 'DC', 'MA', 'NM', 'NY', 'VA', ]

# Column to chart on the primary axis and its label.
# Alternative: grouper, group_name = 'CurrHosp', 'Current Hospitalizations'
grouper, group_name = 'NHospPerM', 'New Hospitalizations/M'

# Fixed y ranges so every state's panel uses the same scale.
PRIMARY_YMAX = 150
DEATHS_YMAX = 26

# Always request at least two panels so `axes` is an array that can be indexed/iterated.
num_plots = max(len(st_names), 2)
fig, axes = plt.subplots(num_plots, figsize=(25, 5*num_plots))
for ax, st in zip(axes, st_names):
    # Per-state lag from DOD_META when present, otherwise the global DEATH_LAG.
    est_lag = DOD_META.get(st, (None, DEATH_LAG))[1]
    est_line = str(latest_date - (est_lag - 1))
    data = infected_states.loc[st, :].reset_index()[['Date', grouper, 'DPerM']].copy()
    data = data[data.Date >= '2020-08-01']
    data.columns = ['Date', group_name, 'Deaths/M']
    data = data.groupby('Date').sum()
    # Only has an effect if the start date above is moved to before August 2020.
    data.loc[:'2020-07-31', group_name] = numpy.nan
    fam = data.plot(
        ax=ax, title=st, ylim=0, secondary_y='Deaths/M',
    )
    # Red dashed line at the state's "recent estimations" boundary.
    fam.axvline(est_line, color="red", linestyle="--")
    # pandas attaches the secondary axis to the primary one as `right_ax`;
    # configuring it here avoids relying on the order axes were added to the figure.
    ax.right_ax.set_xlabel(None)
    ax.right_ax.set_ylim(0, DEATHS_YMAX)

# Primary axes, including any unused spare panel.
for ax in axes:
    ax.set_xlabel(None)
    ax.set_ylim(0, PRIMARY_YMAX)

In [None]:
# I usually will set this back about 10 days because I don't trust the estimated infections too much
DT = str(latest_date - 10)
term = 'NIPerM'
divisor = 10000 # 10000 to convert NIPerM to total percentage ever infected
ni = infected_states.reset_index()[['ST', 'Date', term]]
ni = ni[ni.Date < DT]
# Aggregate only `term`. Summing the whole frame would also try to add up the
# Date column, which pandas >= 2.0 no longer drops silently.
ni = ni.groupby('ST')[term].sum().sort_values(ascending=False) / divisor
# for v in ni.sort_index().values:
#     print(v/100)
ni

In [None]:
# Sum of the Daily column over every state and date in the frame.
infected_states.Daily.sum()

In [None]:
# Presumably a manual stop for "Run All": uncomment the raise to halt execution
# here, before the exploratory cells that follow.
# raise ValueError()

## Detritus

In [None]:
# Column names once the index levels are turned back into columns.
infected_states.reset_index().columns

In [None]:
# Choose what to compare; the LAST assignment is the one in effect.
grouper, col, per_m, title, cutoff = 'Voting', 'NewHosp', 'NHospPerM', 'New Hospitalizations', '2020-08-15'
grouper, col, per_m, title, cutoff = 'Region', 'Daily', 'DPerM', 'Daily Deaths', '2020-03-10'

# Per (Date, group) totals of just the two columns needed.
# To restrict the groups first: filter infected_states with .isin([...]) on `grouper`.
fam = infected_states.reset_index().groupby(['Date', grouper])[[col, 'Pop']].sum()
# Rate per unit of Pop (per million if Pop is stored in millions -- TODO confirm).
fam[per_m] = fam[col] / fam.Pop
fam = fam.loc[cutoff:, :]
foo = pandas.pivot_table(fam, values=per_m, index=['Date'],
                         columns=grouper).plot(title=f"{title}/Million by {grouper}",
                                                 figsize=(25,5))

# Whole-period summary per group: total of `col`, mean Pop, and the resulting rate.
fam = fam.reset_index().groupby(grouper).agg({col: 'sum', 'Pop': 'mean'})
fam[per_m] = fam[col] / fam.Pop
# Sort by the selected rate column; a hard-coded 'DPerM' breaks the 'Voting'/'NHospPerM' option.
fam = fam.sort_values([per_m], ascending=False)
# NOTE(review): these display labels are worded for the Daily Deaths option.
fam.columns = ['Total Deaths', 'Population', 'Deaths/Million']
fam

In [None]:
# Massachusetts against all other states combined.
grouper, col, per_m, title, cutoff = 'Area', 'Daily', 'DPerM', 'Daily Deaths', '2020-03-10'
fam = infected_states.reset_index()
# Keep 'MA' as its own label; every other state becomes 'Rest'.
fam['Area'] = fam.ST.where(fam.ST == 'MA', 'Rest')

# Per (Date, Area) totals and rate, from the cutoff date on.
fam = fam.groupby(['Date', grouper]).sum()[[col, 'Pop']]
fam[per_m] = fam[col] / fam.Pop
fam = fam.loc[cutoff:, :]
rate_by_area = pandas.pivot_table(fam, values=per_m, index=['Date'], columns=grouper)
foo = rate_by_area.plot(title=f"{title}/Million by {grouper}", figsize=(25,5))

# Whole-period summary per Area.
fam = fam.reset_index().groupby(grouper).agg({col: 'sum', 'Pop': 'mean'})
fam[per_m] = fam[col] / fam.Pop
fam

In [None]:
# Sum of DC's Daily column over all dates.
infected_states.loc['DC', :].Daily.sum()

In [None]:
# DC rows for December 2021, then the sum of their Daily column.
dc_rows = infected_states.loc['DC', :]
dc_december = dc_rows.loc['2021-12-01':'2021-12-31', ['Daily', 'NewHosp', 'CurrHosp', 'DPerM']]
dc_december.Daily.sum()

In [None]:
# List the columns of the estimates frame again.
infected_states.columns

In [None]:
# National ActInf total per date, last 20 dates. Only ActInf is aggregated,
# instead of summing every column (including text ones) and discarding the rest.
infected_states.reset_index().groupby(['Date']).ActInf.sum().tail(20)

In [None]:
# Show what download_path() returns for this file name (presumably the local
# path where the file is expected -- see cdc_common.download_path).
download_path('data_download_file_reference_2020.csv')

In [None]:
# df20 = pandas.read_csv(download_path('data_download_file_reference_2020.csv'))
# df20 = df20[df20.location_name == 'United States of America'][['date', 'inf_mean', 'inf_cuml_mean', 'cases_mean', 'cumulative_deaths', 'daily_deaths', 'cumulative_cases', ]].copy()
# df20

In [None]:
# df21 = pandas.read_csv(download_path('data_download_file_reference_2021.csv'))
# df21 = df21[df21.location_name == 'United States of America'][['date', 'inf_mean', 'inf_cuml_mean', 'cases_mean', 'cumulative_deaths', 'daily_deaths', 'cumulative_cases', ]].copy()
# df21

In [None]:
# df20.inf_mean.sum() + df21.inf_mean.sum()

In [None]:
# Scratch arithmetic: 3.409786 hundred-million written out as an integer.
# NOTE(review): the source of 3.409786 is not recorded here; presumably the
# result of the commented-out inf_mean sum above -- confirm.
int(3.409786 * 100_000_000)

In [None]:
# For every state, correlate its daily rate with the rate of all other states combined.
# Use cutoff '2020-03-10' instead to cover the whole period.
grouper, col, per_m, title, cutoff = 'Area', 'Daily', 'DPerM', 'Daily Deaths', '2021-02-01'

# Loop-invariant work done once: flatten the index and keep rows from the cutoff date on.
recent = infected_states.reset_index()
recent = recent[recent.Date >= cutoff]

for st in list(infected_states.index.get_level_values(0).unique()):
    # Label rows as this state or 'Rest'.
    fam = recent.assign(**{grouper: recent.ST.where(recent.ST == st, 'Rest')})
    # Per (Area, Date) totals of just the two columns needed.
    fam = fam.groupby([grouper, 'Date'])[[col, 'Pop']].sum()
    fam[per_m] = fam[col] / fam.Pop
    # The date filter above already applied the cutoff; no further slicing is
    # needed (the first index level here is Area, not Date).
    print(st, fam.loc[st, per_m].corr(fam.loc['Rest', per_m]))

In [None]:
# One state (set `st`) against all other states combined.
st = 'FL'
grouper, col, per_m, title, cutoff = 'Area', 'Daily', 'DPerM', 'Daily Deaths', '2020-03-15'
fam = infected_states.reset_index()
# Keep the chosen state's code as its label; every other state becomes 'Rest'.
fam['Area'] = fam.ST.where(fam.ST == st, 'Rest')

# Per (Date, Area) totals and rate, from the cutoff date on.
fam = fam.groupby(['Date', grouper]).sum()[[col, 'Pop']]
fam[per_m] = fam[col] / fam.Pop
fam = fam.loc[cutoff:, :]
rate_by_area = pandas.pivot_table(fam, values=per_m, index=['Date'], columns=grouper)
foo = rate_by_area.plot(title=f"{title}/Million by {grouper}", figsize=(25,5))

# Whole-period summary per Area.
fam = fam.reset_index().groupby(grouper).agg({col: 'sum', 'Pop': 'mean'})
fam[per_m] = fam[col] / fam.Pop
fam

In [None]:
# National Daily and NewConf totals per date. The columns are selected on the
# groupby so the index-derived ST column is not aggregated along with them
# (pandas >= 2.0 would otherwise add a column of concatenated state codes).
infected_states.reset_index().groupby('Date')[['Daily', 'NewConf']].sum()

In [None]:
# --- Cases: national 7-day centred average built from the CDC cases file ---
cases = pandas.read_csv(download_path('cdc_cases.csv'), parse_dates=['submission_date'])
cases = cases[['submission_date', 'state', 'new_case', 'pnew_case']].copy()
cases.columns = ['Date', 'State', 'RawNew', 'RawProb']
# Leave out these jurisdiction codes.
cases = cases[~cases.State.isin(['AS', 'FSM', 'GU', 'MP', 'PR', 'PW', 'RMI', 'VI',])]
# NOTE(review): a row with NaN in either column yields NaN here and so adds
# nothing to the daily total below -- confirm that is intended.
cases = cases.assign(Raw=cases.RawNew + cases.RawProb)
cases = cases.groupby('Date')[['Raw']].sum().sort_index()
# Use a daily PeriodIndex, as the Period conversion in the original cell did, so
# the frame lines up with `deaths` in the concat below.
cases.index = cases.index.to_period('D')
cases['Cases'] = cases.Raw.rolling(window=7, center=True, min_periods=1).mean()
cases = cases[['Cases']].copy()

# --- Deaths: national total per date ---
# Select 'Daily' on the groupby so the result has exactly one column. Summing the
# whole frame also aggregates ST on pandas >= 2.0, and the single-name relabel then fails.
deaths = infected_states.reset_index().groupby('Date')[['Daily']].sum()
deaths.columns = ['Deaths']

# Pair each day's cases with the deaths recorded 14 days later.
foo = deaths.shift(-14)
both = pandas.concat([cases, foo], axis=1)
both['CFR'] = 100 * both.Deaths / both.Cases
both = both[['Deaths', 'CFR']]
# Chart from 2021-04-15 on and drop the last 13 rows.
both = both.loc['2021-04-15':, :].iloc[:-13, :].copy()
fam = both.plot(
    title="US Deaths vs. CFR",
    secondary_y='CFR', figsize=(25,5), ylim=0)

In [None]:
# Pairwise correlation between the columns of `both`.
both.corr()

In [None]:
# Location of the spreadsheet. Set the COVID_WORKBOOK environment variable to
# point somewhere else; the default is the original hard-coded location.
MA_WORKBOOK = os.environ.get('COVID_WORKBOOK', '/mnt/c/Users/Patri/OneDrive/COVID/coronavirus.xlsx')

# Read the 'MA' sheet (header on the third row) and keep six series under short names.
ma_df = pandas.read_excel(MA_WORKBOOK, sheet_name='MA', header=2).set_index('Date')
ma_df = ma_df[['Ctriag7', 'New5', 'Cur5', 'ICU5', 'Int5', 'Dtriag5']].sort_index()
ma_df.columns = ['Cases', 'NewHosp', 'CurHosp', 'ICU', 'Intub', 'Deaths']

# CFR (%): deaths 14 rows later divided by the current row's cases, from 2021-02-15 on.
foo = ma_df.loc['2021-02-15':, :][['Cases', 'Deaths']].copy()
foo['CFR'] = (foo.Deaths.shift(-14) * 100) / foo.Cases
# Drop the last 16 rows.
foo = foo.iloc[:-16, :][['Deaths', 'CFR']]
fam = foo.plot(
    title="Massachusetts Deaths vs. CFR",
    secondary_y='CFR', figsize=(25,5), ylim=0)

# Anchor both y axes at zero.
axes = fam.get_figure().get_axes()
for ax in axes:
    ax.set_ylim(0)

In [None]:
# Read the OWID CSV from the download directory, parsing `date` as datetimes,
# and preview the first three rows.
owid_orig = pandas.read_csv(download_path('owid-covid-data.csv'), parse_dates=['date'])
owid_orig.head(3)

In [None]:
# USA rows only: date plus new_deaths.
usa_deaths = owid_orig.loc[owid_orig.iso_code == 'USA', ['date', 'new_deaths']].copy()
# Index by daily Periods and keep 2020-02-29 onward.
usa_deaths['date'] = [pandas.Period(d, freq='D') for d in usa_deaths['date']]
owid = usa_deaths.set_index('date').loc['2020-02-29':, :].copy()
owid.columns = ['Reported']
# Centred 14-day triangular-window mean; rows without a full window become NaN and are dropped.
owid['Reported'] = (
    owid['Reported']
    .rolling(window=14, center=True, win_type='triang', min_periods=14)
    .mean()
)
owid = owid.dropna()
owid

In [None]:
# National Daily total per date, relabelled 'DoD'. 'Daily' is selected on the
# groupby so the result has exactly one column. Summing the whole frame also
# aggregates ST on pandas >= 2.0, and the single-name relabel below then fails.
dod = infected_states.reset_index().groupby('Date')[['Daily']].sum()
# dod.index = [pandas.Period(d, freq='D') for d in dod.index]
dod.columns = ['DoD']
dod

In [None]:
# Correlate 2021 DoD against the Reported series shifted earlier by 0..19 days.
year_2021 = slice('2021-01-01', '2021-12-31')
d = dod.loc[year_2021, 'DoD']
for days in range(20):
    r = owid.shift(-days).loc[year_2021, 'Reported']
    print(f"{days}: {d.corr(r)}")

In [None]:
# Put the two frames side by side, aligned on their indexes.
pandas.concat([owid, dod], axis=1)

In [None]:
# Sum of the last 13 rows of DoD.
dod.DoD.tail(13).sum()

In [None]:
# Express every series as a percentage of its own maximum, from 2020-04-06 on.
one_percent_of_peak = ma_df.max() / 100
ma_centiles = ma_df.div(one_percent_of_peak).loc['2020-04-06':, :]
ma_centiles

In [None]:
# Chart the percent-of-peak series from 2020-09-01 on.
ma_centiles.loc['2020-09-01':, :].plot(figsize=(25,5), ylim=0)

In [None]:
# Chart the percent-of-peak series from 2022-03-01 on.
ma_centiles.loc['2022-03-01':, :].plot(figsize=(25,5), ylim=0)

In [None]:
# Chart the raw Massachusetts series, with Cases and CurHosp on the secondary y axis.
ma_df.plot(figsize=(25,5), ylim=0, secondary_y=['Cases', 'CurHosp'])