# Calculating state-by-state implied infection numbers

This notebook tries to compute what the full infection numbers in the past and present likely were/are.

It does so in the past by blending variables for "median days from infection to death" and "infection fatality rate" (IFR) with smoothed death rates. In other words, days_to_death days before date D, there must have been roughly (deaths_on_date_D / IFR) infections to end up with a given number of deaths on date D.

It does so in the present by looking at what percentage of infections were confirmed on the last day calculated in the past, and applying that percentage to the new infections found since then. That doesn't quite take into account any significant ramping of testing during that time, but it should be close enough.

The principal source of death data is files from the NY Times, supplemented by a more accurate DateOfDeath.csv from Massachusetts. The source of testing data is The COVID Tracking Project, maintained by The Atlantic.

NOTE: Prior to running this notebook, you should retrieve the latest DateOfDeath.csv file by:

1. going to https://www.mass.gov/info-details/covid-19-response-reporting,
2. downloading the raw data zip from the line saying "Raw data used to create the dashboard is available here:"
3. copying the DateofDeath.csv in that file to the same directory as the notebook

Yeah, that could be automated. Just haven't done it yet...

In [None]:
%matplotlib inline
import numpy
import pandas
import matplotlib
import matplotlib.pyplot as plt

from common import load_data, smooth_series, calc_mid_weekly_average

In [None]:
# Earliest date that there is sufficient data for all states, including MA
EARLIEST_DATE = pandas.Period('2020-03-10', freq='D')
# Optional fixed end date for the analysis window
LATEST_DATE = pandas.Period('2020-08-11', freq='D')
# The assignment below overrides the fixed date above, so None is what is
# actually passed to load_data (presumably meaning "use the newest data
# available" -- confirm against common.load_data).  Remove or comment out
# this line to pin the end date instead.
LATEST_DATE = None


In [None]:
# Unpack the four values returned by load_data for the requested date window
(latest_date, meta, nyt_stats, ct_stats) = load_data(EARLIEST_DATE, LATEST_DATE)
print(f"Latest date = {latest_date!s}")

In [None]:
# Display the last few rows of ct_stats as a quick sanity check
ct_stats.tail()

### Put the two datasets together

In [None]:
# Index both datasets by (state, date) so they can be joined row for row
ct1 = ct_stats.set_index(['ST', 'Date'])[['Pos', 'Tests']].sort_index()
nyt1 = nyt_stats.set_index(['ST', 'Date'])[['Deaths']].sort_index()
# Left join: keep every row of ct1 and attach Deaths where nyt1 has a match
stats = ct1.join(nyt1, how='left')
stats.head()

### Calculate new stats, state by state

In [None]:
def calc_state_stats(state, state_stats, meta):
    """Build the per-date statistics frame for a single state.

    Args:
        state: State code; written to the ``ST`` column and used to look up
            the population in ``meta``.
        state_stats: Rows for this state with a ``Date`` column plus the
            ``Pos``, ``Deaths`` and ``Tests`` columns read below.
        meta: Frame indexed by state code with a ``Pop`` column.

    Returns:
        A new DataFrame indexed by ``(ST, Date)`` holding the per-date sums,
        ``Pop``, and the derived ``Confirms``/``Confirms7``,
        ``Daily``/``Deaths7`` and ``DTests``/``DTests7`` columns.
    """
    # Collapse to one row per date, in date order
    daily = state_stats.groupby('Date').sum().sort_index().copy()

    daily['ST'] = state
    daily['Pop'] = meta.loc[state].Pop

    # Smooth series that might not be reported daily in some states
    for column in ('Pos', 'Deaths'):
        daily[column] = smooth_series(daily[column])

    # calc_mid_weekly_average returns a pair per input series; the names
    # suggest (daily value, 7-day average) -- confirm against common.py
    derived_columns = (
        ('Pos', 'Confirms', 'Confirms7'),
        ('Deaths', 'Daily', 'Deaths7'),
        ('Tests', 'DTests', 'DTests7'),
    )
    for source, first_name, second_name in derived_columns:
        daily[first_name], daily[second_name] = calc_mid_weekly_average(daily[source])

    return daily.reset_index().set_index(['ST', 'Date']).copy()

In [None]:
# Population lookup keyed by state code (only the Pop column is kept)
meta_tmp = meta.set_index('ST')[['Pop']]

In [None]:
# Build one stats frame per state; groupby('ST') yields (state, rows) pairs
states = [calc_state_stats(state, df, meta_tmp)
          for state, df in stats.reset_index().groupby('ST')]
# Spot-check the most recent rows of one state; which state index -17 lands
# on depends on the set of states present in the data
states[-17].tail()

In [None]:
def get_infections_df(states, death_lag, ifr_high, ifr_low, incubation, infectious,
                      max_confirmed_ratio=0.7):
    """Estimate new, cumulative and active infections for each state.

    For every row except the last ``death_lag`` ones, new infections are
    back-calculated as the ``Deaths7`` value ``death_lag`` rows later divided
    by that row's IFR, with ``Confirms7 / max_confirmed_ratio`` as a floor.
    The last ``death_lag`` rows have no deaths to look ahead to, so they are
    extrapolated from ``Confirms7`` using the infections-per-confirmed ratio
    of the last back-calculated row.

    Args:
        states: Iterable of per-state DataFrames providing ``Deaths7``,
            ``Confirms7`` and ``Pop`` columns.  Each frame is copied; the
            inputs are not modified.
        death_lag: Number of rows between infection and death.  With ``0``
            every row is back-calculated and nothing is extrapolated.
        ifr_high: IFR applied to the first row of each state.
        ifr_low: IFR applied to the last row of each state; rows in between
            get linearly interpolated values.
        incubation: Number of rows the rolling infection sum is shifted
            forward to produce ``ActInf``.
        infectious: Window length of the rolling sums behind ``ActInf`` and
            ``ActKnown``.
        max_confirmed_ratio: Largest share of infections assumed to have
            been confirmed; sets the floor on estimated infections.

    Returns:
        One DataFrame concatenating all state frames, with the added columns
        ``DPerM``, ``NewInf``, ``TotInf``, ``ActInf``, ``ActKnown``,
        ``ActUnk``, ``AIPer1000``, ``AUPer1000`` and ``PctFound``.

    Raises:
        IndexError: If ``death_lag`` is positive and a state frame has
            ``death_lag`` or fewer rows.
        ValueError: From ``pandas.concat`` if ``states`` is empty.
    """
    new_states = []
    for state in states:
        state = state.copy()

        # Calculate the IFR to apply for each day
        ifr = pandas.Series(numpy.linspace(ifr_high, ifr_low, len(state)), index=state.index)
        # Calculate the infections in the past (only the deaths column needs shifting)
        infections = state.Deaths7.shift(-death_lag) / ifr

        # Calculate the min infections based on max_confirmed_ratio
        min_infections = state.Confirms7 / max_confirmed_ratio
        infections = infections.combine(min_infections, max, 0)

        # A slice of iloc[-0:] would select the whole series and overwrite
        # every back-calculated value, so only extrapolate for a real lag.
        if death_lag > 0:
            # Find out the ratio of infections to confirmed cases on the last
            # date in the past; the +1 keeps the denominator away from zero
            last_date = infections.index[-(death_lag + 1)]
            last_ratio = infections.loc[last_date] / (state.loc[last_date, 'Confirms7'] + 1)

            # Apply that ratio to the dates since that date
            infections.iloc[-death_lag:] = state.Confirms7.iloc[-death_lag:] * last_ratio

        # NOTE(review): the "per million" / "per 1000" column names only hold
        # if Pop is expressed in millions -- confirm against the meta data.
        state['DPerM'] = state.Deaths7 / state.Pop
        state['NewInf'] = infections
        state['TotInf'] = infections.cumsum()
        state['ActInf'] = infections.rolling(infectious).sum().shift(incubation)
        state['ActKnown'] = state.Confirms7.rolling(infectious).sum()
        state['ActUnk'] = state.ActInf - state.ActKnown
        state['AIPer1000'] = state.ActInf / state.Pop / 1000.
        state['AUPer1000'] = state.ActUnk / state.Pop / 1000.
        state['PctFound'] = state.Confirms7 / (state.NewInf + 1)
        new_states.append(state)

    return pandas.concat(new_states)

In [None]:
# Run the model: 19 rows of infection-to-death lag, IFR sliding from 1.1% to
# 0.4%, a 4-row incubation shift and a 10-row infectious window
infected_states = get_infections_df(
    states, death_lag=19, ifr_high=0.011, ifr_low=0.004, incubation=4, infectious=10)
print(infected_states['NewInf'].sum())

# Sum the estimated new infections over all states per date, then plot them
fam = infected_states.reset_index()[['Date', 'NewInf']].groupby('Date').sum()
fam = fam.plot(
    title="Infection Estimations, 19 median days to death, IFR improving 1.1% - 0.4%",
    figsize=(13, 5),
    legend=None,
    ylim=0,
)

In [None]:
# Sum Deaths7 over all states for each date and plot the total
fam = infected_states.reset_index()[['Date', 'Deaths7']].groupby('Date').sum()
fam = fam.plot(title="Deaths", figsize=(13, 5), legend=None, ylim=0)

In [None]:
# Per-date totals of estimated infections and Deaths7 across all states
foozle = (
    infected_states.reset_index()[['Date', 'NewInf', 'Deaths7']]
    .groupby('Date')
    .sum()
    .rename(columns={'NewInf': 'Infections', 'Deaths7': 'Deaths'})
)
# Deaths go on a secondary y axis so both curves stay readable
ax = foozle.plot(
    title="Daily Infections vs. Deaths, 19 median days to death, IFR improving 1.1% - 0.4%",
    secondary_y='Deaths',
    figsize=(13, 5),
    ylim=0,
)

In [None]:
# Keep only the columns needed for the per-state tables, then re-key the
# frame by (Date, ST) so a single date can be selected with .loc
foo = infected_states[['Pop', 'Confirms7', 'Deaths7', 'NewInf', 'AIPer1000', 'AUPer1000', 'PctFound']]
foo = foo.reset_index().set_index(['Date', 'ST']).sort_index()

In [None]:
# Snapshot of the latest date, ordered by AUPer1000 (largest first), with
# friendlier column names and the state code as the index
faz = (
    foo.loc[latest_date, :]
    .sort_values('AUPer1000', ascending=False)
    .reset_index()[['ST', 'Pop', 'Confirms7', 'Deaths7', 'AUPer1000', 'PctFound']]
    .rename(columns={'Confirms7': 'Cases', 'Deaths7': 'Deaths', 'AUPer1000': 'ActUnk1000'})
    .set_index('ST')
)
faz.head(15)

In [None]:
# Display the whole faz table
faz

In [None]:
# Rows whose PctFound is at most 0.3, lowest first
faz.loc[faz['PctFound'] <= 0.3].sort_values('PctFound')

In [None]:
# Divide each series by Pop, then take every state's maximum over all dates
fam = (
    infected_states[['Pop', 'Confirms7', 'Deaths7', 'NewInf']]
    .assign(
        Infections=lambda frame: frame['NewInf'] / frame['Pop'],
        Confirms=lambda frame: frame['Confirms7'] / frame['Pop'],
        Deaths=lambda frame: frame['Deaths7'] / frame['Pop'],
    )
    .reset_index()[['ST', 'Infections', 'Confirms', 'Deaths']]
    .groupby('ST')
    .max()
)
fam.sort_values('Deaths', ascending=False)

In [None]:
# Raising here stops a "run all cells" pass before the chart cells below
# (presumably a deliberate stop -- confirm before removing)
raise ValueError()

## Now for the charts...

In [None]:
# Choose which states to chart.  Each assignment overrides the one before it,
# so only the last list takes effect; reorder the lines to switch selection.
st_names = ['AL', 'AZ', 'FL', 'GA', 'LA', 'MS', 'NV', 'SC', 'TX']
st_names = list(infected_states.index.get_level_values(0).unique())
st_names = ['NY', 'NJ', 'MA']
st_names = ['GA']
# plt.subplots returns a bare Axes rather than an array for a single plot, so
# always request at least two rows to keep axes[i] indexable
num_plots = max(len(st_names), 2)
fig, axes = plt.subplots(num_plots, figsize=(10, 4*num_plots))
for i, state in enumerate(st_names):
    try:
        infected_states.loc[state, :].DPerM.plot(ax=axes[i], title=state)
    except Exception as exc:
        # Best effort: report the state that failed and keep plotting the
        # rest, without swallowing KeyboardInterrupt like a bare except
        print(f"Could not plot {state}: {exc!r}")


In [None]:
# List the distinct values of the first index level (the state codes)
infected_states.index.get_level_values(0).unique()

In [None]:
# Chart DTests7 for every state, one subplot per state
df = pandas.concat(states)[['DTests7']].reset_index()
st_names = list(df.ST.unique())
fig, axes = plt.subplots(len(st_names), figsize=(10, 4*len(st_names)))
# plt.subplots returns a bare Axes for a single plot; wrap it so axes[i]
# works for one state as well as for many
axes = numpy.atleast_1d(axes)
for i, state in enumerate(st_names):
    try:
        df[df.ST == state].set_index('Date').DTests7.plot(ax=axes[i], title=state)
    except Exception as exc:
        # Best effort: report the state that failed and keep plotting the
        # rest, without swallowing KeyboardInterrupt like a bare except
        print(f"Could not plot {state}: {exc!r}")


In [None]:
# Plot the DPerM series (Deaths7 / Pop) for Georgia on its own
infected_states.loc['GA', :].DPerM.plot(title='GA')

In [None]:
# Display the last few rows of the combined per-state results
infected_states.tail()