# Calculating state-by-state implied infection numbers

This notebook tries to compute what the full infection numbers in the past and present likely were/are.

It does so in the past by blending variables for "median days from infection to death" and "infection fatality rate" (IFR) with smoothed death rates. In other words, days_to_death days before date D, there must have been roughly (deaths_on_date_D / IFR) infections to end up with a given number of deaths on date D.

It does so in the present by looking at what percentage of infections were confirmed on the last day calculated in the past, and applying that percentage to the new infections found since then. That doesn't quite take into account whether there was a significant ramp-up of testing during that time, but it should be close enough.

The principal source of death data is files from the NY Times, supplemented by a more accurate DateOfDeath.csv from Massachusetts. The source of testing data is The COVID Tracking Project, maintained by The Atlantic.

NOTE: Prior to running this notebook, you should retrieve the latest DateOfDeath.csv file by:

1. going to https://www.mass.gov/info-details/covid-19-response-reporting,
2. downloading the raw data zip from the line saying "Raw data used to create the dashboard is available here:"
3. copying the DateofDeath.csv in that file to the same directory as the notebook

Yeah, that could be automated. Just haven't done it yet...

In [None]:
%matplotlib inline
import numpy
import pandas
import matplotlib
import matplotlib.pyplot as plt

from common import load_data, smooth_series, calc_mid_weekly_average
from common import calc_state_stats, get_infections_df, find_smooth_dates

In [None]:
# Earliest date that there is sufficient data for all states, including MA
EARLIEST_DATE = pandas.Period('2020-03-10', freq='D')
# Upper bound handed to load_data(). None is the value the notebook actually
# runs with (a pinned date used to be assigned here and then immediately
# overwritten). To pin a fixed end date for a reproducible re-run, set this to
# e.g. pandas.Period('2020-12-05', freq='D').
# NOTE(review): presumably None makes load_data use the newest data available -- confirm in common.py.
LATEST_DATE = None

In [None]:
# load_data (imported from common) is given the configured date window and
# returns four values, unpacked here as latest_date, meta, nyt_stats and ct_stats.
latest_date, meta, nyt_stats, ct_stats = load_data(EARLIEST_DATE, LATEST_DATE)
# Echo the date so it is obvious which day the rest of the notebook treats as "latest".
print(f"Latest date = {str(latest_date)}")

In [None]:
# Sanity check: show the last two rows of nyt_stats.
nyt_stats.tail(2)

In [None]:
# Sanity check: show the last rows of ct_stats (tail() defaults to five rows).
ct_stats.tail()

In [None]:
# raise ValueError()

### Put the two datasets together

In [None]:
# Re-key both sources on (ST, Date), keep only the columns needed downstream,
# and attach the Deaths column to the Pos/Neg rows.
ct1 = (
    ct_stats
    .set_index(['ST', 'Date'])
    .sort_index()
    .loc[:, ['Pos', 'Neg']]
)
nyt1 = (
    nyt_stats
    .set_index(['ST', 'Date'])
    .sort_index()
    .loc[:, ['Deaths']]
)
# DataFrame.join defaults to a left join, so every (ST, Date) row of ct1 is kept.
both = ct1.join(nyt1)
# State metadata keyed by state code, for per-state lookups.
meta_tmp = meta.set_index('ST')

In [None]:
# One calc_state_stats() result per state code, in groupby('ST') order.
states = list(
    calc_state_stats(abbrev, frame, meta_tmp, latest_date)
    for abbrev, frame in both.reset_index().groupby('ST')
)
# Spot check of a single state's most recent rows.
# NOTE(review): -17 is a positional pick; which state it selects depends on the group order.
states[-17].tail(2)

In [None]:
# Stack the per-state frames into one flat table.
stats = pandas.concat(states).reset_index()
# Spot check: the five most recent WV rows, death columns only.
stats.loc[stats['ST'] == 'WV', ['Date', 'RawDeaths', 'Deaths', 'Deaths7']].tail(5)

### Calculate new stats, state by state

In [None]:
# Model parameters passed to get_infections_df. DEATH_LAG is the
# "median days from infection to death" described in the introduction.
INCUBATION, INFECTIOUS, DEATH_LAG = 4, 10, 19

# Infection fatality rate scenario: start value, end value, and dated
# [date, IFR] waypoints in between.
# (A lower scenario -- 0.011 -> 0.003 via 0.008 / 0.004 / 0.003 on the same
# dates -- used to be assigned first and was always overwritten by this one.)
IFR_S, IFR_E = 0.012, 0.004
IFR_BREAKS = [['2020-04-30', 0.0085], ['2020-07-31', 0.005], ['2020-09-15', 0.004]]

# Percent-formatted copies of the start/end IFR, used in chart titles.
IFR_S_S, IFR_E_S = f'{100*IFR_S:.1f}%', f'{100*IFR_E:.2f}%'

infected_states = get_infections_df(states, meta, DEATH_LAG, IFR_S, IFR_E, IFR_BREAKS, INCUBATION, INFECTIOUS)

# Date (DEATH_LAG - 1) days before latest_date; drawn as the red dashed line on the charts.
# NOTE(review): presumably the boundary between death-derived and extrapolated
# infection numbers -- confirm against get_infections_df.
EST_LINE = str(latest_date - (DEATH_LAG - 1))
print(infected_states.NewInf.sum())
print(EST_LINE)
infected_states.tail(3)

In [None]:
# state = states[34]
# st, start = state.index[0]
# spans = []
# start_amt = IFR_S
# for end, end_amt in IFR_BREAKS:
#     end = pandas.Period(end, 'D')
#     idx = pandas.period_range(start=start, end=end, freq='D')
#     spans.append(pandas.Series(numpy.linspace(start_amt, end_amt, len(idx)), index=idx).iloc[0:-1])
#     start, start_amt = end, end_amt

# st, end = state.index[-1]
# idx = pandas.period_range(start=start, end=end, freq='D')
# spans.append(pandas.Series(numpy.linspace(start_amt, IFR_E, len(idx)), index=idx))
# span = pandas.concat(spans)
# span = pandas.Series(span.values, index=state.index)
# span
# # ifr = pandas.Series(numpy.linspace(IFR_S, IFR_E, len(state)), index=state.index)
# # ifr[0], ifr[-1]

In [None]:
# National daily infection estimate: NewInf summed over all states per date.
# The title takes the lag from DEATH_LAG so it cannot drift from the value
# actually used in the calculation.
fam = infected_states.reset_index()[['Date', 'NewInf']].groupby('Date').sum().plot(
    title=f"Infection Estimations, {DEATH_LAG} median days to death, "
          f"IFR improving {IFR_S_S} - {IFR_E_S}",
    figsize=(13,5), legend=None, ylim=0
)
# Mark EST_LINE with a red dashed vertical line.
__ = fam.axvline(EST_LINE, color="red", linestyle="--")

In [None]:
# Last two dates of the national NewInf total (summed across states).
infected_states.reset_index()[['Date', 'NewInf']].groupby('Date').sum().tail(2)

In [None]:
# National Deaths7 total per date, plotted on the secondary (right-hand) axis.
fam = (
    infected_states.reset_index()
    .groupby('Date')[['Deaths7']]
    .sum()
    .plot(
        title="Deaths",
        figsize=(13,5),
        legend=None,
        ylim=0,
        secondary_y='Deaths7',
    )
)

In [None]:
# National totals per date for both series, renamed for a readable legend.
foozle = infected_states.reset_index()[['Date', 'NewInf', 'Deaths7']].groupby('Date').sum()
foozle.columns = ['Infections', 'Deaths']
# The title takes the lag from DEATH_LAG so it stays in sync with the
# value used in the calculation.
fam = foozle.plot(
    title=f"Daily Infections vs. Deaths, {DEATH_LAG} median days to death, "
          f"IFR improving {IFR_S_S} - {IFR_E_S}",
    secondary_y='Deaths', figsize=(13,5), ylim=0)
# Mark EST_LINE with a red dashed vertical line.
__ = fam.axvline(EST_LINE, color="red", linestyle="--")
# Anchor the figure's second axes at zero as well.
# NOTE(review): presumably axes[1] is the secondary 'Deaths' axis, which ylim=0 above does not cover.
__ = fam.get_figure().get_axes()[1].set_ylim(0)

In [None]:
# Regional totals per date, then rates relative to the summed regional Pop.
foo = (
    infected_states.reset_index()
    .groupby(['Region', 'Date'])[['NewInf', 'Deaths7', 'Pop']]
    .sum()
)
foo['NIPerM'] = foo['NewInf'] / foo['Pop']
foo['DPerM'] = foo['Deaths7'] / foo['Pop']

In [None]:
# One line per Region: dates down the rows, regions across the columns.
fam = (
    foo.reset_index()
    .pivot_table(values='NIPerM', index=['Date'], columns='Region')
    .plot(title="New Daily Infections per Million", figsize=(15,5))
)
# Mark EST_LINE with a red dashed vertical line.
__ = fam.axvline(EST_LINE, color="red", linestyle="--")

In [None]:
# Same regional layout as the infections chart, for the DPerM column.
fam = (
    foo.reset_index()
    .pivot_table(values='DPerM', index=['Date'], columns='Region')
    .plot(title="Daily Deaths per Million", figsize=(15,5))
)

In [None]:
# Re-key by (Date, ST) so that a single day can be sliced out with .loc.
foo = (
    infected_states.reset_index()
    .set_index(['Date', 'ST'])
    .sort_index()
    .loc[:, ['Pop', 'Confirms7', 'Deaths7', 'NewInf', 'AIPer1000', 'AUPer1000', 'PctFound']]
)
# Latest day only, ranked by AUPer1000 (highest first), with friendlier column names.
faz = (
    foo.loc[latest_date, :]
    .sort_values('AUPer1000', ascending=False)
    .reset_index()
    .loc[:, ['ST', 'Pop', 'Confirms7', 'Deaths7', 'AIPer1000', 'AUPer1000', 'PctFound']]
    .rename(columns={'Confirms7': 'Cases', 'Deaths7': 'Deaths', 'AUPer1000': 'ActUnk1000'})
)
faz.head(10)

In [None]:
# (Date, ST) rows up to 2020-11-18 whose AUPer1000 exceeded 36.6.
# NOTE(review): 36.6 and the cut-off date are unexplained magic values --
# presumably a reference reading from one state at the time; confirm, or
# promote them to named constants.
foo[foo.AUPer1000 > 36.6].loc[:'2020-11-18', :]

In [None]:
# The five states with the lowest PctFound on latest_date (head() defaults to five rows).
faz.sort_values('PctFound').head()  # [faz.PctFound <= 0.15]

In [None]:
# Per-row rates relative to Pop, then the maximum each state ever reached.
fam = (
    infected_states[['Pop', 'Confirms7', 'Deaths7', 'NewInf']]
    .assign(
        Infections=lambda d: d['NewInf'] / d['Pop'],
        Confirms=lambda d: d['Confirms7'] / d['Pop'],
        Deaths=lambda d: d['Deaths7'] / d['Pop'],
    )
    .reset_index()
    .loc[:, ['ST', 'Infections', 'Confirms', 'Deaths']]
    .groupby('ST')
    .max()
)
print("Maximum deaths/M/day states ever had")
fam.sort_values(by='Deaths', ascending=False)

In [None]:
# States to chart, one subplot each. Only this selection ever took effect: the
# candidate lists that used to precede it were each overwritten before use.
# To chart every state instead, use:
#     st_names = list(infected_states.index.get_level_values(0).unique())
st_names = ['DC', 'MA', 'NM']

# squeeze=False always returns a 2-D array of axes, so a single state needs no
# dummy second subplot; ravel() flattens it back to the 1-D array indexed below.
num_plots = max(len(st_names), 1)
fig, axes = plt.subplots(num_plots, figsize=(15, 5*num_plots), squeeze=False)
axes = axes.ravel()
for i, st in enumerate(st_names):
    # Rows for this state only, with Date back as a regular column.
    data = infected_states.loc[st, :].reset_index()[['Date', 'NIPerM', 'DPerM']].copy()
    # Optional: restrict the window, e.g. data = data[data.Date >= '2020-10-01']
    data.columns = ['Date', 'Infections/M', 'Deaths/M']
    # Deaths go on the secondary axis.
    fam = data.groupby('Date').sum().plot(
        ax=axes[i], title=st, ylim=0, secondary_y='Deaths/M',
    )
    # Mark EST_LINE with a red dashed vertical line.
    fam.axvline(EST_LINE, color="red", linestyle="--")

In [None]:
# Cumulative NIPerM per state, highest first.
ni = infected_states.reset_index()[['ST', 'Date', 'NIPerM']].copy()
# ni = ni[ni.Date < '2020-11-23'].copy()
# Select NIPerM *before* summing: the frame still carries the Date column
# (kept for the optional filter above), and summing the whole frame relies on
# pandas silently dropping non-summable columns, which newer versions no longer do.
# NOTE(review): if NIPerM is infections per million, dividing the running total
# by 10,000 gives percent of population -- confirm the unit.
ni = ni.groupby('ST')['NIPerM'].sum().sort_values(ascending=False) / 10000
ni.head(15)

In [None]:
# NOTE(review): unconditional raise -- this appears to be a deliberate stop so
# that "Run All" halts before the exploratory cells that follow. Confirm before removing.
raise ValueError()

## Now for the charts...

In [None]:
# List every column of infected_states, with the index levels included via reset_index().
infected_states.reset_index().columns

In [None]:
# One DTests7 chart per state, stacked vertically.
df = pandas.concat(states)[['DTests7']].reset_index()
st_names = list(df.ST.unique())
# squeeze=False always returns a 2-D array of axes, so axes[i] is valid even
# when only one state is present; ravel() flattens it to the 1-D array used below.
fig, axes = plt.subplots(len(st_names), figsize=(10, 4*len(st_names)), squeeze=False)
axes = axes.ravel()
for i, state in enumerate(st_names):
    try:
        df[df.ST == state].set_index('Date').DTests7.plot(ax=axes[i], title=state)
    except Exception as exc:
        # Best effort: a state that cannot be plotted leaves its panel empty,
        # but the failure is reported instead of being silently swallowed.
        print(f"Skipping {state}: could not plot DTests7 ({exc!r})")


In [None]:
# Side-by-side comparison of NIPerM for two states over the full date range.
foo = {}
for st in ['FL', 'MA']:
    # A date-restricted slice used to be computed here and immediately
    # overwritten; to narrow the window, append .loc['2020-07-01':, :].
    data = infected_states.loc[st, :]
    foo[st] = data.NIPerM  # data.DPerM
# One column per state, aligned on the shared index.
foo = pandas.DataFrame(foo)
fam = foo.plot(figsize=(15,5), legend=True, ylim=0)

In [None]:
# One row per distinct (ST, Nursing, Pop, Median) combination, plus Nursing relative to Pop.
spaz = (
    nyt_stats.loc[:, ['ST', 'Nursing', 'Pop', 'Median']]
    .drop_duplicates()
    .assign(NPerM=lambda d: d['Nursing'] / d['Pop'])
)
# Highest Median first.
spaz.sort_values(by='Median', ascending=False)