# Calculating state-by-state implied infection numbers

This notebook tries to compute what the full infection numbers in the past and present likely were/are.

It does so in the past by blending variables for "median days from infection to death" and "infection fatality rate" (IFR) with smoothed death rates. In other words, days_to_death days before date D, there must have been roughly (deaths_on_date_D / IFR) infections to end up with a given number of deaths on date D.

When looking at the most recent days_to_death days, it looks up what percentage of infections were confirmed on the last day calculated in the past, and applies that percentage to the new infections found since then. It normalizes a bit by the amount of testing done on each day to try to handle significant ramping up/down of testing during that time, but the recent projections are admittedly sketchy.

The principal source of death data is files from the NY Times, supplemented by more accurate date-of-death datasets from 20+ states. The source of testing data is The COVID Tracking Project, maintained by The Atlantic.

In [None]:
%matplotlib inline
import numpy
import pandas
import matplotlib
import matplotlib.pyplot as plt
# import scipy.stats

import common2
from common2 import load_data, DOD_META, get_infections_df  # , smooth_series, calc_mid_weekly_average
# from common import calc_state_stats, get_infections_df, find_smooth_dates, load_in_data

In [None]:
# Widen pandas' display limits so large frames are shown without truncation.
for option_name, option_value in {
    'display.max_rows': 300,
    'display.max_columns': 1000,
    'display.width': 1000,
}.items():
    pandas.set_option(option_name, option_value)

In [None]:
# Earliest date handed to load_data.
EARLIEST_DATE = pandas.Period('2020-03-01', freq='D')

# Set a latest date when the most recent days have garbage (like on or after holidays).
# Only one value can be live at a time; earlier choices are kept here for reference
# instead of as assignments that get silently overwritten:
#   pandas.Period('2020-12-23', freq='D')
#   None  (presumably "no cutoff" -- confirm against common2.load_data)
LATEST_DATE = pandas.Period('2021-04-10', freq='D')

# Set a number of recent days to not display in the graphs for lack of future days to smooth them
NON_DISPLAY_DAYS = 0

In [None]:
# Load everything between the configured dates; load_data hands back three
# values that the rest of the notebook relies on.
latest_date, meta, all_stats = load_data(EARLIEST_DATE, LATEST_DATE)

# Charts stop NON_DISPLAY_DAYS before the latest loaded date.
latest_displayed = latest_date - NON_DISPLAY_DAYS
print(f"Latest date = {latest_date!s}; latest displayed = {latest_displayed!s}")

In [None]:
# Same computation as in the loading cell; re-running just this cell refreshes
# the display cutoff after NON_DISPLAY_DAYS is changed, without reloading the data.
latest_displayed = latest_date - NON_DISPLAY_DAYS

## Calc Estimated Infections

In [None]:
# Median number of days between being exposed and developing illness
INCUBATION = 4

# Number of days one is infectious. The original note says this isn't actually
# used yet; it is nevertheless passed to get_infections_df along with the others.
INFECTIOUS = 10

# Median days in between exposure and death
DEATH_LAG = 19

In [None]:
# Here is where you set variables for IFR assumptions
#
# Note that this IFR represents a country-wide average on any given day, but the IFRs
# are actually adjusted up/down based on median age and nursing home residents per capita
#
# Each scenario is (initial IFR, final IFR, breaks), where breaks lists the dates
# in between by which the IFR linearly scales to various targets.
IFR_SCENARIOS = {
    # My worst case scenario (in my 95% CI)
    'worst': (
        0.013, 0.006,
        [['2020-04-30', 0.0095], ['2020-07-31', 0.007], ['2020-09-15', 0.006]],
    ),
    # My optimistic scenario
    'optimistic': (
        0.01, 0.0025,
        [['2020-04-30', 0.0075], ['2020-07-31', 0.0045], ['2020-09-15', 0.0025]],
    ),
    # A highly optimistic scenario that matches the recent CDC data
    'cdc': (
        0.009, 0.002,
        [['2020-04-30', 0.007], ['2020-07-31', 0.003], ['2020-09-15', 0.002]],
    ),
    # My expected scenario
    'expected': (
        0.01, 0.0035,
        [['2020-04-30', 0.0085], ['2020-07-31', 0.005], ['2020-09-15', 0.004], ['2021-01-15', 0.004]],
    ),
}

# Choose the scenario by name. Previously all four sets were assigned in turn and
# whichever came last in the cell silently won.
IFR_SCENARIO = 'expected'
IFR_S, IFR_E, IFR_BREAKS = IFR_SCENARIOS[IFR_SCENARIO]

In [None]:
# Percent-formatted copies of the starting and ending IFRs, for chart titles.
IFR_S_S = f'{100*IFR_S:.1f}%'
IFR_E_S = f'{100*IFR_E:.2f}%'

# Build the per-state infection estimates from the loaded stats and the
# assumptions configured in the cells above.
infected_states = get_infections_df(
    all_stats, meta, DEATH_LAG, IFR_S, IFR_E, IFR_BREAKS, INCUBATION, INFECTIOUS,
)

# Date drawn as a vertical line in the charts: DEATH_LAG - 1 days before the latest date.
EST_LINE = str(latest_date - (DEATH_LAG - 1))

total_infected = int(infected_states.NewInf.sum())
print(f"Total infected by {latest_date}: {total_infected:,}")
print(f"Vertical line marking recent estimations set at {EST_LINE}")

In [None]:
# Peek at the last few rows of the estimated-infections frame as a sanity check.
infected_states.tail()

## Now for the charts

In [None]:
# Drop every row dated after the last day we want shown, then restore the
# (ST, Date) index so the rest of the notebook sees the same layout.
fazzy = infected_states.reset_index()
fazzy = fazzy.loc[fazzy.Date <= latest_displayed].set_index(['ST', 'Date'])
infected_states = fazzy

In [None]:
# Sum CurrHosp and Daily over all states for each date, then chart the two
# series on separate y-axes for a fixed date window.
hosp_and_deaths = infected_states.reset_index()[['Date', 'CurrHosp', 'Daily']]
fazzle = hosp_and_deaths.groupby('Date').sum()
fazzle.columns = ['Hospitalizations', 'Deaths']
fazzle = fazzle.loc['2020-08-01':'2021-04-10', :]
fam = fazzle.plot(
    title="Current Hospitalizations vs. Daily Deaths",
    secondary_y='Deaths', figsize=(16,5), ylim=0)

# Anchor every axis in the figure (left and right) at zero.
axes = fam.get_figure().get_axes()
for axis in axes:
    axis.set_ylim(0)

In [None]:
# Show the rows of fazzle from 2020-09-05 onward.
fazzle.loc['2020-09-05':, :]

In [None]:
# Peak hospitalizations, peak daily deaths, and total deaths over the rows currently in fazzle.
fazzle.Hospitalizations.max(), fazzle.Deaths.max(), fazzle.Deaths.sum()

In [None]:
# Sum NewInf and Daily over all states for each date and chart them on
# separate y-axes.
inf_and_deaths = infected_states.reset_index()[['Date', 'NewInf', 'Daily']]
fizzle = inf_and_deaths.groupby('Date').sum().copy()
fizzle.columns = ['New Infections', 'Deaths']
# fizzle = fizzle.loc['2020-08-01':, :]
fam = fizzle.plot(
    title="New Infections vs. Daily Deaths",
    secondary_y='Deaths', figsize=(16,5), ylim=0)

# Anchor every axis in the figure (left and right) at zero.
axes = fam.get_figure().get_axes()
for axis in axes:
    axis.set_ylim(0)

In [None]:
# Show the rows of fizzle from 2020-09-05 onward.
fizzle.loc['2020-09-05':, :]

In [None]:
# Same hospitalizations-vs-deaths chart over the full date range, with the
# hospitalization values up to 2020-07-31 blanked out.
hosp_cols = ['Date', 'CurrHosp', 'Daily']
fizzle = infected_states.reset_index()[hosp_cols].groupby('Date').sum().copy()
fizzle = fizzle.rename(columns={'CurrHosp': 'Hospitalizations', 'Daily': 'Deaths'})
fizzle.loc[:'2020-07-31', 'Hospitalizations'] = numpy.nan
fam = fizzle.plot(
    title="Hospitalizations vs. Daily Deaths",
    secondary_y='Deaths', figsize=(16,5), ylim=0)

# Anchor every axis in the figure (left and right) at zero.
axes = fam.get_figure().get_axes()
for axis in axes:
    axis.set_ylim(0)

In [None]:
# This is where I noodle around to investigate particular states of interest.
#
# Only one st_names assignment can be live; the other picks are kept as comments
# so they can be swapped back in, rather than as assignments that are silently
# overwritten further down the cell.
# st_names = ['SD', 'ND', 'IA', 'TN']
# st_names = ['AZ', 'NM', 'PA', 'TX', 'VT',]
# st_names = ['AZ', 'PA', 'WV', 'NM', 'MS', 'KS', 'TN', 'SD', 'NV', 'AL',
#             'AR', 'RI', 'IL', 'IN', 'SC', 'MI', 'MA', 'CA', 'NJ', 'TX', ]
# st_names = ['CA', 'TX', 'PA', 'NY', 'FL', 'AZ', 'IL', 'GA', ]
# st_names = ['KS', 'MS', 'AL', 'AZ', 'PA', ]
# st_names = ['TX', 'PA', 'GA', 'MA', 'FL', 'NC', 'TN', ]
# st_names = ['AK', 'WV', 'ND', 'NM', 'SD']
# st_names = ['CA', 'NY', 'IL', 'LA', ]
# st_names = ['DC', 'DC',]
# st_names = ['CA', 'DC', 'NM', 'MA', 'VA', ]
# This next line lists all 51 (DC included):
# st_names = list(infected_states.index.get_level_values(0).unique())
# ...optionally followed by a slice of that full list (presumably the intent of
# the old `st_names[40:]` line -- confirm):
# st_names = st_names[40:]
st_names = ['TX', 'MI']

# Always ask for at least two rows -- presumably so that `axes` stays an
# indexable array even when a single state is selected.
num_plots = max(len(st_names), 2)
fig, axes = plt.subplots(num_plots, figsize=(15, 5*num_plots))
for i, st in enumerate(st_names):
    # One state's hospitalizations and deaths per million since 2020-10-01.
    data = infected_states.loc[st, :].reset_index()[['Date', 'CurrHosp', 'DPerM']].copy()
    data = data[data.Date >= '2020-10-01']
    data.columns = ['Date', 'Hospitalizations', 'Deaths/M']
    fam = data.groupby('Date').sum().plot(
        ax=axes[i], title=st, ylim=0, secondary_y='Deaths/M',
    )
    fam.axvline(EST_LINE, color="red", linestyle="--")

# Take the axes from the figure created above rather than from the loop variable
# `fam`, which is unset (or stale from another cell) when st_names is empty.
axes = fig.get_axes()
for axis in axes:
    axis.set_ylim(0)
    axis.set_xlabel(None)
    axes[i].set_xlabel(None)

In [None]:
# Print the total of NewInf and that total divided by 327,000,000
# (presumably the US population -- confirm).
df = infected_states.reset_index()
# df = df[df.Date < DT]
total_new_inf = df.NewInf.sum()
print(f"{total_new_inf}, {total_new_inf / 327_000_000}")

In [None]:
# I usually will set this back about 10 days because I don't trust the estimated infections too much
DT = '2021-03-31'
term = 'NIPerM'
divisor = 10000 # 10000 to convert NIPerM to total percentage ever infected
ni = infected_states.reset_index()[['ST', 'Date', term]]
ni = ni[ni.Date < DT]
# Select the column before aggregating. Summing the whole frame also tries to
# sum the Date column; pandas >= 2.0 no longer drops such columns silently and
# raises for datetime-like dtypes instead.
ni = ni.groupby('ST')[term].sum().sort_values(ascending=False) / divisor
# for v in ni.sort_index().values:
#     print(v/100)
ni

In [None]:
# Total of the Daily column across all states and dates (the column the charts label "Deaths").
infected_states.Daily.sum()

In [None]:
# Deliberately stop a top-to-bottom run of the notebook at this point.
raise ValueError()

In [None]:
# list(infected_states.index.get_level_values(0).unique())

In [None]:
# One chart per entry in DOD_META: hospitalizations against deaths per million
# since 2020-10-01, with a red dashed line ignore_days before the latest date.
num_plots = max(len(DOD_META), 2)
fig, axes = plt.subplots(num_plots, figsize=(15, 5*num_plots))
for i, entry in enumerate(DOD_META):
    # Each entry has four fields; only the first and third are used here.
    st, __, ignore_days, __ = entry
    state_rows = infected_states.loc[st, :].reset_index()
    data = state_rows[['Date', 'CurrHosp', 'DPerM']].copy()
    data = data.loc[data.Date >= '2020-10-01']
    data.columns = ['Date', 'Hospitalizations', 'Deaths/M']
    fam = data.groupby('Date').sum().plot(
        ax=axes[i], title=st, ylim=0, secondary_y='Deaths/M',
    )
    est_line = str(latest_date - ignore_days)
    fam.axvline(est_line, color="red", linestyle="--")

# Anchor every axis at zero and remove the x-axis labels.
axes = fam.get_figure().get_axes()
for axis in axes:
    axis.set_ylim(0)
    axis.set_xlabel(None)

In [None]:
# Candidate state lists; only one can be live, so the others are kept as
# comments instead of assignments that are immediately overwritten.
# st_names = ['AR', 'CO', 'DC', 'DE', 'HI', 'IA', 'ID',
#             'KS', 'KY', 'MD', 'ME', 'MN', 'MT', 'NE', 'NH', 'NM',
#             'OK', 'OR', 'UT', 'VT', 'WA', 'WI', 'WV', 'WY']
# st_names = ['HI', 'MT', ]
st_names = ['DE', 'KS', ]

# Always ask for at least two rows -- presumably so that `axes` stays an
# indexable array even when a single state is selected.
num_plots = max(len(st_names), 2)
fig, axes = plt.subplots(num_plots, figsize=(15, 5*num_plots))
for i, st in enumerate(st_names):
    # One state's Hospital5 series and deaths per million since 2020-11-01.
    data = infected_states.loc[st, :].reset_index()[['Date', 'Hospital5', 'DPerM']].copy()
    data = data[data.Date >= '2020-11-01']
    data.columns = ['Date', 'Hospitalizations', 'Deaths/M']
    fam = data.groupby('Date').sum().plot(
        ax=axes[i], title=st, ylim=0, secondary_y='Deaths/M',
    )
    fam.axvline(EST_LINE, color="red", linestyle="--")

# Take the axes from the figure created above rather than from the loop variable
# `fam`, which is unset (or stale from another cell) when st_names is empty.
axes = fig.get_axes()
for axis in axes:
    axis.set_ylim(0)

In [None]:
# Print the total of NewInf divided by 327,000,000
# (presumably the US population -- confirm).
df = infected_states.reset_index()
# df = df[df.Date < DT]
infected_share = df.NewInf.sum() / 327_000_000
print(f"{infected_share}")

In [None]:
# I usually will set this back about 10 days because I don't trust the estimated infections too much
DT = '2021-02-20'
term = 'NIPerM'
divisor = 10000 # 10000 to convert NIPerM to total percentage ever infected
ni = infected_states.reset_index()[['ST', 'Date', term]]
ni = ni[ni.Date < DT]
# Select the column before aggregating. Summing the whole frame also tries to
# sum the Date column; pandas >= 2.0 no longer drops such columns silently and
# raises for datetime-like dtypes instead.
ni = ni.groupby('ST')[term].sum().sort_values(ascending=False) / divisor
# for v in ni.sort_index().values:
#     print(v/100)
ni

In [None]:
# Stopping the processing of this notebook: raising here keeps a top-to-bottom
# run from continuing into the cells under the Detritus heading.
raise ValueError()

## Detritus

In [None]:
# List the available column names, index levels included.
infected_states.reset_index().columns

In [None]:
# Rank the states by DPerM on a single day (2021-02-23), highest first.
foo = infected_states.reset_index()
foo = foo.loc[foo.Date == '2021-02-23'].set_index('ST')
foo[['DPerM']].sort_values('DPerM', ascending=False)

In [None]:
# Per-state maximum of Deaths and Pop, and their ratio, top 30 by ratio.
# Only those two columns are aggregated; taking the max of every column first
# and discarding the rest did needless work on unrelated columns.
# NOTE(review): the DPerM name suggests Pop is in millions -- confirm in common2.
foo = infected_states.reset_index().groupby('ST')[['Deaths', 'Pop']].max()
foo['DPerM'] = foo.Deaths / foo.Pop
foo.sort_values('DPerM', ascending=False).head(30)

In [None]:
#pandas.read_csv('https://covidtracking.com/api/v1/states/daily.csv', low_memory=False)

In [None]:
# Spot-check Virginia: the last 60 rows on or after 2020-12-03 for a handful of columns.
va_columns = ['RawInc', 'Daily', 'Deaths7', 'DPerM', 'Confirms7', 'NIPerM']
df = infected_states.loc['VA', :][va_columns]
df.loc['2020-12-03':, :].tail(60)

In [None]:
# Plot DTests7 for every state, one chart per state.
# NOTE(review): `states` is not defined anywhere in the visible notebook --
# presumably a collection of per-state frames from an earlier version; confirm
# before running this cell.
df = pandas.concat(states)[['DTests7']].reset_index()
st_names = list(df.ST.unique())
fig, axes = plt.subplots(len(st_names), figsize=(10, 4*len(st_names)))
for i, state in enumerate(st_names):
    try:
        df[df.ST == state].set_index('Date').DTests7.plot(ax=axes[i], title=state)
    except Exception as err:
        # Still best effort -- one bad state must not abort the rest -- but say
        # which state was skipped and why, and let KeyboardInterrupt through.
        print(f"Skipping {state}: {err!r}")


In [None]:
# Compare DPerM for WY and MA from 2020-07-01 onward on a single chart.
foo = {}
for st in ['WY', 'MA']:
    # The earlier unsliced assignment to `data` was overwritten immediately,
    # so only the date-sliced selection is kept.
    data = infected_states.loc[st, :].loc['2020-07-01':, :]
    # foo[st] = data.NIPerM
    foo[st] = data.DPerM
foo = pandas.DataFrame(foo)
fam = foo.plot(figsize=(15,5), legend=True, ylim=0)

In [None]:
# One row per distinct (ST, Nursing, Pop, Median) combination, plus the ratio
# of Nursing to Pop, ordered by Median descending.
# NOTE(review): nyt_stats is not defined in the visible notebook -- confirm where it comes from.
nursing_cols = ['ST', 'Nursing', 'Pop', 'Median']
spaz = nyt_stats[nursing_cols].drop_duplicates().assign(
    NPerM=lambda frame: frame.Nursing / frame.Pop,
)
spaz.sort_values('Median', ascending=False)

In [None]:
# Total of NewInf over all rows dated on or before 2020-12-01.
fizz = infected_states.reset_index()
fizz = fizz.loc[fizz['Date'] <= '2020-12-01']
fizz['NewInf'].sum()

In [None]:
# Last 60 values of the Daily column for New Mexico.
foo = infected_states.loc['NM', :]
foo['Daily'].iloc[-60:]

In [None]:
#infected_states.columns

In [None]:
# Chart MA's DPerM against a national figure rebuilt from the per-state data.
foo = infected_states[['Deaths7', 'DPerM', 'Pop']].reset_index().copy()
ma = foo[foo.ST.isin(['MA'])].copy()
# Sum only the two inputs the national ratio needs. Summing the whole frame also
# aggregated the string ST column and a DPerM total that was overwritten below.
us = foo.groupby('Date')[['Deaths7', 'Pop']].sum().reset_index()
us['ST'] = 'US'
# National rate recomputed from the summed Deaths7 and Pop.
us['DPerM'] = us.Deaths7 / us.Pop
both = pandas.concat([ma, us]).sort_values(['Date', 'ST'])
# (A stray `both.tail()` sat here; mid-cell it displayed nothing, so it was removed.)
fam = pandas.pivot_table(both, values='DPerM', index=['Date'],
                         columns='ST').plot(title="US vs. MA Deaths/Million", figsize=(15,5))

In [None]:
# state = states[34]
# st, start = state.index[0]
# spans = []
# start_amt = IFR_S
# for end, end_amt in IFR_BREAKS:
#     end = pandas.Period(end, 'D')
#     idx = pandas.period_range(start=start, end=end, freq='D')
#     spans.append(pandas.Series(numpy.linspace(start_amt, end_amt, len(idx)), index=idx).iloc[0:-1])
#     start, start_amt = end, end_amt

# st, end = state.index[-1]
# idx = pandas.period_range(start=start, end=end, freq='D')
# spans.append(pandas.Series(numpy.linspace(start_amt, IFR_E, len(idx)), index=idx))
# span = pandas.concat(spans)
# span = pandas.Series(span.values, index=state.index)
# span
# # ifr = pandas.Series(numpy.linspace(IFR_S, IFR_E, len(state)), index=state.index)
# # ifr[0], ifr[-1]

In [None]:
# fam = infected_states.reset_index()[['Date', 'NewInf']].groupby('Date').sum().plot(
#     title=f"Infection Estimations, 19 median days to death, "
#           f"IFR improving {IFR_S_S} - {IFR_E_S}",
#     figsize=(13,5), legend=None, ylim=0
# )
# __ = fam.axvline(EST_LINE, color="red", linestyle="--")

In [None]:
# fam = infected_states.reset_index()[['Date', 'Deaths7']].groupby('Date').sum().plot(
#     title="Deaths", figsize=(13,5),
#     legend=None, ylim=0, secondary_y='Deaths7'
# )

In [None]:
# Per-date mean and standard deviation of DPerM across states, plus their
# ratio; show the 20 dates with the smallest ratio.
# The aggregations are named by string: when numpy.mean / numpy.std are passed
# to agg, pandas substitutes its own groupby mean/std anyway, and pandas >= 2.1
# warns that this substitution will stop. The strings pin the current results.
fizz = infected_states.reset_index().groupby('Date').agg({'DPerM': ['mean', 'std']}).dropna()
fizz.columns = ['Mean', 'StdDev']
fizz['Ratio'] = fizz.StdDev / fizz.Mean
fizz.sort_values('Ratio').head(20)

In [None]:
def _std_over_mean(values):
    # Ratio of the standard deviation to the mean for one date's DPerM values.
    return numpy.std(values) / numpy.mean(values)


# Per-date ratio of DPerM's standard deviation to its mean across states.
fizz = infected_states.reset_index().groupby('Date').agg({'DPerM': _std_over_mean}).dropna()
fizz