In [None]:
%matplotlib inline
import datetime

import numpy
import pandas
import matplotlib
import matplotlib.pyplot as plt

from common2 import load_data, calc_mid_weekly_average, smooth_series, spread_deaths

In [None]:
# Earliest date that there is sufficient data for all states, including MA
EARLIEST_DATE = pandas.Period('2020-03-10', freq='D')

# Upper bound of the data window, handed to load_data() as its second argument.
# None is the value the notebook actually runs with.
# NOTE(review): presumably None means "no cap, use the newest data" -- confirm
# against common2.load_data.
# To reproduce a pinned run, use e.g. pandas.Period('2020-08-11', freq='D').
LATEST_DATE = None

In [None]:
# Fetch the shared inputs for the configured window. The first returned value
# is bound to latest_date and reused by later cells as the cutoff date.
latest_date, meta, nyt_stats, ct_stats = load_data(
    EARLIEST_DATE,
    LATEST_DATE,
)
print(f"Latest date = {str(latest_date)}")

In [None]:
# Keep only the 'Deaths' column, indexed and sorted by (ST, Date).
nyt1 = (
    nyt_stats
    .set_index(['ST', 'Date'])
    .sort_index()
    .loc[:, ['Deaths']]
)
# State metadata keyed by the state abbreviation column.
meta_tmp = meta.set_index('ST')
nyt1.head()

In [None]:
def create_smooth_dates(earliest_date, latest_date, configs=None, smooth_dates=None):
    """Populate the per-group sets of dates whose figures are to be blanked out.

    For every group in ``configs`` the excluded dates are the union of
    - every occurrence of the listed weekly anchors (``cfg['DaysOfWeek']``,
      e.g. ``'W-SUN'``) between ``earliest_date`` and ``latest_date``, and
    - the explicit ``cfg['Holidays']`` dates (optional key).

    Dates that fall outside the [earliest_date, latest_date] window are
    dropped, and any run of excluded dates that reaches the very end of the
    window is released again so the series always ends on a kept day.

    Args:
        earliest_date: first day of the window; converted with ``str()``.
        latest_date: last day of the window; converted with ``str()``.
        configs: mapping ``name -> {'DaysOfWeek': ..., 'Holidays': ...}``.
            Defaults to the module-level ``SMOOTH_CONFIGS``.
        smooth_dates: dict that receives ``name -> PeriodIndex``.
            Defaults to the module-level ``SMOOTH_DATES``.

    Returns:
        The dict that was filled in (``smooth_dates``).
    """
    # Resolve the defaults at call time: the module-level tables are defined
    # in a later cell than this function.
    if configs is None:
        configs = SMOOTH_CONFIGS
    if smooth_dates is None:
        smooth_dates = SMOOTH_DATES

    sd = str(earliest_date)
    ed = str(latest_date)
    all_dates = pandas.date_range(start=sd, end=ed, freq='D')

    for k, cfg in configs.items():
        # Start empty
        dates = pandas.DatetimeIndex([], freq='D')

        # Compile date ranges excluding certain days of the week
        for dow in cfg['DaysOfWeek']:
            dates = dates.union(pandas.date_range(start=sd, end=ed, freq=dow))

        # Add the holidays (and some surrounding days sometimes)
        holidays = cfg.get('Holidays', [])
        if len(holidays):
            dates = dates.union(pandas.DatetimeIndex(holidays))

        # Holidays may lie outside the window (e.g. a Labor Day entry while the
        # data stops in August). Left in, they would sit at the tail of `dates`
        # and defeat the trailing-day check below.
        dates = dates[dates.isin(all_dates)]

        # Make sure that there is at least one non-excluded day at the end:
        # count how many of the last days of the window are all excluded ...
        trailing = 0
        while trailing < len(dates) and dates[-1 - trailing] == all_dates[-1 - trailing]:
            trailing += 1
        # ... and take that whole run back out of the exclusion set.
        if trailing:
            print(f"Keeping date(s) {list(dates[-trailing:])}")
            dates = dates[:-trailing]

        smooth_dates[k] = dates.to_period('D')

    return smooth_dates

def find_smooth_dates(st):
    """Return the SMOOTH_DATES entry of the group listing ``st``, else None.

    Groups are scanned in SMOOTH_MAPS order and the first one containing the
    state abbreviation wins.
    """
    group = next(
        (name for name, members in SMOOTH_MAPS.items() if st in members),
        None,
    )
    if group is None:
        return None
    return SMOOTH_DATES[group]

In [None]:
def calc_state_stats(state, state_stats, meta, latest_date):
    """Build the adjusted and smoothed death series for a single state.

    Steps, in order:
      1. Sum ``state_stats`` per 'Date' and keep the untouched numbers as
         'RawDeaths' plus their day-over-day difference 'RawInc'.
      2. Apply the hand-maintained corrections in STATE_DEATH_ADJUSTMENTS via
         ``spread_deaths`` (only those dated on or before ``latest_date``).
      3. Blank out 'Deaths' on the dates returned by ``find_smooth_dates`` and
         forward-fill them from the previous kept day.
      4. Run ``smooth_series`` over 'Deaths', then ``calc_mid_weekly_average``
         to produce the 'Daily' and 'Deaths7' columns.

    Args:
        state: state abbreviation; written to the 'ST' column.
        state_stats: frame with at least 'Date' and 'Deaths' columns.
        meta: not used by this function; kept so existing callers still work.
        latest_date: cutoff compared against each adjustment date.

    Returns:
        DataFrame indexed by ('ST', 'Date').
    """
    st = state_stats.groupby('Date').sum().sort_index().copy()

    st['ST'] = state
    st['RawDeaths'] = st.Deaths
    st['RawInc'] = st.Deaths - st.Deaths.shift()

    st = st.reset_index().copy()

    # Correct for various jumps in the data
    # Each entry is (state, death count, date); passed verbatim to spread_deaths.
    STATE_DEATH_ADJUSTMENTS = (
        ('AL', -20, '2020-04-23'),
        ('AZ', 45, '2020-05-08'),
        ('AR', 143, '2020-09-15'),
        ('CO', 65, '2020-04-24'),
        ('CO', -29, '2020-04-25'),
        ('DE', 67, '2020-06-23'),
        ('DE', 47, '2020-07-24'),
        ('IL', 123, '2020-06-08'),
        ('IN', 11, '2020-07-03'),
        ('LA', 40, '2020-04-14'),
        ('LA', 40, '2020-04-22'),
        ('MD', 68, '2020-04-15'),
        ('MI', 220, '2020-06-05'),
        ('MI', 60, '2020-09-09'),
        ('NJ', 1854, '2020-06-25'),
        ('NJ', 75, '2020-07-08'),
        ('NJ', -54, '2020-07-22'),
        ('NJ', -38, '2020-07-29'),
        ('NJ', -25, '2020-08-05'),
        ('NJ', -10, '2020-08-12'),
        ('NJ', -44, '2020-08-26'),
        ('NY', 608, '2020-06-30'),  # most apparently happened at least three weeks earlier
        ('NY', -113, '2020-08-06'),
        ('NY', -11, '2020-09-09'),
        ('OH', 80, '2020-04-29'),
        ('SC', 25, '2020-04-29'),
        ('SC', 37, '2020-07-16'),
        ('TN', 16, '2020-06-12'),
        ('TX', 636, '2020-07-27'),
        ('WA', -12, '2020-06-17'),
        ('WA', 7, '2020-06-18'),
        ('WA', 30, '2020-07-24'),
        ('WA', -11, '2020-08-05'),
        ('WI', 8, '2020-06-10'),
    )

    for state_, deaths, deaths_date in STATE_DEATH_ADJUSTMENTS:
        if state_ != state:
            continue
        # Skip corrections dated after the end of the loaded data.
        if pandas.Period(deaths_date) <= latest_date:
            # Return value is ignored; NOTE(review): presumably spread_deaths
            # edits `st` in place -- confirm in common2.
            spread_deaths(st, state_, deaths, deaths_date)

    # Blank out and forward fill entries for days with wimpy reporting
    dates = find_smooth_dates(state)
    if dates is not None:
        st = st.set_index('Date')
        indices = st.index.isin(dates)
        st.loc[indices, 'Deaths'] = numpy.nan
        # Series.ffill() replaces fillna(method='ffill'), whose `method`
        # argument is deprecated in pandas 2.1.
        st.Deaths = st.Deaths.ffill()
        st = st.reset_index().copy()

    # Smooth series that might not be reported daily in some states
    st.Deaths = smooth_series(st.Deaths)

    # Prep for 7-day smoothing calculations
    st['Daily'], st['Deaths7'] = calc_mid_weekly_average(st.Deaths)

    # reset_index() also turns the positional index into an 'index' column.
    # Left unchanged because the consumers of this frame are not visible here.
    return st.reset_index().set_index(['ST', 'Date']).copy()

In [None]:
# Holiday-related dates shared by every group except NewYork. Defined once so
# that a correction only has to be made in one place.
_COMMON_HOLIDAYS = (
    '05-23-2020', '05-26-2020', '05-27-2020',  # Memorial Day
    '07-03-2020', '07-04-2020',  # Independence Day
    '09-05-2020', '09-08-2020', '09-09-2020',  # Labor Day
)

# Per smoothing group:
#   DaysOfWeek - weekly anchors (pandas 'W-XXX' frequency strings) to exclude
#   Holidays   - additional individual dates to exclude
# Consumed by create_smooth_dates(); SMOOTH_MAPS assigns states to groups.
SMOOTH_CONFIGS = dict(
    SatSun=dict(
        DaysOfWeek=('W-SAT', 'W-SUN'),
        Holidays=_COMMON_HOLIDAYS,
    ),
    SatSunMon=dict(
        DaysOfWeek=('W-SAT', 'W-SUN', 'W-MON'),
        Holidays=_COMMON_HOLIDAYS,
    ),
    SunMon=dict(
        DaysOfWeek=('W-SUN', 'W-MON'),
        Holidays=_COMMON_HOLIDAYS,
    ),
    SunMonTue=dict(
        DaysOfWeek=('W-SUN', 'W-MON', 'W-TUE'),
        Holidays=_COMMON_HOLIDAYS,
    ),
    # No weekly exclusions; only explicit dates.
    NewYork=dict(
        DaysOfWeek=(),
        Holidays=(
            '04-30-2020', '05-01-2020', '05-02-2020',
            '05-03-2020', '05-04-2020', '05-05-2020',
            '05-23-2020', '05-24-2020', '05-25-2020',  # Memorial Day
        ),
    ),
    # Two extra explicit stretches in April/May ahead of the shared dates.
    Penn=dict(
        DaysOfWeek=('W-SUN', 'W-MON'),
        Holidays=(
            '04-21-2020', '04-22-2020', '04-23-2020',
            '04-24-2020', '04-25-2020', '04-26-2020',
            '04-27-2020', '04-28-2020', '04-29-2020',

            '05-03-2020', '05-04-2020', '05-05-2020',
            '05-06-2020', '05-07-2020',
        ) + _COMMON_HOLIDAYS,
    ),
    # Shared dates plus an extra mid-September stretch (written year-first).
    Virginia=dict(
        DaysOfWeek=('W-SUN', 'W-MON'),
        Holidays=_COMMON_HOLIDAYS + (
            '2020-09-10', '2020-09-11', '2020-09-12',
            '2020-09-13', '2020-09-14',
        ),
    ),
)

# Filled in by create_smooth_dates(): group name -> dates to blank out.
SMOOTH_DATES = {}

# Group name (a key of SMOOTH_CONFIGS) -> state abbreviations using that group.
# States not listed in any group get no date blanking (find_smooth_dates
# returns None for them).
SMOOTH_MAPS = {
    'SatSun': ('GA', 'IA', 'ID', 'KS', 'TN', 'UT'),
    'SatSunMon': ('CA', 'CO', 'DE', 'IL', 'LA', 'NV', 'OH', 'SC'),
    'SunMon': (
        'AR', 'AZ', 'FL', 'HI', 'IN', 'KY', 'MD', 'MI', 'MN', 'MO',
        'MS', 'NC', 'NE', 'NH', 'NJ', 'OK', 'OR', 'SD', 'TX', 'WA', 'WI',
    ),
    'SunMonTue': ('AL',),
    'NewYork': ('NY',),
    'Penn': ('PA',),
    'Virginia': ('VA',),
}

# Fill SMOOTH_DATES for the loaded window before any state is processed.
create_smooth_dates(EARLIEST_DATE, latest_date)

# One processed frame per state abbreviation.
states = dict(
    (abbr, calc_state_stats(abbr, frame, meta_tmp, latest_date))
    for abbr, frame in nyt1.reset_index().groupby('ST')
)
states['AZ'].tail(2)

In [None]:
# AK, AL, AR, AZ, CA, CO, CT, DC, DE, FL, GA, HI, IA, ID, IL, IN, KS,
# KY, LA, MA, MD, ME, MI, MN, MO, MS, MT, NC, ND, NE, NH, NJ, NM, NV,
# NY, OH, OK, OR, PA, RI, SC, SD, TN, TX, UT, VA, VT, WA, WI, WV, WY

# Plot three series for one state; change the abbreviation below to pick
# another one from the list above.
st_names = list(states.keys())
for st in st_names:
    if st == 'WY':
        st_df = states[st]
        # Same chart layout for each series, one figure per column.
        for col in ('RawInc', 'Daily', 'Deaths7'):
            fam = st_df.loc[st, :].reset_index()[['Date', col]].groupby('Date').sum().plot(
                title=f"{st}",
                figsize=(13,5), legend=None, ylim=0
            )
        break

In [None]:
#st_df.Daily.tail(115).head(30)

In [None]:
# states[st][['RawInc', 'Daily', 'Deaths7']].tail(60)

In [None]:
# Deliberate stop: raising here aborts a "Run All" at this point, so the cells
# below only execute when they are run by hand.
# NOTE(review): the next code cell calls get_infections_df, which the import
# cell of this notebook does not bring in -- possibly why this guard exists;
# confirm before removing it.
raise ValueError()

### Calculate new stats, state by state

In [None]:
# Two IFR values and their percentage labels; the plot titles below present
# them as "IFR improving <IFR_S_S> - <IFR_E_S>".
IFR_S = 0.011
IFR_E = 0.0035
IFR_S_S = f'{100*IFR_S:.1f}%'
IFR_E_S = f'{100*IFR_E:.2f}%'

# NOTE(review): get_infections_df is not among the names imported from common2
# at the top of this notebook, so on a fresh kernel this line raises NameError.
# The meaning of the positional arguments (19, 4, 10) is not visible here; the
# plot titles mention "19 median days to death".
infected_states = get_infections_df(states, 19, IFR_S, IFR_E, 4, 10)
print(infected_states['NewInf'].sum())
infected_states.tail(3)

In [None]:
# Sum 'NewInf' across all states for each date and chart the total.
fam = (
    infected_states
    .reset_index()
    .groupby('Date')[['NewInf']]
    .sum()
    .plot(
        title=f"Infection Estimations, 19 median days to death, "
              f"IFR improving {IFR_S_S} - {IFR_E_S}",
        figsize=(13, 5),
        legend=None,
        ylim=0,
    )
)

In [None]:
# Sum 'Deaths7' across all states for each date and chart the total.
fam = (
    infected_states
    .reset_index()
    .groupby('Date')[['Deaths7']]
    .sum()
    .plot(
        title="Deaths",
        figsize=(10, 4),
        legend=None,
        ylim=0,
    )
)

In [None]:
# Per-date totals of both series, relabelled for the legend.
foozle = (
    infected_states
    .reset_index()
    .groupby('Date')[['NewInf', 'Deaths7']]
    .sum()
    .rename(columns={'NewInf': 'Infections', 'Deaths7': 'Deaths'})
)
# 'Deaths' is drawn against a secondary y axis.
ax = foozle.plot(
    title=f"Daily Infections vs. Deaths, 19 median days to death, "
          f"IFR improving {IFR_S_S} - {IFR_E_S}",
    secondary_y='Deaths',
    figsize=(13, 5),
    ylim=0,
)

In [None]:
# Texas DSHS workbook, fetched over the network every time this cell runs.
uri = "https://dshs.texas.gov/coronavirus/TexasCOVID19DailyCountyFatalityCountData.xlsx"
df = pandas.read_excel(
    uri,
    skiprows=[0, 1],  # skip the first two sheet rows
    nrows=256,        # NOTE(review): row count is hard-coded; verify it still matches the sheet
)

In [None]:
# Last row of the sheet without its first column, relabelled with consecutive
# daily periods starting 2020-03-07.
# NOTE(review): presumably the last row is a statewide total -- confirm.
s = df.iloc[-1, 1:]
s.index = pandas.period_range(start='2020-03-07', periods=s.size, freq='D')
daily, mid7 = calc_mid_weekly_average(s)
fig = mid7.plot(title="Texas smoothed death rate", figsize=(13, 5))