### Corona trends algorithm development

Data is from [Corona Data Scraper](https://coronadatascraper.com/#home)


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Corona Data Scraper time series; `date` is parsed into datetimes.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The warning residue kept in this cell's saved
# output looks like pandas' mixed-type DtypeWarning, which this option avoids.
rawtimeseries = pd.read_csv(
    'https://coronadatascraper.com/timeseries.csv',
    parse_dates=['date'],
    low_memory=False,
)

display('Timeseries:')
display(rawtimeseries.head())

  interactivity=interactivity, compiler=compiler, result=result)


'Timeseries:'

Unnamed: 0,name,level,city,county,state,country,population,lat,long,url,...,recovered,active,tested,hospitalized,hospitalized_current,discharged,icu,icu_current,growthFactor,date
0,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,,,2020-01-22
1,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,,1.0,2020-01-23
2,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,,1.0,2020-01-24
3,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,,1.0,2020-01-25
4,"Antwerp, Flanders, Belgium",county,,Antwerp,Flanders,Belgium,1847486.0,51.2485,4.7175,https://epistat.wiv-isp.be/,...,,,,,,,,,1.0,2020-01-26


In [None]:
# Keep only the counties whose "state" code is listed in the tuple below.
# ("wa") without a trailing comma is just the string "wa", so `in` performed a
# substring test (it would also accept "w", "a" or ""). The comma makes it a tuple.
# NOTE(review): `counties` is not defined anywhere above in this notebook —
# presumably it comes from a county-shapes sample dataset; confirm and import it.
counties = {
    code: county for code, county in counties.items() if county["state"] in ("wa",)
}

In [None]:
# Build the per-county geometry and values, then draw them as filled patches.
# NOTE(review): `unemployment`, `palette`, `LogColorMapper` and `figure` are not
# defined or imported anywhere above in this notebook — confirm their source.
county_xs = [county["lons"] for county in counties.values()]
county_ys = [county["lats"] for county in counties.values()]

county_names = [county['name'] for county in counties.values()]
# One rate per county, in the same order as the geometry lists above.
# (This list used to be overwritten right away with the whole `unemployment`
# mapping, so `rate` no longer lined up element-by-element with x / y / name.)
county_rates = [unemployment[county_id] for county_id in counties]
color_mapper = LogColorMapper(palette=palette)

data=dict(
    x=county_xs,
    y=county_ys,
    name=county_names,
    rate=county_rates,
)

TOOLS = "pan,wheel_zoom,reset,hover,save"

# NOTE(review): the title says New York, but the cell that filters `counties`
# keeps the state code "wa" — confirm which one is intended.
p = figure(
    title="New York Unemployment, 2009", tools=TOOLS,
    x_axis_location=None, y_axis_location=None,
    tooltips=[
        ("Name", "@name"), ("Unemployment rate", "@rate%"), ("(Long, Lat)", "($x, $y)")
    ])
p.grid.grid_line_color = None
p.hover.point_policy = "follow_mouse"

# Fill colour is driven by the `rate` field through the colour mapper.
p.patches('x', 'y', source=data,
          fill_color={'field': 'rate', 'transform': color_mapper},
          fill_alpha=0.7, line_color="white", line_width=0.5)

In [None]:
show(p)

### USA county-level data

In [None]:
# County-level rows for the United States only.
us_county_rows = (rawtimeseries['country'] == 'United States') & (rawtimeseries['level'] == 'county')
USA = rawtimeseries[us_county_rows]

### NY for proof of concept

In [None]:
# New York rows of the county-level frame; the cell displays how many there are.
NY = USA.loc[USA['state'] == 'New York']
NY.shape[0]

In [None]:
NY['county'].nunique()

In [None]:
NY['county'].value_counts()

In [None]:
# Number of rows whose `deaths` value is an explicit zero.
# When this is 0 the data never records a zero, so a true zero cannot be told
# apart from "not recorded".
(NY['deaths'] == 0).sum()

In [None]:
# Two-county sample. Despite the name, the frame holds Wyoming County as well
# as Rockland County.
sample_counties = ['Rockland County', 'Wyoming County']
Rockland = NY.loc[NY['county'].isin(sample_counties)]

In [None]:
len(Rockland)

In [None]:
Rockland.dtypes

In [None]:
# f = Rockland[['date','county','cases','deaths']].set_index(['county','date'])
# Cases and deaths for every NY county, indexed by (county, date).
columns_of_interest = ['date', 'county', 'cases', 'deaths']
f = NY[columns_of_interest].set_index(['county', 'date'])

In [None]:
f

### Data Cleaning

Notice that all the counts are cumulative since the beginning of recording, so we will have to take first differences to get the actual number of cases and deaths per period.

The daily cumulative numbers have some inaccuracies: 
1. Values are missing at the beginning and in the middle of some time series (see Wyoming County below). Missing values at the beginning can be treated as zero; missing values in the middle cannot be zero, because the counts are cumulative.  
1. Some cumulative numbers drop from one day to the next (see Rockland County below), which should not happen for a running total.

We have to do the following
1. Roll up to weekly  
1. Clean up NAs 
1. Clean up drops in cumulative numbers

It is probably easiest to roll up to weekly numbers first (not really a roll-up: because the counts are cumulative, we simply take the value on one fixed day of every week — the code below uses Saturdays). This will alleviate some of the other issues. 

In [None]:
f.loc['Wyoming County',:].plot()

In [None]:
f.loc['Rockland County',:].plot()

#### Missing values 

There are two types of missing values that I want to treat differently: 
1. Missing values at the beginning of a time series 
2. Missing values within a time series

Wyoming County is a good example of both of these.

I will transform these as follows:
1. Missing values at the beginning of a series: fill in with zero.
1. Missing values within a series: fill in with the most recent cumulative count (presumably the day before).

Because a forward fill leaves only the leading NAs untouched, we are going to do these in the opposite order (start with #2).

In [None]:
# Rebuild `f` as a flat frame (no index set) for the cleaning steps.
# f = Rockland[['date','county','cases','deaths']]
f = NY.loc[:, ['date', 'county', 'cases', 'deaths']]

In [None]:
f.head()

In [None]:
pd.set_option('display.max_rows', None)

# Step 2: within each county, carry the most recent non-missing value forward.
# NAs at the start of a county's series have nothing earlier to copy and stay
# NA for step 1.
#
# groupby(...).fillna(method='ffill') is deprecated in recent pandas and,
# depending on the version, returns the frame without the `county` column that
# the following cells filter on. Filling only the count columns and assigning
# them back keeps `date` and `county` intact on every version.
# The explicit copy avoids writing into a slice of `NY`.
# NOTE(review): assumes rows are in chronological order within each county — confirm.
f = f.copy()
count_columns = ['cases', 'deaths']
f[count_columns] = f.groupby('county')[count_columns].ffill()

In [None]:
f[f.county == 'Wyoming County']

In [None]:
# Step 1: any NA still present after the forward fill had no earlier value to
# copy (start of a time series) — replace it with zero.
f = f.fillna(0.0)

In [None]:
f[f.county == 'Albany County']

#### Filter to Saturdays

Create a filter for all the Saturdays, so that one observation per week is kept.  

In [None]:
# sundays = pd.date_range('2019-12-29', '2021-01-31', freq='W-SUN')
# Weekly anchor dates: every Saturday from 2019-12-28 through 2021-01-30.
saturdays = pd.date_range(start='2019-12-28', end='2021-01-30', freq='W-SAT')

In [None]:
# newf = f[f.date.isin(sundays)]
# Keep only the rows dated on one of the weekly anchor days.
on_anchor_day = f['date'].isin(saturdays)
f = f[on_anchor_day]

In [None]:
f

### Calculate the weekly cases and deaths by differencing the cumulative counts

In [None]:
# Weekly counts = this row's cumulative value minus the previous row's value
# for the same county (rows are one week apart after the weekly filter).
# The first row of each county has no predecessor, so its weekly value is NaN.
#
# `f` was produced by boolean filtering; adding columns to such a slice raises
# pandas' SettingWithCopyWarning, so work on an explicit copy.
f = f.copy()

# `prev_value` is a scratch column: it is reused for cases and then for deaths.
f['prev_value'] = f.groupby('county')['cases'].shift(1)
f['weekly_cases'] = f['cases'] - f['prev_value']

f['prev_value'] = f.groupby('county')['deaths'].shift(1)
f['weekly_deaths'] = f['deaths'] - f['prev_value']

In [None]:
f

In [None]:
# Weekly series only, indexed by (county, date).
trend_columns = ['date', 'county', 'weekly_cases', 'weekly_deaths']
df_trend = f.loc[:, trend_columns].set_index(['county', 'date'])

In [None]:
df_trend.loc['Rockland County',:].plot()

In [None]:
df_trend.loc['Wyoming County',:].plot()

### Week over week

In [None]:
# Week-over-week change in weekly cases, as a fraction of the previous week.
df_trend['prev_value'] = df_trend.groupby('county')['weekly_cases'].shift(1)
df_trend['cases_wow'] = (df_trend['weekly_cases']-df_trend['prev_value'])/df_trend['prev_value']
# NaN (0/0, or a missing current/previous week) is treated as no change.
df_trend.loc[np.isnan(df_trend['cases_wow']), 'cases_wow'] = 0.0
# Non-finite results (a non-zero value divided by zero) get the sentinel 100.0,
# which falls in the '100%+' bin below.
df_trend.loc[~np.isfinite(df_trend['cases_wow']), 'cases_wow'] = 100.0

# include_lowest=True closes the first bin on the left. Without it the bin is
# (-1, -0.5], so a change of exactly -1.0 (weekly cases dropping to zero)
# received no label at all.
df_trend['cases_wow2'] = pd.cut(
    df_trend['cases_wow'],
    bins=[-1,-0.5,-0.05,0.05,.5,1,99999],
    labels=['-100% to -50%','-50% to -5%','no change','5% to 50%','50% to 100%','100%+'],
    include_lowest=True,
)

In [None]:
# TODO: need to move this into a function to rerun...
# Week-over-week change in weekly deaths, as a fraction of the previous week.
df_trend['prev_value'] = df_trend.groupby('county')['weekly_deaths'].shift(1)
df_trend['deaths_wow'] = (df_trend['weekly_deaths']-df_trend['prev_value'])/df_trend['prev_value']
# 0/0 (or a missing current/previous week) results in NaN; we define those as 0% growth
df_trend.loc[np.isnan(df_trend['deaths_wow']), 'deaths_wow'] = 0.0
# number/0 results in Inf; we replace it with the sentinel 100.0 — the same
# value the cases calculation uses — which falls in the '100%+' bin below.
# (This cell previously used 110.0, which disagreed with the cases cell.)
df_trend.loc[~np.isfinite(df_trend['deaths_wow']), 'deaths_wow'] = 100.0

# include_lowest=True closes the first bin on the left. Without it the bin is
# (-1, -0.5], so a change of exactly -1.0 (weekly deaths dropping to zero)
# received no label at all.
df_trend['deaths_wow2'] = pd.cut(
    df_trend['deaths_wow'],
    bins=[-1,-0.5,-0.05,0.05,.5,1,99999],
    labels=['-100% to -50%','-50% to -5%','no change','5% to 50%','50% to 100%','100%+'],
    include_lowest=True,
)

In [None]:
df_trend

## Get the rolling trend on the weekly data

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

def rolling_trend_linear(y):
    """Return the least-squares slope of ``y`` against its position 0, 1, ..., len(y)-1.

    Written as the callable for ``Rolling.apply``, which needs exactly one
    scalar per window. The earlier version returned a one-element coefficient
    array on success and ``None`` on any failure.

    Parameters
    ----------
    y : array-like, 1-D
        Values of one window. Positions are used as the regressor, so the
        values are assumed to be equally spaced with no gaps (every week
        present).

    Returns
    -------
    float
        Slope per step (per week for weekly data). ``nan`` when the window is
        empty, not numeric, or contains NaN/inf; ``0.0`` for a single value.
    """
    try:
        values = np.asarray(y, dtype=float).ravel()
    except (TypeError, ValueError):
        # Not convertible to numbers: report "no trend available" instead of
        # swallowing every exception and returning None.
        return np.nan

    if values.size == 0 or not np.isfinite(values).all():
        return np.nan

    # For ordinary least squares with an intercept the slope is
    # sum(xc * y) / sum(xc ** 2), where xc is the centred regressor.
    steps = np.arange(values.size, dtype=float)
    centred = steps - steps.mean()
    spread = np.dot(centred, centred)
    if spread == 0.0:
        # A single observation carries no trend information.
        return 0.0
    return float(np.dot(centred, values) / spread)


# incomplete
def rolling_trend_logistic(y):
    """Placeholder for a logistic trend; currently returns the linear trend.

    No logistic fit is implemented yet. The body used to be a verbatim copy of
    ``rolling_trend_linear``; it now delegates to that function so the two
    cannot drift apart while this one remains unfinished.

    Parameters
    ----------
    y : array-like, 1-D
        Values of one rolling window.

    Returns
    -------
    Whatever ``rolling_trend_linear(y)`` returns.
    """
    # TODO: replace with an actual logistic fit.
    return rolling_trend_linear(y)

In [None]:
# Use rolling windows to apply the linear regression: 4-week trend of weekly
# cases, computed separately for every county.
# A plain df_trend.rolling(4) slides over the whole frame, so the first windows
# of each county would mix in the last weeks of the county stored just before it.
# transform() returns a Series aligned with df_trend's (county, date) index.
temp = df_trend.groupby(level='county')['weekly_cases'].transform(
    lambda weekly: weekly.rolling(4).apply(rolling_trend_linear, raw=True)
)
temp = temp.to_frame().rename(columns={'weekly_cases' : 'cases_4w_trend'})
# type(temp)

In [None]:
temp

In [None]:
# Attach the 4-week cases trend to the weekly table, matching on county and date.
df_new = df_trend.merge(temp, how='left', on=['county', 'date'])

In [None]:
# Use rolling windows to apply the linear regression: 4-week trend of weekly
# deaths, computed separately for every county.
# A plain df_trend.rolling(4) slides over the whole frame, so the first windows
# of each county would mix in the last weeks of the county stored just before it.
# transform() returns a Series aligned with df_trend's (county, date) index.
temp = df_trend.groupby(level='county')['weekly_deaths'].transform(
    lambda weekly: weekly.rolling(4).apply(rolling_trend_linear, raw=True)
)
temp = temp.to_frame().rename(columns={'weekly_deaths' : 'deaths_4w_trend'})

In [None]:
temp

In [None]:
# Attach the 4-week deaths trend as well, matching on county and date.
df_new = df_new.merge(temp, how='left', on=['county', 'date'])

In [None]:
df_new

In [None]:
# Turn every index level into an ordinary column.
# Resetting only level 0 moved `county` out of the index but left `date` in it,
# and the export cell filters on `df_new.date`, which needs `date` as a column.
# Rebinding instead of inplace=True keeps the operation explicit.
df_new = df_new.reset_index()

In [None]:
# Write only the rows whose `date` equals 2020-06-27 to a CSV file in the
# working directory.
# NOTE(review): `df_new.date` requires `date` to be a column (not an index
# level) at this point — confirm.
df_new[df_new.date == '2020-06-27'].to_csv('NY_trends_6-27.csv')

In [None]:
# Write the full trend table to a CSV file in the working directory.
df_new.to_csv('NY_county_covid_trends.csv')

## Old code

In [None]:
def countna(array_like):
    """Sum ``array_like``, or return ``None`` if any element is NaN.

    Parameters
    ----------
    array_like : iterable of numbers
        Values to add up; must be accepted by ``np.isnan``.

    Returns
    -------
    The built-in ``sum`` of the values when none of them is NaN, otherwise ``None``.
    """
    has_missing = any(np.isnan(array_like))
    return None if has_missing else sum(array_like)

# logic = {'deaths' : 'sum','missings' : countna}

# Six days back. pd.Timedelta is the public pandas type and a subclass of
# datetime.timedelta. `pd.offsets.timedelta` only existed because an older
# pandas module happened to import datetime.timedelta; current pandas no
# longer exposes it.
offset = pd.Timedelta(days=-6)