In [1]:
import pandas as pd
import numpy as np
import json
import statsmodels.api as sm


  import pandas.util.testing as tm


In [140]:
us = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/US_daily_cleaned.csv')
us[(us.RegionCode == 'US_AK')&(us.date == '2021-02-17')].positive

0    54799.0
Name: positive, dtype: float64

In [139]:
tracking[(tracking.state == 'AK')&(us.date == '2021-02-16')].positive

56    54799.0
Name: positive, dtype: float64

In [2]:

mobility = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/US_mobility_state_cleaned.csv')
tracking = pd.read_csv('/content/drive/MyDrive/Citadel/data/3_covidtracking/all-states-history.csv')
with open('/content/drive/MyDrive/Citadel/data/new_folder/states_hash.json', 'r+') as file:
    states_hash = json.load(file)
states_hash = { v:k for (k,v) in states_hash.items()}
mobility['state'] = mobility['state'].map(states_hash)
df = tracking.merge(mobility, on = ['date','state'], suffixes = ['_tracking', '_mobility'])

df.isna().mean().sort_values().tail(5)

onVentilatorCumulative         0.930866
positiveTestsPeopleAntibody    0.942321
negativeTestsPeopleAntibody    0.948020
totalTestsPeopleAntigen        0.949114
positiveTestsPeopleAntigen     0.967534
dtype: float64

### the metrics to use: deathIncrease, hospitalizedIncrease, inIcuCurrently, onVentilatorCurrently, PositiveRate

In [3]:
y_var_candidates = ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently']

In [4]:

df = df.sort_values(by = ['state', 'date'])
df['date'] = pd.to_datetime(df['date'])
df['day_since_record'] = df.groupby('state')['date'].rank(ascending = True, method = 'first')
df.head(1)

Unnamed: 0.1,date,state,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,hospitalizedIncrease,inIcuCumulative,inIcuCurrently,negative,negativeIncrease,negativeTestsAntibody,negativeTestsPeopleAntibody,negativeTestsViral,onVentilatorCumulative,onVentilatorCurrently,positive,positiveCasesViral,positiveIncrease,positiveScore,positiveTestsAntibody,positiveTestsAntigen,positiveTestsPeopleAntibody,positiveTestsPeopleAntigen,positiveTestsViral,recovered,totalTestEncountersViral,totalTestEncountersViralIncrease,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease,Unnamed: 0,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,day_since_record
17150,2020-03-06,AK,0.0,,0,,,,,0,,,,0,,,,,,,,0,0,,,,,,,,0,8.0,0,,,,,,0,8.0,8,23119,9.0,4.0,10.0,9.0,0.0,1.0,1.0


In [5]:
#since the death nan data date are at the beginning of the pandemic, it is likely it has not been recorded. Set it to 0
df[df['death'].isna()].date.unique().max()

numpy.datetime64('2020-03-31T00:00:00.000000000')

In [47]:
X_cols = [x for x in mobility.columns.values if x not in ['Unnamed: 0', 'state', 'date']]
X_cols.append('day_since_record')
X_cols

['retail_and_recreation_percent_change_from_baseline',
 'grocery_and_pharmacy_percent_change_from_baseline',
 'parks_percent_change_from_baseline',
 'transit_stations_percent_change_from_baseline',
 'workplaces_percent_change_from_baseline',
 'residential_percent_change_from_baseline',
 'day_since_record']

In [7]:
y_var_candidates

['deathIncrease',
 'hospitalizedIncrease',
 'inIcuCurrently',
 'onVentilatorCurrently']

# An overall OLS regression analysis on all state data

In [8]:
# ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently', 'PositiveRate']
y_variable = 'inIcuCurrently'
y = df[y_variable].fillna(0).values
x = df[X_cols].fillna(0).values
x = sm.add_constant(x)
model = sm.OLS(y,x)
results = model.fit()
print(results.summary2())
print(results.pvalues)

                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.106      
Dependent Variable: y                AIC:                263130.9657
Date:               2021-02-19 18:41 BIC:                263193.0666
No. Observations:   17372            Log-Likelihood:     -1.3156e+05
Df Model:           7                F-statistic:        294.5      
Df Residuals:       17364            Prob (F-statistic): 0.00       
R-squared:          0.106            Scale:              2.2157e+05 
---------------------------------------------------------------------
            Coef.    Std.Err.     t      P>|t|     [0.025     0.975] 
---------------------------------------------------------------------
const     -171.7756   13.8431  -12.4087  0.0000  -198.9095  -144.6417
x1          -7.9304    0.5120  -15.4895  0.0000    -8.9340    -6.9269
x2           5.3920    0.5515    9.7761  0.0000     4.3109     6.4730
x3          -0.2042    0.0608   -3.3577  0.0008

# Individual State OLS 

In [9]:
from tqdm import tqdm
# y in  ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently', 'PositiveRate']
y_variable = 'hospitalizedIncrease'
coeffs = []
p_values = []
states = df.state.unique()
for state in tqdm(states):
    if y_variable == 'PositiveRate':
        df_reg = df[(df['state'] == state)&(df['PositiveRate']!=np.inf)]
    df_reg = df[df['state'] == state]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    x = sm.add_constant(x)
    model = sm.OLS(y,x)
    results = model.fit()
    coeffs.append(results.params)
    p_values.append(results.pvalues)


  return self.params / self.bse
100%|██████████| 50/50 [00:00<00:00, 235.25it/s]


In [83]:
#modeling on week
df['week'] = df['day_since_record'].apply(lambda x: x//7)
weekly_df = df.groupby(['state','week']).mean().reset_index()

weekly_df.head(1)

Unnamed: 0.1,state,week,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,hospitalizedIncrease,inIcuCumulative,inIcuCurrently,negative,negativeIncrease,negativeTestsAntibody,negativeTestsPeopleAntibody,negativeTestsViral,onVentilatorCumulative,onVentilatorCurrently,positive,positiveCasesViral,positiveIncrease,positiveScore,positiveTestsAntibody,positiveTestsAntigen,positiveTestsPeopleAntibody,positiveTestsPeopleAntigen,positiveTestsViral,recovered,totalTestEncountersViral,totalTestEncountersViralIncrease,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease,Unnamed: 0,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,day_since_record
0,AK,0.0,0.0,,0.0,,1.0,1.0,,0.166667,,,,0.0,,,,,,,,0.0,0.0,,,,,,,,0.0,21.0,6.333333,,,,,,0.0,21.0,7.666667,23121.5,11.0,6.833333,17.0,5.833333,-3.333333,0.833333,3.5


In [78]:
df['date'] = pd.to_datetime(df['date'])
df['smoothed_' + y_variable] = df.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values

array([ 0.,  0.,  0., ..., 23., 23., 23.])

In [86]:
from tqdm import tqdm
pd.options.mode.chained_assignment = None 
# y in  ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently', 'PositiveRate']
y_variable = 'deathIncrease'
lag = 14
coeffs = []
p_values = []
r_squared_adjs = []
states = df.state.unique()
smoothing = True

# data = weekly_df
data = df
data = data.copy()

if smoothing == True:
# 7 day rolling
    data[y_variable] = data.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values


for state in tqdm(states):
    if y_variable == 'PositiveRate':
        df_reg = data[(data['state'] == state)&(data['PositiveRate']!=np.inf)]
    df_reg = data[data['state'] == state]
    df_reg[y_variable] = data[y_variable].shift(lag)
    df_reg = df_reg[df_reg[y_variable].isna() == False]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    x = sm.add_constant(x)
    model = sm.OLS(y,x)
    results = model.fit()
    coeffs.append(results.params)
    p_values.append(results.pvalues)
    r_squared_adjs.append(results.rsquared_adj)
df_pvalues = pd.DataFrame(p_values, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))
df_r_square_adj = pd.DataFrame(r_squared_adjs, index = states).apply(lambda x: round(x, 4))
df_params = pd.DataFrame(coeffs, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))


100%|██████████| 50/50 [00:00<00:00, 167.56it/s]


In [87]:
# how many percent statistically significant
df_pvalues.applymap(lambda x: 1 if x<0.05 else 0).mean()

constant                                              0.74
retail_and_recreation_percent_change_from_baseline    0.50
grocery_and_pharmacy_percent_change_from_baseline     0.46
parks_percent_change_from_baseline                    0.54
transit_stations_percent_change_from_baseline         0.62
workplaces_percent_change_from_baseline               0.68
residential_percent_change_from_baseline              0.50
day_since_record                                      0.82
dtype: float64

In [89]:
df_r_square_adj.mean()

0    0.427398
dtype: float64

### Experiment on Fixed Effect model(Currently Working On)

In [91]:
!pip install linearmodels

Collecting linearmodels
[?25l  Downloading https://files.pythonhosted.org/packages/23/b6/7f050705cf7fc988863a8676c7e361946ee5972fb2b099907f82954a7021/linearmodels-4.19.tar.gz (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.9MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting statsmodels>=0.11
[?25l  Downloading https://files.pythonhosted.org/packages/0d/7b/c17815648dc31396af865b9c6627cc3f95705954e30f61106795361c39ee/statsmodels-0.12.2-cp36-cp36m-manylinux1_x86_64.whl (9.5MB)
[K     |████████████████████████████████| 9.5MB 31.6MB/s 
[?25hCollecting mypy-extensions>=0.4
  Downloading https://files.pythonhosted.org/packages/5c/eb/975c7c080f3223a5cdaff09612f3a5221e4ba534f7039db34c35d95fa6a5/mypy_extensions-0.4.3-py2.py3-none-any.whl
Collecting pyhdfe>=0.1
  Downloading https://files.pythonhosted.org/packages/81/38/a53257196401029dd34c83c0528de2

In [94]:
X_cols

['retail_and_recreation_percent_change_from_baseline',
 'grocery_and_pharmacy_percent_change_from_baseline',
 'parks_percent_change_from_baseline',
 'transit_stations_percent_change_from_baseline',
 'workplaces_percent_change_from_baseline',
 'residential_percent_change_from_baseline',
 'day_since_record']

In [122]:
from linearmodels import RandomEffects, PanelOLS
df_reg = data[data['state'] == 'AK']
df_reg = df_reg.set_index('date')
exog = sm.add_constant(df_reg['workplaces_percent_change_from_baseline'])
endog = df_reg['deathIncrease']
# random effects model
# model_re = RandomEffects(endog, exog) 
# re_res = model_re.fit() 
# fixed effects model
model_fe = PanelOLS(endog, exog, entity_effects = True, time_effects=True) 
fe_res = model_fe.fit() 
#print results
print(re_res)
print(fe_res)

ValueError: ignored

In [109]:
len(exog )

17372

In [110]:
exog 

array([[  1.,   0.],
       [  1.,   3.],
       [  1.,   2.],
       ...,
       [  1., -18.],
       [  1., -17.],
       [  1., -19.]])

In [111]:
endog

array([[ 0.],
       [ 0.],
       [ 0.],
       ...,
       [23.],
       [23.],
       [23.]])