In [1]:
import pandas as pd
import numpy as np
import json
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px

In [2]:
policy_cols = ['C1_School closing', 'C1_Flag',
       'C2_Workplace closing', 'C2_Flag', 'C3_Cancel public events',
       'C3_Flag', 'C4_Restrictions on gatherings', 'C4_Flag',
       'C5_Close public transport', 'C5_Flag',
       'C6_Stay at home requirements', 'C6_Flag',
       'C7_Restrictions on internal movement', 'C7_Flag',
       'C8_International travel controls']
policy = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/daily_df.csv')[policy_cols + ['date', 'state']].fillna(0)


In [3]:

mobility = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/US_mobility_state_cleaned.csv')
tracking = pd.read_csv('/content/drive/MyDrive/Citadel/data/3_covidtracking/all-states-history.csv')
temperature = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/us_cross_sectional.csv')[['state', 'AverageTemperature']]
urbanization = pd.read_csv('/content/drive/MyDrive/Citadel/data/new_folder/educationStateDataset.csv')[['State', 'Urbanization %']].rename(columns = {'State':'state'})


with open('/content/drive/MyDrive/Citadel/data/new_folder/states_hash.json', 'r+') as file:
    states_hash = json.load(file)
states_hash = { v:k for (k,v) in states_hash.items()}
mobility['state'] = mobility['state'].map(states_hash)
temperature['state'] = temperature['state'].map(states_hash)
df = tracking.merge(mobility, on = ['date','state'], suffixes = ['_tracking', '_mobility'])
df = df.merge(temperature, on ='state', how = 'left' )
df = df.merge(urbanization, on ='state', how = 'left' )
df = df.merge(policy, on =['state', 'date'], how = 'left' )

#tentative


df.isna().mean().sort_values().tail(5)

C2_Workplace closing                1.0
C1_Flag                             1.0
C1_School closing                   1.0
C4_Restrictions on gatherings       1.0
C8_International travel controls    1.0
dtype: float64

### the metrics to use: deathIncrease, hospitalizedIncrease, inIcuCurrently, onVentilatorCurrently, PositiveRate

In [4]:

df = df.sort_values(by = ['state', 'date'])
df['date'] = pd.to_datetime(df['date'])
df['day_since_record'] = df.groupby('state')['date'].rank(ascending = True, method = 'first')
df['resident'] = df.groupby('state')['residential_percent_change_from_baseline'].cumprod()
df.head(1)

Unnamed: 0.1,date,state,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,hospitalizedIncrease,inIcuCumulative,inIcuCurrently,negative,negativeIncrease,negativeTestsAntibody,negativeTestsPeopleAntibody,negativeTestsViral,onVentilatorCumulative,onVentilatorCurrently,positive,positiveCasesViral,positiveIncrease,positiveScore,positiveTestsAntibody,positiveTestsAntigen,positiveTestsPeopleAntibody,positiveTestsPeopleAntigen,positiveTestsViral,recovered,totalTestEncountersViral,totalTestEncountersViralIncrease,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease,Unnamed: 0,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,AverageTemperature,Urbanization %,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,day_since_record,resident
17150,2020-03-06,AK,0.0,,0,,,,,0,,,,0,,,,,,,,0,0,,,,,,,,0,8.0,0,,,,,,0,8.0,8,23119,9.0,4.0,10.0,9.0,0.0,1.0,-3.0,66.0,,,,,,,,,,,,,,,,1.0,1.0


In [5]:
df['deathIncreaseDiff'] = df['deathIncrease'] - df['deathIncrease'].shift(1)
df[['deathIncreaseDiff', 'residential_percent_change_from_baseline']].corr()

Unnamed: 0,deathIncreaseDiff,residential_percent_change_from_baseline
deathIncreaseDiff,1.0,0.047458
residential_percent_change_from_baseline,0.047458,1.0


In [6]:
#since the death nan data date are at the beginning of the pandemic, it is likely it has not been recorded. Set it to 0
df[df['death'].isna()].date.unique().max()

numpy.datetime64('2020-03-31T00:00:00.000000000')

# An overall OLS regression analysis on all state data

In [7]:
X_cols = ['AverageTemperature', 'Urbanization %']
base = ['retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline','transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline','residential_percent_change_from_baseline']
X_cols = base + X_cols 

# X_cols = base
X_cols

['retail_and_recreation_percent_change_from_baseline',
 'grocery_and_pharmacy_percent_change_from_baseline',
 'parks_percent_change_from_baseline',
 'transit_stations_percent_change_from_baseline',
 'workplaces_percent_change_from_baseline',
 'residential_percent_change_from_baseline',
 'AverageTemperature',
 'Urbanization %']

In [8]:
df.head(1)

Unnamed: 0.1,date,state,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,hospitalizedIncrease,inIcuCumulative,inIcuCurrently,negative,negativeIncrease,negativeTestsAntibody,negativeTestsPeopleAntibody,negativeTestsViral,onVentilatorCumulative,onVentilatorCurrently,positive,positiveCasesViral,positiveIncrease,positiveScore,positiveTestsAntibody,positiveTestsAntigen,positiveTestsPeopleAntibody,positiveTestsPeopleAntigen,positiveTestsViral,recovered,totalTestEncountersViral,totalTestEncountersViralIncrease,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease,Unnamed: 0,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,AverageTemperature,Urbanization %,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,C4_Flag,C5_Close public transport,C5_Flag,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,day_since_record,resident,deathIncreaseDiff
17150,2020-03-06,AK,0.0,,0,,,,,0,,,,0,,,,,,,,0,0,,,,,,,,0,8.0,0,,,,,,0,8.0,8,23119,9.0,4.0,10.0,9.0,0.0,1.0,-3.0,66.0,,,,,,,,,,,,,,,,1.0,1.0,


In [9]:
# df= df[df.date > pd.to_datetime('2020-04-01')]

In [10]:
px.line(data_frame = df[df.state == 'CA'], y = 'residential_percent_change_from_baseline', x = 'date')

In [11]:
# ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently']

from tqdm import tqdm
pd.options.mode.chained_assignment = None 

y_variable_options = ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently']
lag = -120
smoothing = True

# data = weekly_df
data = df
data = data.copy()



for y_variable in tqdm(y_variable_options):
    if smoothing == True:
        # 7 day rolling
        data = data.sort_values(by = ['state','date'])
        data[y_variable] = data.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values
        for x_col in X_cols:
            if x_col.split('_')[-1] == 'baseline':
                data[x_col] = data.groupby('state')[x_col].rolling(7, min_periods = 1).sum().values

    df_reg = data.copy()
    df_reg[y_variable] = df_reg[y_variable].shift(lag)
    df_reg = df_reg[df_reg[y_variable].isna() == False]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    x = sm.add_constant(x)
    model = sm.OLS(y,x)
    results = model.fit()
    print(results.summary())


 50%|█████     | 2/4 [00:00<00:00,  7.11it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.087
Method:                 Least Squares   F-statistic:                     206.8
Date:                Sun, 21 Feb 2021   Prob (F-statistic):               0.00
Time:                        22:53:02   Log-Likelihood:            -1.2513e+05
No. Observations:               17252   AIC:                         2.503e+05
Df Residuals:                   17243   BIC:                         2.503e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -196.4622     17.472    -11.244      0.0

100%|██████████| 4/4 [00:00<00:00,  6.76it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     157.6
Date:                Sun, 21 Feb 2021   Prob (F-statistic):          7.47e-251
Time:                        22:53:02   Log-Likelihood:                -95154.
No. Observations:                9798   AIC:                         1.903e+05
Df Residuals:                    9789   BIC:                         1.904e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3883.5391    287.295    -13.518      0.0




# Individual State OLS 

In [11]:
#modeling on week
# df['week'] = df['day_since_record'].apply(lambda x: x//7)
# weekly_df = df.groupby(['state','week']).mean().reset_index()
# weekly_df.head(1)

In [12]:
X_cols

['AverageTemperature',
 'Urbanization %',
 'retail_and_recreation_percent_change_from_baseline',
 'grocery_and_pharmacy_percent_change_from_baseline',
 'parks_percent_change_from_baseline',
 'transit_stations_percent_change_from_baseline',
 'workplaces_percent_change_from_baseline',
 'residential_percent_change_from_baseline']

In [12]:
from tqdm import tqdm
pd.options.mode.chained_assignment = None 
# y in  ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently', 'PositiveRate']
y_variable = 'deathIncrease'
lag = -120
coeffs = []
p_values = []
r_squared_adjs = []
states = df.state.unique()
smoothing = True

# data = weekly_df
data = df
data = data.copy()

if smoothing == True:
# 7 day rolling
    data[y_variable] = data.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values
    for x_col in X_cols:
        if x_col.split('_')[-1] == 'baseline':
            data[x_col] = data.groupby('state')[x_col].rolling(7, min_periods = 1).sum().values


for state in tqdm(states):
    if y_variable == 'PositiveRate':
        df_reg = data[(data['state'] == state)&(data['PositiveRate']!=np.inf)]
    df_reg = data[data['state'] == state]
    df_reg[y_variable] = data[y_variable].shift(lag)
    df_reg = df_reg[df_reg[y_variable].isna() == False]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    # x = df_reg[['residential_percent_change_from_baseline', 'day_since_record', 'Density']].fillna(0).values
    x = sm.add_constant(x, has_constant='add')
    model = sm.OLS(y,x)
    results = model.fit()
    coeffs.append(results.params)
    p_values.append(results.pvalues)
    r_squared_adjs.append(results.rsquared_adj)
df_pvalues = pd.DataFrame(p_values, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))
# df_pvalues = pd.DataFrame(p_values, index = states, columns = ['constant', 'residential_percent_change_from_baseline', 'day_since_record', 'Density'] ).apply(lambda x: round(x, 4))
df_r_square_adj = pd.DataFrame(r_squared_adjs, index = states).apply(lambda x: round(x, 4))
df_params = pd.DataFrame(coeffs, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))


100%|██████████| 50/50 [00:00<00:00, 109.50it/s]


In [13]:
# how many percent statistically significant
df_pvalues.applymap(lambda x: 1 if x<0.05 else 0).mean().mean()

0.7466666666666666

In [15]:
df_params.describe()

Unnamed: 0,constant,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,AverageTemperature,Urbanization %
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.024658,-0.170734,-0.54176,-0.356562,0.581712,-1.975228,-5.194228,0.307664,1.877654
std,0.043067,5.146793,1.227513,3.136257,3.64786,3.893417,11.520015,0.56173,3.557995
min,-0.0784,-18.9096,-4.6253,-20.8028,-4.8319,-18.7088,-41.9098,-0.5144,-4.7284
25%,0.0053,-1.4284,-0.594,-0.100825,-0.535875,-2.542875,-7.942675,0.0518,0.4455
50%,0.01495,-0.3198,-0.25205,-0.00925,-0.02425,-1.2667,-5.1133,0.15955,0.9792
75%,0.032675,0.62335,-0.070425,0.181275,0.473675,-0.49805,-0.26715,0.395375,2.226025
max,0.2301,20.3851,2.7987,5.5132,17.8097,7.9838,41.9924,3.5022,21.8602


In [161]:
pval_plot = df_pvalues.applymap(lambda x: 1 if x<0.05 else 0).mean().reset_index()
px.bar(data_frame=pval_plot, x=0, y='index', orientation='h')
# , title='Percentage of states with P-values less than 0.05'

In [17]:
df_r_square_adj.mean()

0    0.52499
dtype: float64

#PCA to reduce Multicollinearity

In [16]:
from sklearn.decomposition import PCA
X_cols = ['mobility','AverageTemperature', 'Urbanization %', 'day_since_record']
pca = PCA(1)
base = ['retail_and_recreation_percent_change_from_baseline','grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline','transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline','residential_percent_change_from_baseline']
vars = df[base].fillna(0)
pca.fit(vars)
print('pca explained variance ratio:', pca.explained_variance_ratio_)
df['mobility'] = pca.transform(vars)

pca explained variance ratio: [0.87925078]


In [17]:
df[X_cols].corr()

Unnamed: 0,mobility,AverageTemperature,Urbanization %,day_since_record
mobility,1.0,-0.327753,-0.2163,-0.243754
AverageTemperature,-0.327753,1.0,0.230669,0.002258
Urbanization %,-0.2163,0.230669,1.0,0.011922
day_since_record,-0.243754,0.002258,0.011922,1.0


#### All states together

In [18]:
X_cols

['mobility', 'AverageTemperature', 'Urbanization %', 'day_since_record']

In [19]:
df['day_since_record_square'] = df['day_since_record'].apply(lambda x:x**2)
df['mobility_temperature'] = df.apply(lambda x:x['mobility'] * x['AverageTemperature'], axis = 1)
df['mobility_day'] = df.apply(lambda x:x['mobility'] * x['day_since_record'], axis = 1)
df['mobility_urban'] = df.apply(lambda x:x['mobility'] * x['Urbanization %'], axis = 1)


In [20]:

from tqdm import tqdm
pd.options.mode.chained_assignment = None 

y_variable_options = ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently']
lag = -90
smoothing = True

# data = weekly_df
data = df
data = data.copy()


for y_variable in tqdm(y_variable_options):
    if smoothing == True:
        # 7 day rolling
        data = data.sort_values(by = ['state','date'])
        data[y_variable] = data.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values
        data['mobility'] = data.groupby('state')['mobility'].rolling(7, min_periods = 1).sum().values
        for x_col in X_cols:
            if x_col.split('_')[-1] == 'baseline':
                data[x_col] = data.groupby('state')[x_col].rolling(7, min_periods = 1).sum().values

    df_reg = data.copy()
    df_reg = df_reg.groupby(['state']).apply(lambda x: x.sort_values(by = 'date').shift(lag)).reset_index(drop = True)
    df_reg = df_reg[df_reg[y_variable].notna()]
    df_reg[y_variable] = df_reg[y_variable].shift(lag)
    df_reg = df_reg[df_reg[y_variable].isna() == False]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    x = sm.add_constant(x)
    model = sm.OLS(y,x)
    results = model.fit()
    print(results.summary())


 25%|██▌       | 1/4 [00:00<00:00,  4.87it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.085
Model:                            OLS   Adj. R-squared:                  0.085
Method:                 Least Squares   F-statistic:                     296.0
Date:                Sun, 21 Feb 2021   Prob (F-statistic):          6.63e-244
Time:                        22:54:44   Log-Likelihood:                -91777.
No. Observations:               12782   AIC:                         1.836e+05
Df Residuals:                   12777   BIC:                         1.836e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -293.7326     22.388    -13.120      0.0

 50%|█████     | 2/4 [00:00<00:00,  4.78it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     101.2
Date:                Sun, 21 Feb 2021   Prob (F-statistic):           5.21e-85
Time:                        22:54:44   Log-Likelihood:            -1.0278e+05
No. Observations:               12782   AIC:                         2.056e+05
Df Residuals:                   12777   BIC:                         2.056e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -545.6328     53.182    -10.260      0.0

 75%|███████▌  | 3/4 [00:00<00:00,  4.68it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     155.8
Date:                Sun, 21 Feb 2021   Prob (F-statistic):          7.06e-129
Time:                        22:54:45   Log-Likelihood:                -82365.
No. Observations:                8524   AIC:                         1.647e+05
Df Residuals:                    8519   BIC:                         1.648e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2528.1279    354.195     -7.138      0.0

100%|██████████| 4/4 [00:00<00:00,  4.62it/s]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     54.72
Date:                Sun, 21 Feb 2021   Prob (F-statistic):           1.84e-45
Time:                        22:54:45   Log-Likelihood:                -56199.
No. Observations:                6626   AIC:                         1.124e+05
Df Residuals:                    6621   BIC:                         1.124e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -535.5617    113.869     -4.703      0.0




In [23]:
# X_cols = X_cols + ['mobility_urban']

In [21]:
# X_cols

#### Individual States

In [33]:
from tqdm import tqdm
pd.options.mode.chained_assignment = None 
# y in  ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently', 'PositiveRate']
y_variable = 'deathIncrease'
lag = -120
coeffs = []
p_values = []
r_squared_adjs = []
states = df.state.unique()
smoothing = True

# data = weekly_df
data = df
data = data.copy()

if smoothing == True:
# 7 day rolling
    data[y_variable] = data.groupby('state')[y_variable].rolling(7, min_periods = 1).sum().values
    for x_col in X_cols:
        if x_col.split('_')[-1] == 'baseline':
            data[x_col] = data.groupby('state')[x_col].rolling(7, min_periods = 1).sum().values


for state in tqdm(states):
    df_reg = data[data['state'] == state]
    df_reg[y_variable] = data[y_variable].shift(lag)
    df_reg = df_reg[df_reg[y_variable].isna() == False]
    y = df_reg[y_variable].fillna(0).values
    x = df_reg[X_cols].fillna(0).values
    # x = df_reg[['residential_percent_change_from_baseline', 'day_since_record', 'Density']].fillna(0).values
    x = sm.add_constant(x, has_constant='add')
    model = sm.OLS(y,x)
    results = model.fit()
    coeffs.append(results.params)
    p_values.append(results.pvalues)
    r_squared_adjs.append(results.rsquared_adj)
df_pvalues = pd.DataFrame(p_values, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))
# df_pvalues = pd.DataFrame(p_values, index = states, columns = ['constant', 'residential_percent_change_from_baseline', 'day_since_record', 'Density'] ).apply(lambda x: round(x, 4))
df_r_square_adj = pd.DataFrame(r_squared_adjs, index = states).apply(lambda x: round(x, 4))
df_params = pd.DataFrame(coeffs, index = states, columns = ['constant'] + X_cols).apply(lambda x: round(x, 4))


100%|██████████| 50/50 [00:00<00:00, 161.19it/s]


In [34]:
# how many percent statistically significant
df_pvalues.applymap(lambda x: 1 if x<0.05 else 0).mean()

constant              0.88
mobility              0.86
AverageTemperature    0.88
Urbanization %        0.88
day_since_record      0.86
dtype: float64

In [35]:
df_r_square_adj.mean()

0    0.278192
dtype: float64

In [36]:
df_params.mean()

constant              0.015234
mobility              0.191050
AverageTemperature    0.261468
Urbanization %        1.178634
day_since_record      0.336664
dtype: float64

# Panel Regression(Random Effect)

In [28]:
!pip install linearmodels



In [29]:
df['14days_death'] = df['deathIncrease'].shift(14)
df['14days_death'] = df['14days_death'].fillna(0)

In [32]:
from linearmodels import RandomEffects, PanelOLS
y_variable_options = ['deathIncrease', 'hospitalizedIncrease', 'inIcuCurrently', 'onVentilatorCurrently']
data = df
data = data.copy()
lag = -120

for y_value in y_variable_options:
    df_reg = data
    df_reg = df_reg.groupby(['state']).apply(lambda x: x.sort_values(by = 'date').shift(lag)).reset_index(drop = True)
    df_reg = df_reg[df_reg[y_value].notna()]
    df_reg['date']= pd.to_datetime(df_reg['date'])
    df_reg = df_reg.set_index(['state','date'])
    exog = sm.add_constant(df_reg[['mobility']])
    endog = df_reg[y_value]
    # random effects model
    model_re = RandomEffects(endog, exog) 
    re_res = model_re.fit() 
    # fixed effects model
    model_fe = PanelOLS(endog, exog, entity_effects = True, time_effects=True) 
    fe_res = model_fe.fit() 
    #print results
    # print(re_res)
    # pvlaues.append(re_res.pvalues)

    print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:          deathIncrease   R-squared:                        0.0002
Estimator:                   PanelOLS   R-squared (Between):             -0.0115
No. Observations:               11372   R-squared (Within):              -0.0072
Date:                Sun, Feb 21 2021   R-squared (Overall):             -0.0086
Time:                        22:56:18   Log-likelihood                -6.011e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.7798
Entities:                          50   P-value                           0.1822
Avg Obs:                       227.44   Distribution:                 F(1,11078)
Min Obs:                       223.00                                           
Max Obs:                       244.00   F-statistic (robust):             1.7798
                            

In [None]:
data