In [1]:
from linearmodels.panel import PooledOLS, PanelOLS

import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

from statsmodels.iolib.summary2 import summary_col

## Read in data

In [2]:
# Load the cleaned daily state-level file, parse the `date` column into
# datetimes and use it as the row index.
us_daily = (
    pd.read_csv("data/cleaned/daily_df.csv", index_col=0)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
)

In [3]:
# Collapse the daily panel to state-by-month means, then move the
# (date, state) group keys out of the index and back into ordinary columns.
# The previous version contained a bare `us_monthly` expression in the middle
# of the cell (which displays nothing) and rebuilt the frame through
# `to_records()`; `reset_index()` is the direct way to flatten the index.
us_monthly = (
    us_daily
    .groupby([pd.Grouper(freq='M'), 'state'])
    .mean()
    .reset_index()
)

In [21]:
# State-level misperception shares. The records round-trip turns the CSV's
# index column into a regular first column, which is then labelled `state`.
# NOTE: the variable is spelled `micperceptions` because the merge cell below
# refers to it by that name.
micperceptions = pd.DataFrame(
    pd.read_csv("data/cleaned/misperception_state_inferred.csv", index_col=0).to_records()
)
micperceptions.columns = ["state", "pc_misperception"]
# Strip the trailing '%' from the text values and rescale to a fraction.
micperceptions["pc_misperception"] = (
    micperceptions["pc_misperception"].str.rstrip('%').astype('float') / 100.0
)


In [22]:
# Cross-section: per-state mean of every column over the daily rows dated
# 2020-03-01 through 2021-01-31 (both bounds inclusive).
us_cross_sectional = (
    us_daily[(us_daily.index >= "2020-03-01") & (us_daily.index <= "2021-01-31")]
    .groupby("state")
    .mean()
)

In [23]:
# Attach each state's misperception share; the default inner join keeps only
# states present in both tables.
us_cross_sectional = us_cross_sectional.merge(micperceptions, on="state")

## Disease outcomes

In [24]:
# Show which columns are available on the merged cross-sectional frame.
us_cross_sectional.columns

Index(['state', 'positive', 'probableCases', 'negative', 'pending',
       'totalTestResults', 'hospitalizedCurrently', 'hospitalizedCumulative',
       'inIcuCurrently', 'inIcuCumulative', 'onVentilatorCurrently',
       'onVentilatorCumulative', 'recovered', 'totalTestsViral',
       'positiveTestsViral', 'negativeTestsViral', 'positiveCasesViral',
       'deathConfirmed', 'deathProbable', 'totalTestEncountersViral',
       'totalTestsPeopleViral', 'totalTestsAntibody', 'positiveTestsAntibody',
       'negativeTestsAntibody', 'totalTestsPeopleAntibody',
       'positiveTestsPeopleAntibody', 'negativeTestsPeopleAntibody',
       'totalTestsPeopleAntigen', 'positiveTestsPeopleAntigen',
       'totalTestsAntigen', 'positiveTestsAntigen', 'fips', 'positiveIncrease',
       'negativeIncrease', 'total', 'totalTestResultsIncrease', 'posNeg',
       'dataQualityGrade', 'deathIncrease', 'hospitalizedIncrease',
       'commercialScore', 'negativeRegularScore', 'negativeScore',
       'positive

In [25]:
# Deaths per capita: the state's mean `deathIncrease` divided by `pop_2019`.
us_cross_sectional["deaths_pc"] = us_cross_sectional["deathIncrease"].div(
    us_cross_sectional["pop_2019"]
)

In [26]:
# Hospitalisations per capita: the state's mean `hospitalizedIncrease`
# divided by `pop_2019`.
us_cross_sectional["hospitalized_pc"] = us_cross_sectional["hospitalizedIncrease"].div(
    us_cross_sectional["pop_2019"]
)

In [27]:
# Positivity rate: mean `positiveIncrease` over mean `totalTestResultsIncrease`.
us_cross_sectional["positivity_rate"] = us_cross_sectional["positiveIncrease"].div(
    us_cross_sectional["totalTestResultsIncrease"]
)

In [29]:
# Pairwise correlations between the misperception share and the three
# disease-outcome measures.
us_cross_sectional[
    ["pc_misperception", "positivity_rate", "hospitalized_pc", "deaths_pc"]
].corr()

Unnamed: 0,pc_misperception,positivity_rate,hospitalized_pc,deaths_pc
pc_misperception,1.0,0.291373,0.204244,0.291632
positivity_rate,0.291373,1.0,0.26758,0.236759
hospitalized_pc,0.204244,0.26758,1.0,0.364557
deaths_pc,0.291632,0.236759,0.364557,1.0


There is generally a positive relationship between the level of misperceptions and the severity of disease outcomes.

In [59]:
# Baseline model: positivity rate regressed on the misperception share alone.
cross_sectional_reg_1_posrate = smf.ols(
    formula='positivity_rate ~ pc_misperception',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_1_posrate.summary()

0,1,2,3
Dep. Variable:,positivity_rate,R-squared:,0.085
Model:,OLS,Adj. R-squared:,0.066
Method:,Least Squares,F-statistic:,4.453
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.0401
Time:,15:02:45,Log-Likelihood:,73.832
No. Observations:,50,AIC:,-143.7
Df Residuals:,48,BIC:,-139.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0239,0.058,-0.411,0.683,-0.141,0.093
pc_misperception,0.8763,0.415,2.110,0.040,0.041,1.711

0,1,2,3
Omnibus:,26.837,Durbin-Watson:,1.835
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45.997
Skew:,1.716,Prob(JB):,1.03e-10
Kurtosis:,6.208,Cond. No.,53.1


In [66]:
# Residuals from the income-adjusted positivity-rate model.
# The model is fitted here because, in notebook order, the name
# `cross_sectional_reg_2_posrate` is otherwise first assigned in a later cell,
# so running the notebook top to bottom on a fresh kernel raised NameError.
cross_sectional_reg_2_posrate = smf.ols(
    'positivity_rate ~ pc_misperception + Median_Household_Income_2019',
    data=us_cross_sectional,
).fit()
# Store the fitted model's residuals alongside the state rows.
us_cross_sectional["res"] = cross_sectional_reg_2_posrate.resid

In [67]:
# Rank states from the most negative to the most positive residual.
us_cross_sectional.loc[:, ["state", "res"]].sort_values(by="res")

Unnamed: 0,state,res
47,West Virginia,-0.073303
18,Maine,-0.07149
44,Vermont,-0.060082
17,Louisiana,-0.058439
30,New Mexico,-0.054073
31,New York,-0.052927
21,Michigan,-0.044995
38,Rhode Island,-0.044389
1,Alaska,-0.038509
36,Oregon,-0.037931


In [42]:
# Baseline model: hospitalisations per capita on the misperception share alone.
# Named `reg_1` to match `cross_sectional_reg_1_posrate`. Under its previous
# name (`cross_sectional_reg_2_hosp`) this fit was overwritten by the
# income-adjusted hospitalisation model assigned to the same name further down,
# so the baseline result could not be retrieved afterwards.
cross_sectional_reg_1_hosp = smf.ols('hospitalized_pc ~ pc_misperception', data=us_cross_sectional).fit()
cross_sectional_reg_1_hosp.summary()

0,1,2,3
Dep. Variable:,hospitalized_pc,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,2.09
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.155
Time:,14:52:49,Log-Likelihood:,522.52
No. Observations:,50,AIC:,-1041.0
Df Residuals:,48,BIC:,-1037.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.617e-06,7.36e-06,-0.220,0.827,-1.64e-05,1.32e-05
pc_misperception,7.604e-05,5.26e-05,1.446,0.155,-2.97e-05,0.000

0,1,2,3
Omnibus:,2.557,Durbin-Watson:,2.33
Prob(Omnibus):,0.279,Jarque-Bera (JB):,1.684
Skew:,0.218,Prob(JB):,0.431
Kurtosis:,2.214,Cond. No.,53.1


In [43]:
# Baseline model: deaths per capita on the misperception share alone.
# Named `reg_1` to match `cross_sectional_reg_1_posrate`. Under its previous
# name (`cross_sectional_reg_3_dea`) this fit was overwritten by the
# multi-regressor deaths model assigned to the same name further down, so the
# baseline result could not be retrieved afterwards.
cross_sectional_reg_1_dea = smf.ols('deaths_pc ~ pc_misperception', data=us_cross_sectional).fit()
cross_sectional_reg_1_dea.summary()

0,1,2,3
Dep. Variable:,deaths_pc,R-squared:,0.085
Model:,OLS,Adj. R-squared:,0.066
Method:,Least Squares,F-statistic:,4.462
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.0399
Time:,14:52:49,Log-Likelihood:,600.36
No. Observations:,50,AIC:,-1197.0
Df Residuals:,48,BIC:,-1193.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.258e-07,1.55e-06,0.339,0.736,-2.59e-06,3.65e-06
pc_misperception,2.343e-05,1.11e-05,2.112,0.040,1.13e-06,4.57e-05

0,1,2,3
Omnibus:,0.662,Durbin-Watson:,1.949
Prob(Omnibus):,0.718,Jarque-Bera (JB):,0.754
Skew:,0.242,Prob(JB):,0.686
Kurtosis:,2.643,Cond. No.,53.1


In [44]:
# Specification 2: positivity rate on misperception, controlling for median
# household income.
cross_sectional_reg_2_posrate = smf.ols(
    formula='positivity_rate ~ pc_misperception + Median_Household_Income_2019',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_2_posrate.summary()

0,1,2,3
Dep. Variable:,positivity_rate,R-squared:,0.19
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,5.505
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.00711
Time:,14:52:50,Log-Likelihood:,76.876
No. Observations:,50,AIC:,-147.8
Df Residuals:,47,BIC:,-142.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1636,0.094,1.741,0.088,-0.025,0.353
pc_misperception,0.4378,0.433,1.011,0.317,-0.433,1.309
Median_Household_Income_2019,-1.946e-06,7.89e-07,-2.467,0.017,-3.53e-06,-3.59e-07

0,1,2,3
Omnibus:,24.725,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.722
Skew:,1.613,Prob(JB):,2.37e-09
Kurtosis:,5.943,Cond. No.,3830000.0


In [47]:
# Specification 2: deaths per capita on misperception, controlling for median
# household income.
cross_sectional_reg_2_dea = smf.ols(
    formula='deaths_pc ~ pc_misperception + Median_Household_Income_2019',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_2_dea.summary()

0,1,2,3
Dep. Variable:,deaths_pc,R-squared:,0.085
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,2.186
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.124
Time:,14:54:28,Log-Likelihood:,600.36
No. Observations:,50,AIC:,-1195.0
Df Residuals:,47,BIC:,-1189.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.234e-07,2.67e-06,0.234,0.816,-4.74e-06,5.99e-06
pc_misperception,2.32e-05,1.23e-05,1.887,0.065,-1.53e-06,4.79e-05
Median_Household_Income_2019,-1.013e-12,2.24e-11,-0.045,0.964,-4.6e-11,4.4e-11

0,1,2,3
Omnibus:,0.706,Durbin-Watson:,1.949
Prob(Omnibus):,0.702,Jarque-Bera (JB):,0.784
Skew:,0.254,Prob(JB):,0.676
Kurtosis:,2.655,Cond. No.,3830000.0


In [51]:
# Specification 2: hospitalisations per capita on misperception, controlling
# for median household income.
cross_sectional_reg_2_hosp = smf.ols(
    formula='hospitalized_pc ~ pc_misperception + Median_Household_Income_2019',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_2_hosp.summary()

0,1,2,3
Dep. Variable:,hospitalized_pc,R-squared:,0.047
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,1.156
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.324
Time:,14:56:35,Log-Likelihood:,522.66
No. Observations:,50,AIC:,-1039.0
Df Residuals:,47,BIC:,-1034.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.761e-06,1.26e-05,-0.536,0.595,-3.21e-05,1.86e-05
pc_misperception,8.807e-05,5.81e-05,1.515,0.137,-2.89e-05,0.000
Median_Household_Income_2019,5.339e-11,1.06e-10,0.504,0.617,-1.6e-10,2.66e-10

0,1,2,3
Omnibus:,2.387,Durbin-Watson:,2.319
Prob(Omnibus):,0.303,Jarque-Bera (JB):,1.669
Skew:,0.237,Prob(JB):,0.434
Kurtosis:,2.241,Cond. No.,3830000.0


In [53]:
# Specification 3: positivity rate on misperception, controlling for median
# household income and the urbanisation rate.
cross_sectional_reg_3_posrate = smf.ols(
    formula='positivity_rate ~ pc_misperception + Median_Household_Income_2019 + Urbanization_rate',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_3_posrate.summary()

0,1,2,3
Dep. Variable:,positivity_rate,R-squared:,0.193
Model:,OLS,Adj. R-squared:,0.14
Method:,Least Squares,F-statistic:,3.66
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.019
Time:,14:57:10,Log-Likelihood:,76.965
No. Observations:,50,AIC:,-145.9
Df Residuals:,46,BIC:,-138.3
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1643,0.095,1.732,0.090,-0.027,0.355
pc_misperception,0.4047,0.444,0.911,0.367,-0.490,1.299
Median_Household_Income_2019,-2.195e-06,1.01e-06,-2.182,0.034,-4.22e-06,-1.7e-07
Urbanization_rate,0.0003,0.001,0.405,0.687,-0.001,0.002

0,1,2,3
Omnibus:,25.951,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.931
Skew:,1.682,Prob(JB):,4.76e-10
Kurtosis:,6.047,Cond. No.,3890000.0


In [56]:
# Specification 3: deaths per capita on misperception, controlling for median
# household income and the urbanisation rate.
# NOTE(review): the saved output under this cell reports a fourth regressor,
# Unemployment_rate_2019 (Df Model: 4), which this formula does not include.
# Re-run the cell or confirm which specification is intended.
cross_sectional_reg_3_dea = smf.ols(
    formula='deaths_pc ~ pc_misperception + Median_Household_Income_2019 + Urbanization_rate ',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_3_dea.summary()

0,1,2,3
Dep. Variable:,deaths_pc,R-squared:,0.176
Model:,OLS,Adj. R-squared:,0.103
Method:,Least Squares,F-statistic:,2.404
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.0635
Time:,14:58:25,Log-Likelihood:,602.98
No. Observations:,50,AIC:,-1196.0
Df Residuals:,45,BIC:,-1186.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.292e-07,2.66e-06,0.274,0.785,-4.62e-06,6.08e-06
pc_misperception,1.824e-05,1.27e-05,1.434,0.159,-7.38e-06,4.39e-05
Median_Household_Income_2019,-3.848e-11,2.79e-11,-1.378,0.175,-9.47e-11,1.78e-11
Urbanization_rate,4.109e-08,1.88e-08,2.190,0.034,3.3e-09,7.89e-08
Unemployment_rate_2019,-7.152e-10,2.98e-07,-0.002,0.998,-6.01e-07,6e-07

0,1,2,3
Omnibus:,2.012,Durbin-Watson:,1.974
Prob(Omnibus):,0.366,Jarque-Bera (JB):,1.871
Skew:,0.455,Prob(JB):,0.392
Kurtosis:,2.733,Cond. No.,4070000.0


In [55]:
# Specification 3: hospitalisations per capita on misperception, controlling
# for median household income and the urbanisation rate.
cross_sectional_reg_3_hosp = smf.ols(
    formula='hospitalized_pc ~ pc_misperception + Median_Household_Income_2019 + Urbanization_rate',
    data=us_cross_sectional,
).fit()
cross_sectional_reg_3_hosp.summary()

0,1,2,3
Dep. Variable:,hospitalized_pc,R-squared:,0.052
Model:,OLS,Adj. R-squared:,-0.01
Method:,Least Squares,F-statistic:,0.8401
Date:,"Sat, 20 Feb 2021",Prob (F-statistic):,0.479
Time:,14:57:23,Log-Likelihood:,522.79
No. Observations:,50,AIC:,-1038.0
Df Residuals:,46,BIC:,-1030.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.647e-06,1.27e-05,-0.523,0.604,-3.23e-05,1.9e-05
pc_misperception,8.263e-05,5.96e-05,1.386,0.172,-3.74e-05,0.000
Median_Household_Income_2019,1.239e-11,1.35e-10,0.092,0.927,-2.59e-10,2.84e-10
Urbanization_rate,4.496e-08,9.06e-08,0.496,0.622,-1.37e-07,2.27e-07

0,1,2,3
Omnibus:,1.812,Durbin-Watson:,2.32
Prob(Omnibus):,0.404,Jarque-Bera (JB):,1.366
Skew:,0.191,Prob(JB):,0.505
Kurtosis:,2.286,Cond. No.,3890000.0
