# EDA of Google Trends data

In [29]:
from linearmodels import PanelOLS
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

In [2]:
trend_month_state = pd.read_csv("data/cleaned/trend_month_state.csv", index_col=0)
trend_month_DMA = pd.read_csv("data/cleaned/trend_month_DMA.csv", index_col=0)
trend_all_time_state = pd.read_csv("data/cleaned/trend_all_time_state.csv", index_col=0)
trend_all_time_DMA = pd.read_csv("data/cleaned/trend_all_time_DMA.csv", index_col=0)

In [3]:
trend_all_time_state.replace(0, np.nan, inplace = True)
trend_month_state.replace(0, np.nan, inplace = True)

## EDA

In [4]:
trend_all_time_state.head()

Unnamed: 0,date_range,state,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab
0,2020-03-01 2021-01-31,Alabama,52.0,38.0,80.0,57.0,39.0
1,2020-03-01 2021-01-31,Alaska,90.0,94.0,49.0,77.0,46.0
2,2020-03-01 2021-01-31,Arizona,63.0,48.0,79.0,53.0,49.0
3,2020-03-01 2021-01-31,Arkansas,51.0,46.0,79.0,51.0,52.0
4,2020-03-01 2021-01-31,California,54.0,49.0,56.0,58.0,49.0


In [5]:
trend_all_time_state[
    ["covid conspiracy", "covid hoax", "pizzagate", "plandemic", "wuhan lab"]
].corr(method="pearson")

Unnamed: 0,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab
covid conspiracy,1.0,0.253764,-0.120353,0.487821,0.135147
covid hoax,0.253764,1.0,-0.112811,0.317223,0.234428
pizzagate,-0.120353,-0.112811,1.0,-0.239771,-0.300099
plandemic,0.487821,0.317223,-0.239771,1.0,0.128978
wuhan lab,0.135147,0.234428,-0.300099,0.128978,1.0


In [6]:
trend_all_time_state[["covid conspiracy", "covid hoax", "plandemic", "wuhan lab"]].corr(
    method="pearson"
)

Unnamed: 0,covid conspiracy,covid hoax,plandemic,wuhan lab
covid conspiracy,1.0,0.253764,0.487821,0.135147
covid hoax,0.253764,1.0,0.317223,0.234428
plandemic,0.487821,0.317223,1.0,0.128978
wuhan lab,0.135147,0.234428,0.128978,1.0


It turns out that popularity of COVID-related conspiracy theories is not strongly correlated with "pizzagate", which is not related to COVID-19 per se. Among all covid-19 related keywords, there is a positive, yet not necessarily strong correlation. 

### Principle component analyses

In [7]:
pca_pipe =  Pipeline([('inputer', SimpleImputer()), ('PCA', PCA(n_components=2))])
principalComponents = pca_pipe.fit_transform(
    trend_all_time_state[["covid conspiracy", "covid hoax", "plandemic", "wuhan lab"]]
)
principalDf = pd.DataFrame(data=principalComponents)

In [8]:
pca_pipe["PCA"].explained_variance_ratio_

array([0.46055991, 0.23254269])

In [9]:
pca_pipe["PCA"].components_

array([[ 0.62681881,  0.47529325,  0.54963197,  0.28124579],
       [-0.43683202,  0.43500537, -0.25854643,  0.74370818]])

## Panel analyses

## Develop an index

### with "Wuhan Lab"

In [10]:
trend_all_time_state["index_1"] = trend_all_time_state[["covid hoax", "covid conspiracy", "plandemic", "wuhan lab"]].mean(axis = 1)

### without "Wuhan Lab"

In [11]:
trend_all_time_state["index_2"] = trend_all_time_state[["covid hoax", "covid conspiracy", "plandemic"]].mean(axis = 1)

In [12]:
trend_all_time_state.sort_values("index_2", ascending = False)

Unnamed: 0,date_range,state,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab,index_1,index_2
1,2020-03-01 2021-01-31,Alaska,90.0,94.0,49.0,77.0,46.0,76.75,87.0
29,2020-03-01 2021-01-31,New Hampshire,92.0,78.0,55.0,65.0,91.0,81.5,78.333333
45,2020-03-01 2021-01-31,Vermont,55.0,100.0,48.0,79.0,100.0,83.5,78.0
23,2020-03-01 2021-01-31,Minnesota,82.0,50.0,57.0,94.0,52.0,69.5,75.333333
34,2020-03-01 2021-01-31,North Dakota,100.0,39.0,54.0,85.0,38.0,65.5,74.666667
27,2020-03-01 2021-01-31,Nebraska,78.0,58.0,72.0,81.0,39.0,64.0,72.333333
50,2020-03-01 2021-01-31,Wyoming,71.0,,61.0,72.0,32.0,58.333333,71.5
37,2020-03-01 2021-01-31,Oregon,70.0,65.0,67.0,75.0,43.0,63.25,70.0
5,2020-03-01 2021-01-31,Colorado,77.0,67.0,63.0,66.0,35.0,61.25,70.0
12,2020-03-01 2021-01-31,Idaho,53.0,89.0,69.0,66.0,42.0,62.5,69.333333


## Regression analyses

### Relationship between conspiracy theory popularity and compliance (mobility)

In [13]:
us_daily = pd.read_csv("data/cleaned/daily_df.csv", index_col=0)
us_daily["date"] = pd.to_datetime(us_daily["date"])
us_daily = us_daily.set_index("date")

In [14]:
us_monthly = us_daily.groupby([pd.Grouper(freq='M'), 'state']).mean()
us_monthly

Unnamed: 0_level_0,Unnamed: 1_level_0,positive,probableCases,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,ConfirmedCases,ConfirmedDeaths,StringencyIndex,GovernmentResponseIndex,ContainmentHealthIndex
date,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-01-31,Florida,0.000000,,,,2.000000e+00,,,,,,...,0.0,,0.0,,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2020-01-31,Massachusetts,,,,,2.700000e+00,,,,,,...,0.6,1.0,0.0,,0.0,0.300000,0.000000,2.220000,1.332000,1.536000
2020-01-31,Virginia,,,,,,,,,,,...,0.0,,0.0,,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2020-01-31,Washington,1.333333,,,,0.000000e+00,,,,,,...,0.0,,0.0,,0.0,1.000000,0.000000,0.000000,3.394211,3.913684
2020-02-29,Florida,0.000000,,,,1.024138e+01,,,,,,...,0.0,,0.0,,0.0,0.000000,0.000000,0.958621,0.574138,0.663793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-28,Virginia,533521.941176,108944.411765,,251.411765,5.468171e+06,2196.411765,22329.588235,446.235294,,280.117647,...,2.0,1.0,0.0,,0.0,533521.941176,6835.823529,53.373529,53.283529,57.631176
2021-02-28,Washington,321918.647059,15809.117647,,,4.761709e+06,740.529412,18326.117647,175.411765,,69.176471,...,1.0,0.0,1.0,0.0,2.0,323048.647059,4541.058824,62.960000,63.220000,61.410000
2021-02-28,West Virginia,125328.235294,24920.941176,1.907528e+06,,2.032857e+06,373.470588,,100.058824,,47.000000,...,1.0,0.0,1.0,1.0,2.0,125328.235294,2144.235294,46.300000,50.110000,57.820000
2021-02-28,Wisconsin,602149.941176,51173.529412,2.543102e+06,38.176471,6.377024e+06,531.882353,24978.058824,137.176471,2206.823529,,...,1.0,1.0,1.0,1.0,0.0,602149.941176,6648.176471,47.690000,54.112000,58.588000


In [15]:
us_monthly = pd.DataFrame(us_monthly.to_records())

In [16]:
trend_month_state["date"] = pd.to_datetime(trend_month_state["date_range"].str.slice(start = -10))

In [17]:
trend_month_state = trend_month_state.drop("date_range", axis = 1)

In [18]:
trend_month_state

Unnamed: 0,state,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab,date
0,Alabama,29.0,10.0,76.0,,41.0,2020-03-31
1,Alaska,73.0,80.0,97.0,,44.0,2020-03-31
2,Arizona,34.0,25.0,60.0,,24.0,2020-03-31
3,Arkansas,100.0,,11.0,,50.0,2020-03-31
4,California,41.0,18.0,44.0,,29.0,2020-03-31
...,...,...,...,...,...,...,...
556,Virginia,19.0,43.0,10.0,5.0,14.0,2021-01-31
557,Washington,23.0,,12.0,,52.0,2021-01-31
558,West Virginia,,,35.0,,,2021-01-31
559,Wisconsin,11.0,25.0,45.0,8.0,,2021-01-31


In [19]:
us_monthly = pd.merge(trend_month_state, us_monthly, on = ["state", "date"], how = "left")

In [20]:
us_monthly = us_monthly.set_index("date")

#### Cross-sectional

In [21]:
us_cross_sectional = us_daily.groupby("state").mean()

In [22]:
us_cross_sectional.head()

Unnamed: 0_level_0,positive,probableCases,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,...,C6_Stay at home requirements,C6_Flag,C7_Restrictions on internal movement,C7_Flag,C8_International travel controls,ConfirmedCases,ConfirmedDeaths,StringencyIndex,GovernmentResponseIndex,ContainmentHealthIndex
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,152878.247126,32034.665399,812813.5,33.0,934423.2,1184.939577,16614.2125,,1531.476038,,...,1.057803,0.966565,0.0,,0.0,153084.663793,2474.939655,35.718439,40.565723,42.81604
Alaska,15237.02071,,,8.5,510032.2,56.078431,389.485549,,,7.664151,...,0.758621,0.116883,1.706897,0.92638,1.936782,15512.859599,75.020057,53.132787,52.47181,53.053736
American Samoa,0.0,,1253.12,10.5,1197.673,,,,,,...,,,,,,,,,,
Arizona,230197.700855,12375.980159,1094707.0,44.24,2228371.0,1982.344051,18215.883191,531.459807,,347.585209,...,0.643678,1.0,0.212644,1.0,0.212644,230206.666667,4653.578348,42.127989,47.673678,46.597615
Arkansas,91312.756447,24916.214286,839528.9,47.588235,918856.8,538.387387,5427.229102,308.421384,43.0,100.00303,...,0.282421,1.0,0.236311,1.0,0.236311,91338.467049,1449.977077,44.070519,45.216657,47.682075


In [23]:
trend_all_time_state.drop("date_range", axis = 1)

Unnamed: 0,state,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab,index_1,index_2
0,Alabama,52.0,38.0,80.0,57.0,39.0,46.5,49.0
1,Alaska,90.0,94.0,49.0,77.0,46.0,76.75,87.0
2,Arizona,63.0,48.0,79.0,53.0,49.0,53.25,54.666667
3,Arkansas,51.0,46.0,79.0,51.0,52.0,50.0,49.333333
4,California,54.0,49.0,56.0,58.0,49.0,52.5,53.666667
5,Colorado,77.0,67.0,63.0,66.0,35.0,61.25,70.0
6,Connecticut,39.0,49.0,48.0,63.0,64.0,53.75,50.333333
7,Delaware,17.0,60.0,78.0,28.0,60.0,41.25,35.0
8,District of Columbia,60.0,45.0,28.0,85.0,79.0,67.25,63.333333
9,Florida,50.0,39.0,59.0,45.0,44.0,44.5,44.666667


In [24]:
us_cross_sectional = pd.merge(us_cross_sectional, trend_all_time_state.drop("date_range", axis = 1), on="state")

In [25]:
us_cross_sectional[
    [
        "covid conspiracy",
        "covid hoax",
        "pizzagate",
        "plandemic",
        "wuhan lab",
        "index_1",
        "index_2",
        "retail_and_recreation_percent_change_from_baseline",
        "grocery_and_pharmacy_percent_change_from_baseline",
        "parks_percent_change_from_baseline",
        "transit_stations_percent_change_from_baseline",
        "workplaces_percent_change_from_baseline",
        "residential_percent_change_from_baseline",
    ]
].corr()

Unnamed: 0,covid conspiracy,covid hoax,pizzagate,plandemic,wuhan lab,index_1,index_2,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
covid conspiracy,1.0,0.253764,-0.120353,0.487821,0.135147,0.7216,0.784104,-0.01255,-0.022287,0.179848,-0.053732,0.069281,-0.004129
covid hoax,0.253764,1.0,-0.112811,0.317223,0.234428,0.673981,0.691606,0.222925,0.241081,0.29888,0.119585,0.125759,-0.126679
pizzagate,-0.120353,-0.112811,1.0,-0.239771,-0.300099,-0.286249,-0.20842,0.40476,0.307774,-0.000874,0.404453,0.366606,-0.48548
plandemic,0.487821,0.317223,-0.239771,1.0,0.128978,0.724384,0.787842,0.011192,0.214272,0.443996,0.063424,0.05799,-0.038358
wuhan lab,0.135147,0.234428,-0.300099,0.128978,1.0,0.544144,0.205043,-0.329921,-0.395268,0.020604,-0.454625,-0.403559,0.430357
index_1,0.7216,0.673981,-0.286249,0.724384,0.544144,1.0,0.931693,-0.033854,0.019215,0.354191,-0.113211,-0.046036,0.088378
index_2,0.784104,0.691606,-0.20842,0.787842,0.205043,0.931693,1.0,0.107221,0.199177,0.40852,0.073439,0.126204,-0.088919
retail_and_recreation_percent_change_from_baseline,-0.01255,0.222925,0.40476,0.011192,-0.329921,-0.033854,0.107221,1.0,0.8744,0.622192,0.859456,0.919312,-0.915298
grocery_and_pharmacy_percent_change_from_baseline,-0.022287,0.241081,0.307774,0.214272,-0.395268,0.019215,0.199177,0.8744,1.0,0.686546,0.845084,0.826631,-0.835964
parks_percent_change_from_baseline,0.179848,0.29888,-0.000874,0.443996,0.020604,0.354191,0.40852,0.622192,0.686546,1.0,0.489512,0.537091,-0.47314


In [26]:
us_cross_sectional.columns

Index(['state', 'positive', 'probableCases', 'negative', 'pending',
       'totalTestResults', 'hospitalizedCurrently', 'hospitalizedCumulative',
       'inIcuCurrently', 'inIcuCumulative', 'onVentilatorCurrently',
       'onVentilatorCumulative', 'recovered', 'totalTestsViral',
       'positiveTestsViral', 'negativeTestsViral', 'positiveCasesViral',
       'deathConfirmed', 'deathProbable', 'totalTestEncountersViral',
       'totalTestsPeopleViral', 'totalTestsAntibody', 'positiveTestsAntibody',
       'negativeTestsAntibody', 'totalTestsPeopleAntibody',
       'positiveTestsPeopleAntibody', 'negativeTestsPeopleAntibody',
       'totalTestsPeopleAntigen', 'positiveTestsPeopleAntigen',
       'totalTestsAntigen', 'positiveTestsAntigen', 'fips', 'positiveIncrease',
       'negativeIncrease', 'total', 'totalTestResultsIncrease', 'posNeg',
       'dataQualityGrade', 'deathIncrease', 'hospitalizedIncrease',
       'commercialScore', 'negativeRegularScore', 'negativeScore',
       'positive

In [28]:
X = us_cross_sectional[["covid conspiracy",
        "covid hoax",
        "pizzagate",
        "plandemic",
        "wuhan lab",
        "index_1",
        "index_2",
    ]]

In [32]:
results = smf.ols('un_residential_percent_change_from_baseline ~ plandemic + `covid hoax`', data=us_cross_sectional).fit()

PatsyError: error tokenizing input (maybe an unclosed string?)
    un_residential_percent_change_from_baseline ~ plandemic + `covid hoax`
                                                             ^