In [None]:
# Perform test for stationarity
def adf_test(timeseries):
    """Run an Augmented Dickey-Fuller test and return the outcome as a Series.

    ``adfuller`` is called with ``autolag='AIC'``. The first four values it
    returns are labelled 'Test Statistic', 'P-value', 'Lags Used' and
    'No of Observations'; each entry of the critical-value mapping is then
    appended under the label 'Critical Value (<key>)'.
    """
    outcome = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'P-value', 'Lags Used', 'No of Observations']
    summary = pd.Series(outcome[0:4], index=labels)
    # outcome[4] maps each significance level to its critical value.
    for level, threshold in outcome[4].items():
        summary['Critical Value (%s)' % level] = threshold
    return summary

# Detrend the relative search values
def detrend(data):
  """Linearly detrend the symptom search columns of ``data``.

  The columns listed in ``covid_symptoms`` are selected, renamed to
  ``covid_symptoms_short``, missing values are replaced by 0, and a linear
  trend is removed from every column along the row axis. All symptom
  columns are detrended unconditionally; no stationarity pre-test is
  applied.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the ``covid_symptoms`` columns plus ``new_confirmed``
      and ``new_deceased``.

  Returns
  -------
  pandas.DataFrame
      Detrended symptom columns (short names) followed by the untouched
      ``new_confirmed`` and ``new_deceased`` columns, indexed like ``data``.
  """
  # Work on a copy so renaming the columns never touches the caller's frame.
  symptoms = data[covid_symptoms].copy()
  symptoms.columns = covid_symptoms_short
  symptoms = symptoms.fillna(0)

  # axis=0 removes the trend down each column (one column per symptom);
  # scipy's default axis=-1 would instead detrend across the symptoms
  # within every single row.
  detrended = signal.detrend(symptoms.to_numpy(), axis=0)

  # Keep the original index: the assignments below align on index labels,
  # so a fresh 0..n-1 index would misalign (or NaN-fill) the outcome columns
  # whenever ``data`` is a slice of a larger frame.
  adf_table = pd.DataFrame(detrended, columns=symptoms.columns, index=symptoms.index)
  adf_table['new_confirmed'] = data['new_confirmed']
  adf_table['new_deceased'] = data['new_deceased']
  return adf_table

# Calculate cross correlation
def cross_correlation(predictors):
    """Cross-correlate every column with confirmed cases and with deaths.

    For each column of ``predictors`` the cross-correlation function is
    computed against ``new_confirmed`` (first 14 values kept) and against
    ``new_deceased`` (first 22 values kept). Absolute values are taken and
    the row position is exposed as a ``lag`` column.

    Returns a 4-tuple ``(data_cases, data_deaths, corr_coeff_cases,
    corr_coeff_deaths)``: the first two hold the rows with ``lag != 0``,
    the last two hold the single ``lag == 0`` row of each table.
    """
    ccf = sm.tsa.stattools.ccf
    confirmed = predictors['new_confirmed']
    deceased = predictors['new_deceased']

    cases_by_lag = pd.DataFrame(columns=predictors.columns)
    deaths_by_lag = pd.DataFrame(columns=predictors.columns)
    for column in predictors.columns:
        candidate = predictors[[column]]
        cases_by_lag[column] = ccf(confirmed, candidate, adjusted=False)[0:14]
        deaths_by_lag[column] = ccf(deceased, candidate, adjusted=False)[0:22]

    # Same post-processing for both tables: magnitude only, lag as a column,
    # then split the zero-lag row from the lagged rows.
    split = []
    for table in (cases_by_lag, deaths_by_lag):
        table = table.abs().reset_index().rename(columns={"index": "lag"})
        zero_lag = table[table['lag'] == 0]
        lagged = table[table['lag'] != 0]
        split.append((lagged, zero_lag))

    (data_cases, corr_coeff_cases), (data_deaths, corr_coeff_deaths) = split
    return data_cases, data_deaths, corr_coeff_cases, corr_coeff_deaths

# Plot the cross correlation values
def plot_corrs(cases, deaths):
    """Draw one line chart per outcome of correlation against lag.

    Both frames are reduced to the ``lag`` column plus six symptom columns;
    each is then plotted in its own figure with ``lag`` on the x axis.
    """
    shown = ['lag', 'fever', 'sore_throat', 'cough', 'shortness_of_breath', 'pneumonia', 'shallow_breathing']
    # Select from both frames up front so a missing column fails before
    # any figure is created.
    cases, deaths = cases[shown], deaths[shown]

    panels = (
        (cases, 'Time lagged correlation: COVID Confirmed Cases Vs. Symptom Search'),
        (deaths, 'Time lagged correlation: COVID Fatalities Vs. Symptom Search'),
    )
    for frame, title in panels:
        frame.plot(x='lag', figsize=(20, 10), linewidth=3)
        plt.title(title)
        plt.xlabel('Days of Delay')
        plt.ylabel('Pearson r')
        plt.legend()

def _symptom_table(wave):
    """Return the symptom columns of ``wave`` (short names) plus both outcomes.

    The selection is copied before it is modified, so adding columns neither
    triggers pandas' SettingWithCopyWarning nor risks touching ``wave``.
    """
    table = wave[covid_symptoms].copy()
    table.columns = covid_symptoms_short
    table['new_confirmed'] = wave['new_confirmed']
    table['new_deceased'] = wave['new_deceased']
    return table


# NOTE(review): in both waves the detrended table is computed but the
# cross-correlations below run on the raw (non-detrended) values -- confirm
# which of the two is intended.

# Wave 1
detrend_data = detrend(df_wave1)
adf_table = _symptom_table(df_wave1)
df_corrs_cases, df_corrs_deaths, corr_coeff_cases, corr_coeff_deaths = cross_correlation(adf_table)
plot_corrs(df_corrs_cases, df_corrs_deaths)

# Wave 2
detrend_data = detrend(df_wave2)
adf_table = _symptom_table(df_wave2)
df_corrs_cases, df_corrs_deaths, corr_coeff_cases, corr_coeff_deaths = cross_correlation(adf_table)
plot_corrs(df_corrs_cases, df_corrs_deaths)

# Windowed time lagged cross correlation
def _windowed_lagged_corr(frame, no_splits, max_lag=22):
    """Return a (no_splits x max_lag) table of windowed lagged correlations.

    ``frame`` is cut into ``no_splits`` consecutive, non-overlapping row
    windows. Within each window ``crosscorr`` is evaluated between the
    'cases' and 'search_trends_sore_throat' columns for every lag in
    ``range(max_lag)``. One row per window, one column per lag.
    """
    n_rows = frame.shape[0]
    rows = []
    for t in range(no_splits):
        # Integer, positional bounds: .iloc is independent of the frame's
        # index labels and excludes the end point, so windows never overlap
        # and together cover every row.
        start = t * n_rows // no_splits
        stop = (t + 1) * n_rows // no_splits
        d1 = frame['cases'].iloc[start:stop]
        d2 = frame['search_trends_sore_throat'].iloc[start:stop]
        rows.append([crosscorr(d1, d2, lag) for lag in range(max_lag)])
    return pd.DataFrame(rows)


def _plot_windowed_lagged_corr(rss, max_lag=22):
    """Draw ``rss`` as a heatmap (windows on y, lag on x); return (fig, ax)."""
    f, ax = plt.subplots(figsize=(20, 5))
    sns.heatmap(rss, cmap='RdBu_r', ax=ax)
    ax.set(title='Windowed Time Lagged Cross Correlation', xlim=[0, max_lag],
           xlabel='lag offset', ylabel='Window epochs')
    ax.set_xticklabels([int(item) for item in ax.get_xticks()])
    return f, ax


# Wave 2: four windows
no_splits = 4
rss = _windowed_lagged_corr(df_wave2, no_splits)
f, ax = _plot_windowed_lagged_corr(rss)

# Wave 3: six windows
no_splits = 6
rss = _windowed_lagged_corr(df_wave3, no_splits)
f, ax = _plot_windowed_lagged_corr(rss)

In [None]:
# Difference-in-differences regression fitted with scikit-learn
from sklearn.linear_model import LinearRegression

# Regressors in the order g, t, gt; the response is the fever column.
X = df_did[['g', 't', 'gt']]
y = df_did['fever']

lr = LinearRegression().fit(X, y)
lr.coef_  # third entry (coefficient on gt) is the DID estimate; noted as 2.75 in the original run

In [None]:
# Line plot of the first (up to) 200 cough-search values with a gradient
# fill under the curve.
np.random.seed(123)  # nothing in this cell draws random numbers; kept for any later cells
y = df_main['search_trends_cough'][:200]
# Size x from y so the plot still works when fewer than 200 rows exist
# (ax.plot requires x and y to have the same length).
x = np.linspace(0, 10, len(y))
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(x, y)

# Remember the autoscaled limits; imshow below would otherwise change them.
ylim = ax.get_ylim()

# Vertical colour gradient spanning the data range, clipped to the area
# under the curve. The temporary fill_between polygon only supplies the
# clip path and is removed afterwards.
grad1 = ax.imshow(np.linspace(0, 1, 256).reshape(-1, 1), cmap='PuRd', vmin=-0.5, aspect='auto',
                  extent=[x.min(), x.max(), 0, y.max()], origin='lower')
poly_pos = ax.fill_between(x, y.min(), y, alpha=0.1)
grad1.set_clip_path(poly_pos.get_paths()[0], transform=ax.transData)
poly_pos.remove()

ax.set_ylim(ylim)
ax.axhline(0, color='black')  # horizontal baseline at y=0
plt.show()