In [1]:
import sys
# Make the parent folder importable for this session only (sys.path is changed
# in-process; no system environment variables are modified). This must run
# before the `modeling.config` import below.
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# NOTE(review): presumably the random seed for later stochastic steps — not used in the cells visible here.
RSEED = 42
# Plotting libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # requires the plotly package (install with `pip install plotly`)
import plotly.graph_objects as go

In [2]:
# Read the cleaned flu-shot CSV (path is relative to the notebook's folder) into a DataFrame.
cleaned_data_csv = '../data/Flu_Shot_Data_cleaned_2.csv'
df = pd.read_csv(cleaned_data_csv)

In [5]:
# Remove the 'Unnamed: 0' column (presumably the index that was written out
# when the CSV was saved — TODO confirm).
# Reassigning instead of using inplace=True, combined with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

## Examining correlations between features 

In [6]:
# Spearman rank-correlation matrix of all numeric (and boolean) columns.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr; with string columns present
# the bare call raises instead. The explicit selection works on every version.
df.select_dtypes(include=['number', 'bool']).corr(method='spearman')

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
h1n1_vaccine,1.0,0.377143,0.122666,0.120748,0.040608,0.04769,0.070498,0.074712,0.017822,0.021768,0.071648,0.39389,0.209864,0.095207,0.066962,0.169768,0.12117,0.291602,0.30584,0.059032,0.195961,0.256199,-0.002808,0.01336,-0.002012
seasonal_vaccine,0.377143,1.0,0.154467,0.121633,0.006277,0.076395,0.050083,0.112414,0.064025,0.053509,0.120228,0.198607,0.36919,0.170174,0.012097,0.127311,0.200858,0.201753,0.219161,0.030128,0.397874,0.389286,-0.075427,-0.058912,-0.121141
h1n1_concern,0.122666,0.154467,1.0,0.058046,0.090914,0.230672,0.157175,0.291068,0.259242,0.250697,0.248786,0.152314,0.136396,0.096359,0.050583,0.034141,-0.003231,0.220374,0.386032,0.368674,0.222948,0.336218,0.225148,-0.013091,0.054138
h1n1_knowledge,0.120748,0.121633,0.058046,1.0,-0.007249,0.082288,0.033785,0.08801,-0.045385,-0.063482,0.086909,0.095197,0.073031,-0.018905,0.024308,0.174527,0.119876,0.128278,0.076609,-0.026393,0.081695,0.077751,-0.072194,0.036928,0.054768
behavioral_antiviral_meds,0.040608,0.006277,0.090914,-0.007249,1.0,0.049247,0.146261,0.064119,0.106287,0.127679,0.070868,0.051235,0.030909,0.008465,0.028788,0.009465,-0.063988,0.036185,0.091523,0.071785,0.022263,0.082382,0.076568,0.040678,0.086445
behavioral_avoidance,0.04769,0.076395,0.230672,0.082288,0.049247,1.0,0.064946,0.33813,0.227675,0.220348,0.335335,0.068145,0.074088,0.039435,-0.000414,0.00118,0.032662,0.104317,0.129188,0.142265,0.112245,0.13385,0.087983,0.026497,0.047839
behavioral_face_mask,0.070498,0.050083,0.157175,0.033785,0.146261,0.064946,1.0,0.083363,0.180907,0.163382,0.104335,0.084282,0.069481,0.068113,0.039726,0.069992,-0.040257,0.043002,0.115158,0.095864,0.049258,0.106315,0.078267,0.00884,0.005663
behavioral_wash_hands,0.074712,0.112414,0.291068,0.08801,0.064119,0.33813,0.083363,1.0,0.195364,0.192619,0.365064,0.088729,0.102044,0.03026,0.036188,0.053761,0.031919,0.131807,0.177711,0.163299,0.138282,0.175515,0.09595,0.017651,0.052621
behavioral_large_gatherings,0.017822,0.064025,0.259242,-0.045385,0.106287,0.227675,0.180907,0.195364,1.0,0.584085,0.253683,0.082242,0.093557,0.104721,0.021168,-0.032319,-0.059,0.055668,0.115254,0.179292,0.093324,0.127798,0.124246,-0.036554,-0.011233
behavioral_outside_home,0.021768,0.053509,0.250697,-0.063482,0.127679,0.220348,0.163382,0.192619,0.584085,1.0,0.267719,0.070346,0.085622,0.098858,0.018195,-0.034619,-0.061381,0.056771,0.112681,0.16643,0.083213,0.114996,0.128918,-0.033192,-0.01206


In [31]:
# Checking out highest positive correlations

# Restrict to numeric/boolean columns first: pandas >= 2.0 raises in
# DataFrame.corr when non-numeric columns are present instead of skipping them.
corr_spearman = df.select_dtypes(include=['number', 'bool']).corr(method='spearman')

# Flatten the matrix into a (column, column) -> coefficient Series, strongest first.
# drop_duplicates() compares coefficient VALUES: it collapses each mirrored
# (a, b)/(b, a) pair and all diagonal 1.0 entries into a single row.
# NOTE: two different pairs with exactly the same coefficient are collapsed as well.
s = corr_spearman.unstack().sort_values(kind="quicksort", ascending=False).drop_duplicates()

# Position 0 is the one remaining self-correlation (1.0); show the next 19 pairs.
print(s.iloc[1:20])

doctor_recc_seasonal         doctor_recc_h1n1               0.591868
behavioral_outside_home      behavioral_large_gatherings    0.584085
opinion_seas_risk            opinion_h1n1_risk              0.564522
opinion_h1n1_sick_from_vacc  opinion_seas_sick_from_vacc    0.502192
opinion_h1n1_vacc_effective  opinion_seas_vacc_effective    0.444043
seasonal_vaccine             opinion_seas_vacc_effective    0.397874
h1n1_vaccine                 doctor_recc_h1n1               0.393890
seasonal_vaccine             opinion_seas_risk              0.389286
h1n1_concern                 opinion_h1n1_risk              0.386032
h1n1_vaccine                 seasonal_vaccine               0.377143
doctor_recc_seasonal         seasonal_vaccine               0.369190
h1n1_concern                 opinion_h1n1_sick_from_vacc    0.368674
opinion_seas_vacc_effective  opinion_seas_risk              0.366353
behavioral_wash_hands        behavioral_touch_face          0.365064
behavioral_avoidance         behav

### Relevant positive correlations between features (>0.5)
- doctor_recc_h1n1 AND doctor_recc_seasonal 
- behavioral_large_gatherings AND behavioral_outside_home
- opinion_seas_risk AND opinion_h1n1_risk
- opinion_h1n1_sick_from_vacc AND opinion_seas_sick_from_vacc
#### Further positive correlations (>0.35)
- opinion_h1n1_vacc_effective AND opinion_seas_vacc_effective
- opinion_seas_vacc_effective AND seasonal_vaccine
- doctor_recc_h1n1 AND h1n1_vaccine
- opinion_seas_risk AND seasonal_vaccine 
- opinion_h1n1_risk AND h1n1_concern
- h1n1_vaccine AND seasonal_vaccine
- seasonal_vaccine  AND doctor_recc_seasonal
- behavioral_touch_face AND behavioral_wash_hands
- opinion_seas_risk AND opinion_seas_vacc_effective
- h1n1_concern AND opinion_h1n1_sick_from_vacc

Assumption: correlations below 0.35 can be neglected. 
When deciding which features to drop, we should first consider the pairs with correlations above 0.5.

In [34]:
# Checking out highest negative correlations

# Restrict to numeric/boolean columns first: pandas >= 2.0 raises in
# DataFrame.corr when non-numeric columns are present instead of skipping them.
corr_spearman = df.select_dtypes(include=['number', 'bool']).corr(method='spearman')

# Flatten the matrix into a (column, column) -> coefficient Series, most negative first.
# drop_duplicates() compares coefficient VALUES, so each mirrored (a, b)/(b, a)
# pair is reduced to a single row.
s_neg = corr_spearman.unstack().sort_values(kind="quicksort", ascending=True).drop_duplicates()

# Show the five most negative coefficients.
print(s_neg.iloc[:5])

household_children           seasonal_vaccine        -0.121141
                             chronic_med_condition   -0.116512
opinion_seas_vacc_effective  household_children      -0.085451
household_adults             chronic_med_condition   -0.076951
opinion_seas_sick_from_vacc  seasonal_vaccine        -0.075427
dtype: float64


- The negative correlations are all very weak (the strongest is only about -0.12), so we don't need to consider them.

In [38]:
# TODO: plot the highly correlated feature pairs listed above

## Investigating causality between features

In [2]:
# Requires the third-party `dowhy` package. If this cell fails with
# ModuleNotFoundError, install it into the kernel's environment first
# (e.g. `%pip install dowhy`) and re-run.
import dowhy
from dowhy import CausalModel

ModuleNotFoundError: No module named 'dowhy'