In [None]:
import sys
# Add the parent directory to the module search path for this process only
# (no system-wide settings are changed).
# NOTE(review): presumably needed so that `modeling` (imported below) resolves — confirm.
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# NOTE(review): presumably a random seed; it is not used in the cells visible here.
RSEED = 42
# Plotting libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # requires plotly to be installed (pip install plotly)
import plotly.graph_objects as go

In [None]:
# Load the cleaned flu-shot dataset (relative path, one directory above this notebook).
df = pd.read_csv(filepath_or_buffer='../data/Flu_Shot_Data_cleaned_2.csv')

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index that was saved with the CSV).
# Reassigning instead of `inplace=True`, together with `errors='ignore'`, keeps this
# cell idempotent: re-running it no longer raises a KeyError once the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

## Examining correlations between features 

In [None]:
# Display the pairwise Spearman (rank) correlation matrix of all columns.
df.corr('spearman')

In [None]:
# Checking out highest positive correlations

corr_matrix = df.corr(method='spearman')
# Keep every unordered column pair exactly once by masking everything except the
# strict upper triangle; this also removes the diagonal (self-correlation of 1.0).
# De-duplicating on the correlation *value* (drop_duplicates) would silently discard
# distinct pairs that happen to share the same coefficient.
pair_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
s = corr_matrix.where(pair_mask).unstack().dropna().sort_values(ascending=False)
# Show the 19 strongest positive pairs.
print(s.head(19))

### Relevant positive correlations between features (>0.5)
- doctor_recc_h1n1 AND doctor_recc_seasonal 
- behavioral_large_gatherings AND behavioral_outside_home
- opinion_seas_risk AND opinion_h1n1_risk
- opinion_h1n1_sick_from_vacc AND opinion_seas_sick_from_vacc
#### Further positive correlations (>0.35)
- opinion_h1n1_vacc_effective AND opinion_seas_vacc_effective
- opinion_seas_risk AND seasonal_vaccine 
- opinion_h1n1_risk AND h1n1_concern
- seasonal_vaccine  AND doctor_recc_seasonal
- behavioral_touch_face AND behavioral_wash_hands
- opinion_seas_risk AND opinion_seas_vacc_effective
- h1n1_concern AND opinion_h1n1_sick_from_vacc

Assumption: correlations below 0.35 can be neglected.
When deciding which features to drop, we should first consider the pairs with correlations above 0.5.

In [None]:
# Checking out highest negative correlations

corr_matrix = df.corr(method='spearman')
# Keep every unordered column pair exactly once via the strict upper triangle.
# De-duplicating on the correlation *value* (drop_duplicates) would silently discard
# distinct pairs that happen to share the same coefficient.
pair_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
s_neg = corr_matrix.where(pair_mask).unstack().dropna().sort_values(ascending=True)
# Show the 5 most negative pairs.
print(s_neg.head(5))

- The negative correlations are very small in magnitude, so we don't need to consider them.

## Plotting correlation matrices

In [None]:
# with Pandas
corr = df.corr(method='spearman')
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in pandas 2.0.
# An explicit format string gives the same two-decimal display on old and new versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# with Seaborn
fig, ax = plt.subplots(figsize=(11, 11))
# Draw on the axes created above instead of relying on the implicit "current axes".
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)
# Give the figure a title so it stands alone; the trailing `;` hides the text repr.
ax.set_title('Spearman correlation matrix');

In [None]:
# with plotly: render the correlation matrix `corr` as an interactive heatmap

fig = px.imshow(img=corr)
fig.show()

## Investigating causality between features

In [None]:
# Requires the dowhy package (pip install dowhy).
# FIXME: although dowhy is installed, this code does not work yet.
import dowhy
from dowhy import CausalModel