In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')


## Objectives of this notebook
- Implement feature importance by using SHAP approach 
- Using the results of our random forest model 
- Data that are used: cleaned data of second iteration (including all missing values)

In [2]:
df = pd.read_csv('../data/Flu_Shot_Data_cleaned_1.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   21853 non-null  int64  
 1   h1n1_vaccine                 21853 non-null  int64  
 2   seasonal_vaccine             21853 non-null  int64  
 3   h1n1_concern                 21853 non-null  float64
 4   h1n1_knowledge               21853 non-null  float64
 5   behavioral_antiviral_meds    21853 non-null  float64
 6   behavioral_avoidance         21853 non-null  float64
 7   behavioral_face_mask         21853 non-null  float64
 8   behavioral_wash_hands        21853 non-null  float64
 9   behavioral_large_gatherings  21853 non-null  float64
 10  behavioral_outside_home      21853 non-null  float64
 11  behavioral_touch_face        21853 non-null  float64
 12  doctor_recc_h1n1             21853 non-null  float64
 13  doctor_recc_seas

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,...,opinion_seas_sick_from_vacc,age_group,education,race,sex,rent_or_own,hhs_geo_region,census_msa,household_adults,household_children
0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,55 - 64 Years,< 12 Years,White,Female,Own,oxchjgsf,Non-MSA,0.0,0.0
1,1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,...,4.0,35 - 44 Years,12 Years,White,Male,Rent,bhuqouqj,"MSA, Not Principle City",0.0,0.0
2,2,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,65+ Years,12 Years,White,Female,Rent,lrircsnp,"MSA, Principle City",0.0,0.0
3,3,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,...,4.0,45 - 54 Years,Some College,White,Female,Own,qufhixun,"MSA, Not Principle City",1.0,0.0
4,4,0,0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,...,4.0,65+ Years,12 Years,White,Male,Own,atmpeygn,"MSA, Principle City",2.0,3.0


In [5]:
# Column 'unnamed: 0' is another index and we will drop it 
df = df.drop('Unnamed: 0', axis=1)

## Setting up the model
- Preparing data 
- Encoding the categorical variables 
- Instatiating the Random Forest model 
- sklearn preprocessing cannot be uses because it does not fit with the SHAP implementation

### First trial: H1N1 as target variable and seasonal flu not included in features

In [6]:
# Renaming the features and the target H1N1 into X and y to make further operations easier

y = df[['h1n1_vaccine']].copy()
y = y.to_numpy()

In [7]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

- First, we create dummy variables for our categorical features with on-hot encoding

In [8]:
cat_features = list(X.columns)

In [9]:
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(X)
#encoder = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='error', drop='first'), [0, 1])]
#encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X)

OneHotEncoder(drop='first')

In [10]:
# one hot encoding does not encode floats. So float columns need to be transformed to strings upfront

In [11]:
floats = df.select_dtypes(include=['float64'])
floats

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,child_under_6_months,health_worker,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
3,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
4,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,5.0,2.0,1.0,5.0,4.0,4.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21848,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,4.0,2.0,4.0,4.0,2.0,4.0,3.0,0.0
21849,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
21850,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
21851,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0


In [16]:
floats_convert = floats.astype(str)
floats_convert

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,child_under_6_months,health_worker,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
3,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
4,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,5.0,2.0,1.0,5.0,4.0,4.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21848,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,4.0,2.0,4.0,4.0,2.0,4.0,3.0,0.0
21849,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
21850,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
21851,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0


In [17]:
df = df.drop(floats, axis=1)


AttributeError: 'NoneType' object has no attribute 'drop'

In [14]:
df.info()

AttributeError: 'NoneType' object has no attribute 'info'

In [None]:
X

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_dqpwygqj,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Principle City",census_msa_Non-MSA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,1
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21848,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,0,0,1,0
21849,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
21850,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,1,0,0,0,1,0
21851,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0


- for some unknown reason, the code above does not work. It runs but it does not create dummy variables. 
- need to figure out why this is not working
- using get_dummies for encoding next

In [None]:
cat_features = X

In [None]:
X = pd.get_dummies(cat_features,drop_first=True)

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 45 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   h1n1_concern                    21853 non-null  float64
 1   h1n1_knowledge                  21853 non-null  float64
 2   behavioral_antiviral_meds       21853 non-null  float64
 3   behavioral_avoidance            21853 non-null  float64
 4   behavioral_face_mask            21853 non-null  float64
 5   behavioral_wash_hands           21853 non-null  float64
 6   behavioral_large_gatherings     21853 non-null  float64
 7   behavioral_outside_home         21853 non-null  float64
 8   behavioral_touch_face           21853 non-null  float64
 9   doctor_recc_h1n1                21853 non-null  float64
 10  doctor_recc_seasonal            21853 non-null  float64
 11  chronic_med_condition           21853 non-null  float64
 12  child_under_6_months            

In [None]:
# Train test split for h1n1_vaccine

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [None]:
rf_clf = RandomForestClassifier(n_estimators=100, 
                               random_state=RSEED, 
                               max_features = 'auto',
                               n_jobs=-1, verbose = 1)

In [None]:
'''rf_clf.fit(X_train, y_train)'''

# This code gives the following error: ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
# I assume that Random Forest cannot deal with missing values 
# This is strange because Random Forest model worked when using sklearn pipeline. 

'rf_clf.fit(X_train, y_train)'

In [None]:
# Quick and dirty replacement of missing values

X_test = X_test.fillna(X_test.mode(), inplace=True)
X_train = X_train.fillna(X_train.mode(), inplace=True)



AttributeError: 'NoneType' object has no attribute 'fillna'

In [None]:
X_test = np.array(X_test).reshape(1, -1)
X_train = np.array(X_train).reshape(1, -1)

array([[None]], dtype=object)

In [None]:
rf_clf.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## Implementing SHAP values

In [None]:
# Importing the module 
# Prior to that, we need to install shap in the venv: pip install shap 
# we need to add shap to our requirements  

import shap

In [None]:
# This does not work: 'Model type not yet supported by TreeExplainer: <class 'sklearn.pipeline.Pipeline'>'
# Need to set up the model without a pipeline 
'''explainer = shap.TreeExplainer(full_pipeline_rand_forst)'''