In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')


## Objectives of this notebook
- Implement feature importance by using SHAP approach 
- Using the results of our random forest model 
- Data that are used: cleaned data of second iteration (including all missing values)

In [2]:
df = pd.read_csv('/Users/julianeberek/neuefische/TheFluShot/data/Flu_Shot_Data_cleaned_2.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   26707 non-null  int64  
 1   h1n1_vaccine                 26707 non-null  int64  
 2   seasonal_vaccine             26707 non-null  int64  
 3   h1n1_concern                 26615 non-null  float64
 4   h1n1_knowledge               26591 non-null  float64
 5   behavioral_antiviral_meds    26636 non-null  float64
 6   behavioral_avoidance         26499 non-null  float64
 7   behavioral_face_mask         26688 non-null  float64
 8   behavioral_wash_hands        26665 non-null  float64
 9   behavioral_large_gatherings  26620 non-null  float64
 10  behavioral_outside_home      26625 non-null  float64
 11  behavioral_touch_face        26579 non-null  float64
 12  doctor_recc_h1n1             24547 non-null  float64
 13  doctor_recc_seas

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,0,0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [5]:
# Column 'unnamed: 0' is another index and we will drop it 
df = df.drop('Unnamed: 0', axis=1)

## Setting up the model
- Preparing data 
- Encoding the categorical variables 
- Instatiating the Random Forest model 
- sklearn preprocessing cannot be uses because it does not fit with the SHAP implementation

### First trial: H1N1 as target variable and seasonal flu not included in features

In [8]:
# Renaming the features and the target H1N1 into X and y to make further operations easier

y = df[['h1n1_vaccine']].copy()
y = y.to_numpy()

In [9]:
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [16]:
'''rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)'''

'rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)\nrand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)'

In [17]:
'''print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))'''

'print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))\nprint("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))\nprint("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))\nprint("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))\nprint("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))'

- First, we create dummy variables for our categorical features with on-hot encoding

In [57]:
cat_features = list(X.columns)

In [55]:


encoder = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='error', drop='first'), [0, 1])]

#encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit_transform(X)

SyntaxError: invalid syntax (1574507622.py, line 4)

- for some unknown reason, the code above does not work. It runs but it does not create dummy variables. 
- need to figure out why this is not working
- using get_dummies for encoding next

In [29]:
X = pd.get_dummies(X,drop_first=True)

In [47]:
X.shape

(26707, 93)

In [31]:
# Train test split for h1n1_vaccine

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [41]:
rf_clf = RandomForestClassifier(n_estimators=100, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

In [44]:
'''rf_clf.fit(X_train, y_train)'''

# This code gives the following error: ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
# I assume that Random Forest cannot deal with missing values 
# This is strange because Random Forest model worked when using sklearn pipeline. 

'rf_clf.fit(X_train, y_train)'

In [50]:
# Quick and dirty replacement of missing values

X_test = X_test.fillna(X_test.mode(), inplace=True)
X_train = X_train.fillna(X_train.mode(), inplace=True)



AttributeError: 'NoneType' object has no attribute 'fillna'

In [53]:
X_test = np.array(X_test).reshape(1, -1)
X_train = np.array(X_train).reshape(1, -1)

array([[None]], dtype=object)

In [52]:
rf_clf.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## Implementing SHAP values

In [None]:
# Importing the module 
# Prior to that, we need to install shap in the venv: pip install shap 
# we need to add shap to our requirements  

import shap

In [None]:
# This does not work: 'Model type not yet supported by TreeExplainer: <class 'sklearn.pipeline.Pipeline'>'
# Need to set up the model without a pipeline 
'''explainer = shap.TreeExplainer(full_pipeline_rand_forst)'''