In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')


## Objectives of this notebook
- Implement feature importance by using SHAP approach 
- Using the results of our random forest model 
- Data that are used: cleaned data of second iteration (including all missing values)

In [2]:
df = pd.read_csv('../data/Flu_Shot_Data_cleaned_2.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   26707 non-null  int64  
 1   h1n1_vaccine                 26707 non-null  int64  
 2   seasonal_vaccine             26707 non-null  int64  
 3   h1n1_concern                 26615 non-null  float64
 4   h1n1_knowledge               26591 non-null  float64
 5   behavioral_antiviral_meds    26636 non-null  float64
 6   behavioral_avoidance         26499 non-null  float64
 7   behavioral_face_mask         26688 non-null  float64
 8   behavioral_wash_hands        26665 non-null  float64
 9   behavioral_large_gatherings  26620 non-null  float64
 10  behavioral_outside_home      26625 non-null  float64
 11  behavioral_touch_face        26579 non-null  float64
 12  doctor_recc_h1n1             24547 non-null  float64
 13  doctor_recc_seas

In [4]:
# Column 'unnamed: 0' is another index and we will drop it 
df = df.drop('Unnamed: 0', axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_vaccine                 26707 non-null  int64  
 1   seasonal_vaccine             26707 non-null  int64  
 2   h1n1_concern                 26615 non-null  float64
 3   h1n1_knowledge               26591 non-null  float64
 4   behavioral_antiviral_meds    26636 non-null  float64
 5   behavioral_avoidance         26499 non-null  float64
 6   behavioral_face_mask         26688 non-null  float64
 7   behavioral_wash_hands        26665 non-null  float64
 8   behavioral_large_gatherings  26620 non-null  float64
 9   behavioral_outside_home      26625 non-null  float64
 10  behavioral_touch_face        26579 non-null  float64
 11  doctor_recc_h1n1             24547 non-null  float64
 12  doctor_recc_seasonal         24547 non-null  float64
 13  chronic_med_cond

Convert features with string values to numerical:

In [6]:
cleanup = {"age_group": {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4,
                                  "65+ Years": 5},
            "education": {"< 12 Years": 1, "12 Years": 2, "Some College": 3, "College Graduate": 4},
            "race": {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
            "sex" : {"Female": 1, "Male": 2},
            "rent_or_own" : {"Own": 1, "Rent": 2},
            "hhs_geo_region" : {"lzgpxyit": 1, "fpwskwrf": 2, "qufhixun": 3, "bhuqouqj": 4, "oxchjgsf": 5, "kbazzjca": 6, "mlyzmhmf": 7, "atmpeygn": 8, "lrircsnp": 9, "dqpwygqj": 10},
            "census_msa" : {"MSA, Not Principle  City": 1, "MSA, Principle City": 2, "Non-MSA": 3},
            "income_poverty" : {"Below Poverty": 1, "<= $75,000, Above Poverty": 2, "> $75,000": 3},
            "employment_industry" : {"fcxhlnwr": 1, "wxleyezf": 2, "ldnlellj": 3, "pxcmvdjn": 4, "atmlpfrs": 5, "arjwrbjb": 6, "xicduogh": 7, "mfikgejo": 8, "vjjrobsf": 9,
                                    "rucpziij": 10, "xqicxuve": 11, "saaquncn": 12, "cfqqtusy": 13, "nduyfdeo": 14, "mcubkhph": 15, "wlfvacwt": 16, "dotnnunm": 17, "haxffmxo": 18, "msuufmds": 19, "phxvnwax": 20,
                                    "qnlwzans": 21},
           "employment_occupation" : {"xtkaffoo": 1, "mxkfnird": 2, "emcorrxb": 3, "cmhcxjea": 4, "xgwztkwe": 5, "hfxkjkmi": 6, "qxajmpny": 7, "xqwwgdyp": 8, "kldqjyjy": 9,
                                    "uqqtjvyb": 10, "tfqavkke": 11, "ukymxvdu": 12, "vlluhbov": 13, "oijqvulv": 14, "ccgxvspp": 15, "bxpfxfdn": 16, "haliazsg": 17, "rcertsgn": 18, "xzmlyyjv": 19, "dlvbwzss": 20,
                                    "hodpvpew": 21, "dcjcmpih": 22, "pvmttkik": 23},
           "marital_status" : {"Married": 1, "Not Married": 2},
           "employment_status" : {"Employed": 1, "Not in Labor Force": 2, "Unemployed": 3}
                                  }

In [7]:
df_num = df.replace(cleanup)

In [8]:
df_num_cols = df_num.columns

Random Forest doesn't work with null values (apparently it's an sklearn-specific thing https://datascience.stackexchange.com/questions/72764/can-random-forest-regressor-or-decision-trees-handle-missing-values-and-outliers ). Therefore, a dummy value (-999) is used (https://www.kaggle.com/questions-and-answers/60595 ):

In [9]:
imp_const = SimpleImputer(strategy='constant', fill_value=-999)

In [10]:
df_num_imp = imp_const.fit_transform(df_num)

In [11]:
df_num_imp = pd.DataFrame(df_num_imp, columns=df_num_cols)

In [12]:
df_num_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_vaccine                 26707 non-null  float64
 1   seasonal_vaccine             26707 non-null  float64
 2   h1n1_concern                 26707 non-null  float64
 3   h1n1_knowledge               26707 non-null  float64
 4   behavioral_antiviral_meds    26707 non-null  float64
 5   behavioral_avoidance         26707 non-null  float64
 6   behavioral_face_mask         26707 non-null  float64
 7   behavioral_wash_hands        26707 non-null  float64
 8   behavioral_large_gatherings  26707 non-null  float64
 9   behavioral_outside_home      26707 non-null  float64
 10  behavioral_touch_face        26707 non-null  float64
 11  doctor_recc_h1n1             26707 non-null  float64
 12  doctor_recc_seasonal         26707 non-null  float64
 13  chronic_med_cond

## Setting up the model
- Preparing data 
- Encoding the categorical variables 
- Instatiating the Random Forest model 
- sklearn preprocessing cannot be uses because it does not fit with the SHAP implementation

- First, we create dummy variables for our categorical features with on-hot encoding

## H1N1 prediction

Setting up target variable:

In [36]:
y_h1n1 = df['h1n1_vaccine'].copy()
y_h1n1 = y_h1n1.to_numpy()
y_h1n1

array([0, 0, 0, ..., 0, 0, 0])

Setting up features:

In [37]:
X = df_num_imp.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

Model:

In [38]:
rand_forst = RandomForestClassifier()

Train test split for h1n1_vaccine:

In [39]:
X_train, X_test, y_h1n1_train, y_h1n1_test = train_test_split(X, y_h1n1, stratify = y_h1n1, test_size=0.2, random_state=RSEED)

Fitting model and predicting for H1N1 vaccine:

In [40]:
rand_forst_h1n1_model = rand_forst.fit(X_train, y_h1n1_train)

In [41]:
# Predictions on train data
rand_forst_h1n1_train_pred = rand_forst_h1n1_model.predict(X_train)

# Predictions on test data
rand_forst_h1n1_test_pred = rand_forst_h1n1_model.predict(X_test)

Evaluation:

In [49]:
# H1N1 ROC train data
print(roc_auc_score(y_h1n1_train, rand_forst_h1n1_train_pred))

# H1N1 ROC test data
print(roc_auc_score(y_h1n1_test, rand_forst_h1n1_test_pred))

1.0
0.7092063887646873


In [50]:
# H1N1 ROC train data
print(accuracy_score(y_h1n1_train, rand_forst_h1n1_train_pred))

# H1N1 ROC test data
print(accuracy_score(y_h1n1_test, rand_forst_h1n1_test_pred))

1.0
0.8515537251965556


In [51]:
# H1N1 ROC train data
print(f1_score(y_h1n1_train, rand_forst_h1n1_train_pred))

# H1N1 ROC test data
print(f1_score(y_h1n1_test, rand_forst_h1n1_test_pred))

1.0
0.5692558392178164


In [52]:
# H1N1 ROC train data
print(precision_score(y_h1n1_train, rand_forst_h1n1_train_pred))

# H1N1 ROC test data
print(precision_score(y_h1n1_test, rand_forst_h1n1_test_pred))

1.0
0.7422096317280453


In [53]:
# H1N1 ROC train data
print(recall_score(y_h1n1_train, rand_forst_h1n1_train_pred))

# H1N1 ROC test data
print(recall_score(y_h1n1_test, rand_forst_h1n1_test_pred))

1.0
0.4616740088105727


**Feature importance**

In [43]:
# H1N1 feature importance

In [None]:
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance

In [45]:
perm = PermutationImportance(rand_forst_h1n1_model, random_state=RSEED).fit(X_test, y_h1n1_test)
eli5.show_weights(perm, top=None, feature_names = X_test.columns.tolist())

Weight,Feature
0.0379  ± 0.0030,doctor_recc_h1n1
0.0292  ± 0.0043,health_insurance
0.0180  ± 0.0038,opinion_h1n1_risk
0.0104  ± 0.0030,opinion_h1n1_vacc_effective
0.0060  ± 0.0026,opinion_seas_risk
0.0042  ± 0.0029,doctor_recc_seasonal
0.0034  ± 0.0008,employment_industry
0.0030  ± 0.0028,employment_occupation
0.0028  ± 0.0024,health_worker
0.0027  ± 0.0009,opinion_seas_vacc_effective


## Seasonal prediction

Setting up target variable:

In [22]:
y_seasonal = df['seasonal_vaccine'].copy()
y_seasonal = y_seasonal.to_numpy()
y_seasonal

array([0, 1, 0, ..., 1, 0, 0])

Setting up features:

In [23]:
X = df_num_imp.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

Model:

In [24]:
rand_forst = RandomForestClassifier()

Train test split for seasonal vaccine:

In [25]:
X_train, X_test, y_seas_train, y_seas_test = train_test_split(X, y_seasonal, stratify = y_seasonal, test_size=0.2, random_state=RSEED)

Fitting model and predicting for seasonal vaccine:

In [26]:
rand_forst_seas_model = rand_forst.fit(X_train, y_seas_train)

In [27]:
# Predictions on train data
rand_forst_seas_train_pred = rand_forst_seas_model.predict(X_train)

# Predictions on test data
rand_forst_seas_test_pred = rand_forst_seas_model.predict(X_test)

Evaluation:

In [28]:
# seasonal ROC train data
print(roc_auc_score(y_seas_train, rand_forst_seas_train_pred))

# seasonal ROC test data
print(roc_auc_score(y_seas_test, rand_forst_seas_test_pred))

1.0
0.7819102766962638


In [29]:
# seasonal ROC train data
print(accuracy_score(y_seas_train, rand_forst_seas_train_pred))

# seasonal ROC test data
print(accuracy_score(y_seas_test, rand_forst_seas_test_pred))

1.0
0.7847248221639835


In [46]:
# seasonal ROC train data
print(f1_score(y_seas_train, rand_forst_seas_train_pred))

# seasonal ROC test data
print(f1_score(y_seas_test, rand_forst_seas_test_pred))

1.0
0.7622001654259718


In [47]:
# seasonal ROC train data
print(precision_score(y_seas_train, rand_forst_seas_train_pred))

# seasonal ROC test data
print(precision_score(y_seas_test, rand_forst_seas_test_pred))

1.0
0.7845891868880375


In [48]:
# seasonal ROC train data
print(recall_score(y_seas_train, rand_forst_seas_train_pred))

# seasonal ROC test data
print(recall_score(y_seas_test, rand_forst_seas_test_pred))

1.0
0.7410534780860475


**Feature importance**

In [32]:
perm = PermutationImportance(rand_forst_seas_model, random_state=RSEED).fit(X_test, y_seas_test)
eli5.show_weights(perm, top=None, feature_names = X_test.columns.tolist())

Weight,Feature
0.0519  ± 0.0109,doctor_recc_seasonal
0.0502  ± 0.0103,opinion_seas_risk
0.0450  ± 0.0108,opinion_seas_vacc_effective
0.0280  ± 0.0044,age_group
0.0066  ± 0.0033,opinion_seas_sick_from_vacc
0.0055  ± 0.0023,opinion_h1n1_risk
0.0040  ± 0.0045,health_insurance
0.0030  ± 0.0026,employment_industry
0.0029  ± 0.0016,chronic_med_condition
0.0028  ± 0.0023,education
