In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression



warnings.filterwarnings('ignore')


In [2]:
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [3]:
df_features.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [5]:
# Get info for the target
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [6]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [7]:
# We are concatenating both dataframes into one 

df = pd.merge(df_target, df_features, on=['respondent_id'])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_vaccine                 26707 non-null  int64  
 2   seasonal_vaccine             26707 non-null  int64  
 3   h1n1_concern                 26615 non-null  float64
 4   h1n1_knowledge               26591 non-null  float64
 5   behavioral_antiviral_meds    26636 non-null  float64
 6   behavioral_avoidance         26499 non-null  float64
 7   behavioral_face_mask         26688 non-null  float64
 8   behavioral_wash_hands        26665 non-null  float64
 9   behavioral_large_gatherings  26620 non-null  float64
 10  behavioral_outside_home      26625 non-null  float64
 11  behavioral_touch_face        26579 non-null  float64
 12  doctor_recc_h1n1             24547 non-null  float64
 13  doctor_recc_seas

We will drop the following columns for our first iteration:    
- health_insurance, employment_industry, employment_occupation, income_poverty, marital_status, employment_status

# Data cleaning

In [9]:
col_drop = ['health_insurance', 'employment_industry', 'employment_occupation', 'income_poverty', 'marital_status', 'employment_status']

df.drop(col_drop, axis=1, inplace=True)

In [10]:
# dropping rows containing 0 for ['doctor_recc_h1n1', 'doctor_recc_seasonal']

df.dropna(inplace=True)


In [11]:
df.isnull().sum(axis = 0)

respondent_id                  0
h1n1_vaccine                   0
seasonal_vaccine               0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
rent_or_own                    0
hhs_geo_region                 0
census_msa                     0
household_

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21853 entries, 0 to 26706
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                21853 non-null  int64  
 1   h1n1_vaccine                 21853 non-null  int64  
 2   seasonal_vaccine             21853 non-null  int64  
 3   h1n1_concern                 21853 non-null  float64
 4   h1n1_knowledge               21853 non-null  float64
 5   behavioral_antiviral_meds    21853 non-null  float64
 6   behavioral_avoidance         21853 non-null  float64
 7   behavioral_face_mask         21853 non-null  float64
 8   behavioral_wash_hands        21853 non-null  float64
 9   behavioral_large_gatherings  21853 non-null  float64
 10  behavioral_outside_home      21853 non-null  float64
 11  behavioral_touch_face        21853 non-null  float64
 12  doctor_recc_h1n1             21853 non-null  float64
 13  doctor_recc_seas

In [13]:
df.reset_index(inplace=True)

We dropped all rows with missing values
- Maybe later on, we will want to refine this approach.

In [14]:
 # We are looking for unique values in order to identify whether we have duplicates

df['respondent_id'].nunique()

21853

All values are unique, no duplicates. 

## EDA

In [15]:
# checking for balance in data

print(df.h1n1_vaccine.value_counts())
print(df.seasonal_vaccine.value_counts())

0    16906
1     4947
Name: h1n1_vaccine, dtype: int64
0    11371
1    10482
Name: seasonal_vaccine, dtype: int64


The H1N1 vaccine outcome appears to be unbalanced (almost 17k vs 5k) (read lit--what is considered unbalanced?)
The seasonal vaccine outcome appears to be fairly balanced

We may want to deal with the lack of balance later

In [16]:
# Understanding observations and rows

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        21853 non-null  int64  
 1   respondent_id                21853 non-null  int64  
 2   h1n1_vaccine                 21853 non-null  int64  
 3   seasonal_vaccine             21853 non-null  int64  
 4   h1n1_concern                 21853 non-null  float64
 5   h1n1_knowledge               21853 non-null  float64
 6   behavioral_antiviral_meds    21853 non-null  float64
 7   behavioral_avoidance         21853 non-null  float64
 8   behavioral_face_mask         21853 non-null  float64
 9   behavioral_wash_hands        21853 non-null  float64
 10  behavioral_large_gatherings  21853 non-null  float64
 11  behavioral_outside_home      21853 non-null  float64
 12  behavioral_touch_face        21853 non-null  float64
 13  doctor_recc_h1n1

In [17]:
#columns to drop because they're little use
col_drop = ['index', 'respondent_id']

df.drop(col_drop, axis=1, inplace=True)

- our target variables are h1n1_vaccine and seasonal_vaccine
- at the moment we are working with 28 feature variables--all categorical (refer to challenge documentation for description; we will need to transfer this info to the README)
- seven of the variables are strings--we will convert these to numeric encoding so we can look at correlations in Profiler
household_adults and household_children are 'top-coded' up to 3--that means that household with 3+ adults (or children) will fall into the '3' group
- 'hhs_geo_region' is an anonymised string
- we should remember that the current column names (which would be used as labels in the graphs) are not really human-readable--we need to keep this in mind when we're making plots (either rename the columns beforehand, or include a plotting command to change the labels)

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

In [18]:
# conversion of string variables to numeric
#a separate dataframe is made for the Profiler; the original dataframe will be retained for one-hot encoding (so the column headings we get during one-hot encoding remain meaningful)
df["age_group"].value_counts()

65+ Years        5393
55 - 64 Years    4655
45 - 54 Years    4390
18 - 34 Years    4277
35 - 44 Years    3138
Name: age_group, dtype: int64

In [19]:
df["education"].value_counts()

College Graduate    8839
Some College        6123
12 Years            4963
< 12 Years          1928
Name: education, dtype: int64

In [20]:
df["race"].value_counts()

White                17485
Black                 1670
Hispanic              1428
Other or Multiple     1270
Name: race, dtype: int64

In [21]:
df["sex"].value_counts()

Female    13105
Male       8748
Name: sex, dtype: int64

In [22]:
df["rent_or_own"].value_counts()

Own     16647
Rent     5206
Name: rent_or_own, dtype: int64

In [23]:
df["hhs_geo_region"].value_counts()

lzgpxyit    3481
fpwskwrf    2626
qufhixun    2588
bhuqouqj    2373
oxchjgsf    2356
kbazzjca    2299
mlyzmhmf    1832
atmpeygn    1694
lrircsnp    1686
dqpwygqj     918
Name: hhs_geo_region, dtype: int64

In [24]:
df["census_msa"].value_counts()

MSA, Not Principle  City    9558
MSA, Principle City         6362
Non-MSA                     5933
Name: census_msa, dtype: int64

In [25]:
cleanup = {"age_group": {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4,
                                  "65+ Years": 5},
            "education": {"< 12 Years": 1, "12 Years": 2, "Some College": 3, "College Graduate": 4},
            "race": {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
            "sex" : {"Female": 1, "Male": 2},
            "rent_or_own" : {"Own": 1, "Rent": 2},
            "hhs_geo_region" : {"lzgpxyit": 1, "fpwskwrf": 2, "qufhixun": 3, "bhuqouqj": 4, "oxchjgsf": 5, "kbazzjca": 6, "mlyzmhmf": 7, "atmpeygn": 8, "lrircsnp": 9, "dqpwygqj": 10},
            "census_msa" : {"MSA, Not Principle  City": 1, "MSA, Principle City": 2, "Non-MSA": 3}
                                  }

In [26]:
df_for_profiler = df.replace(cleanup)
df_for_profiler.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,opinion_seas_sick_from_vacc,age_group,education,race,sex,rent_or_own,hhs_geo_region,census_msa,household_adults,household_children
0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,4,1,1,1,1,5,3,0.0,0.0
1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,...,4.0,2,2,1,2,2,4,1,0.0,0.0
2,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,5,2,1,1,2,9,2,0.0,0.0
3,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,4.0,3,3,1,1,1,3,1,1.0,0.0
4,0,0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,4.0,5,2,1,2,1,8,2,2.0,3.0


In [27]:
# run Profiler to explore the data

#import Profiler
#before opening VS Code, run this command in the terminal: pip install pandas-profiling==2.11.0
from pandas_profiling import ProfileReport

In [28]:
profile = ProfileReport(df_for_profiler, title="Pandas Profiling Report", explorative=True)

In [29]:
#profile

**Possible multicollinearity (between features)**
- behavioral_large_gatherings vs behavioral_outside_home
- doctor_recc_h1n1 vs doctor_recc_seasonal
- opinion_h1n1_risk vs opinion_seas_risk
- household_children vs age_group

**Possible outliers and features to be aware of**
- behavioral_antiviral_meds : very unbalanced between categories; the people taking anviral meds could have something else going on (e.g. already sick, or worried about getting flu and taking meds profilactically)--be careful about this variable
- behavioral_face_mask
- behavioral_wash_hands
-child_under_6_months
- health_worker
- opinion_h1n1_vacc_effective and opinion_seasonal_vacc_effective (1.0 group)
- race (not many non-white respondents)
- household_adults and household_children (3.0 groups pretty small)

## Creating Pipelines

In [30]:
# Pipeline for categorical features 
cat_pipeline = Pipeline([

    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

In [31]:
cat_features = list(df.columns)



In [32]:
cat_features.remove('h1n1_vaccine')

In [33]:
cat_features.remove('seasonal_vaccine')

In [34]:
X = df


In [35]:
#y = pd.DataFrame([df.pop(x) for x in ['h1n1_vaccine', 'seasonal_vaccine']]).T

y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [36]:
y = y.to_numpy()

In [37]:
X = df

In [38]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [39]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()#(penalty="l2", C=1)
)


In [40]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [41]:

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [42]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (17482, 31)
X_test shape: (4371, 31)
y_train shape: (17482, 2)
y_test shape: (4371, 2)


In [43]:
full_pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                               

In [44]:
# Figure it out later


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

In [45]:
preds = full_pipeline.predict(X_test)


In [46]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], preds[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], preds[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], preds[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], preds[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], preds[:, 0])))

Accuracy: 0.83
Recall: 0.45
Precision: 0.67
F1: 0.54
ROC: 0.69


In [47]:
# Evaluation Metrices for Seasonal Flu Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], preds[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], preds[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], preds[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], preds[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], preds[:, 1])))

Accuracy: 0.78
Recall: 0.74
Precision: 0.78
F1: 0.76
ROC: 0.78


In [48]:
y_test[:, 0]

array([1, 0, 0, ..., 0, 0, 0])

In [49]:
y_test

array([[1, 1],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 1],
       [0, 0]])

ROC is chosen for the following reasons:
1. curve consideres both -ves and +ves 
2. AUC_score tells how well model distinquishes between -ves and +ves 
3. Both outcomes are valuable because there is nor preference for either
4. For further reading https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc


The goal for the AUC_score is pegged at 0.8 based of the challenge data and the 
benchmarks reached i the competition https://www.researchgate.net/post/What-is-the-value-of-the-area-under-the-roc-curve-AUC-to-conclude-that-a-classifier-is-excellent

## BASELINE MODEL RESULTS

In [50]:
dummy_classifier = DummyClassifier()
dummy_classifier.fit(X_train, y_train)


DummyClassifier()

In [51]:
dummy_train_pred = dummy_classifier.predict(X_train)
dummy_test_pred = dummy_classifier.predict(X_test)

In [52]:
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], dummy_test_pred[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], dummy_test_pred[:, 1])))

ROC: 0.50
ROC: 0.50


In [53]:
'''# Feature Importance


importance = full_pipeline.coef_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))'''


"# Feature Importance\n\n\nimportance = full_pipeline.coef_\n# summarize feature importance\nfor i,v in enumerate(importance):\n\tprint('Feature: %0d, Score: %.5f' % (i,v))"

In [54]:
from sklearn.feature_selection import RFE

In [55]:

feat_impts = [] 
for clf in full_pipeline.steps[1][1].estimators_:
    print(clf.coef_)

      

    #for i,v in enumerate(clf.coef_[0]):
    #    print(i,v)
#grid_fit.best_estimator_.feature_importances_})
#print(feat_impts)
#importance = np.mean(feat_impts, axis=0)

#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))

[[ 0.15238877  0.04014272 -0.02504355 -0.26575961 -0.13198219 -0.06149412
   0.09520464 -0.12861583  0.03034416 -0.0186226  -0.07964907 -0.13157711
   0.03330544 -0.05037398 -0.0478977   0.07092731 -0.16919898 -0.02505534
  -0.07321634 -0.06499694 -0.03327473 -1.0509715   0.95269983  0.21322897
  -0.31150065 -0.13978867  0.041517   -0.20180176  0.10353009 -0.48655959
   0.38828792 -0.72277202 -0.71244819 -0.18567147  0.32065568  1.20196433
  -0.83772063 -0.35810137 -0.04199662  0.39464115  0.7449058   0.16852666
  -0.08349717 -0.33736351  0.09248614  0.06157621 -0.10579033 -0.30197709
   0.35775056 -0.08901909  0.04076428 -0.54102217 -0.06782056  0.12441883
   0.14783574  0.23831649  0.25232121  0.13510332 -0.47912369  0.12986834
  -0.13644086 -0.20927567 -0.20949808 -0.15096788  0.16388179  0.30758817
  -0.01383501 -0.24250535  0.15231828  0.0057504  -0.28178467 -0.07816596
   0.19293274  0.06874622 -0.11300255  0.01473088 -0.05779951 -0.04047216
   0.05826039 -0.06203718 -0.1756088  

Feature importance extracted using ELI5
https://towardsdatascience.com/extracting-feature-importances-from-scikit-learn-pipelines-18c79b4ae09a

In [56]:
#pip install eli5 in external terminal
import eli5

In [57]:
full_pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('cat',
                                   Pipeline(steps=[('1hot',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['h1n1_concern', 'h1n1_knowledge',
                                    'behavioral_antiviral_meds',
                                    'behavioral_avoidance',
                                    'behavioral_face_mask',
                                    'behavioral_wash_hands',
                                    'behavioral_large_gatherings',
                                    'behavioral_outside_home',
                                    'behavioral_touch_face', 'doctor_recc_h1n1',
                                    'doctor_recc_se...l',
                                    'chronic_med_condition',
                                    'child_under_6_months', 'health_worker',
                                    'opinion_h1n1_vacc_effecti

In [58]:
onehot_columns = list(full_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['1hot'].get_feature_names(input_features=cat_features))
#numeric_features_list = list(numeric_features)
#numeric_features_list.extend(onehot_columns)

In [59]:
for clf in full_pipeline.steps[1][1].estimators_:
    print(eli5.explain_weights(clf, feature_names=onehot_columns))

Explanation(estimator='LogisticRegression()', description="\nFeatures with largest coefficients.\nCaveats:\n1. Be careful with features which are not\n   independent - weights don't show their importance.\n2. If scale of input features is different then scale of coefficients\n   will also be different, making direct comparison between coefficient values\n   incorrect.\n3. Depending on regularization, rare features sometimes may have high\n   coefficients; this doesn't mean they contribute much to the\n   classification result for most examples.\n", error=None, method='linear model', is_regression=False, targets=[TargetExplanation(target=1, feature_weights=FeatureWeights(pos=[FeatureWeight(feature='opinion_h1n1_vacc_effective_5.0', weight=1.2019643327884548, std=None, value=None), FeatureWeight(feature='doctor_recc_h1n1_1.0', weight=0.9526998320299309, std=None, value=None), FeatureWeight(feature='opinion_h1n1_risk_5.0', weight=0.7449057973799844, std=None, value=None), FeatureWeight(fe

Trying using permutation_importance
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance

In [60]:
from sklearn.inspection import permutation_importance

In [61]:
for clf in full_pipeline.steps[1][1].estimators_:
    print(permutation_importance(clf, X_test, y_test, random_state=RSEED))

ValueError: could not convert string to float: '45 - 54 Years'