In [830]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42

# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


warnings.filterwarnings('ignore')


In [831]:
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [832]:
df_features.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [833]:
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [834]:
# Get info for the target
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [835]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [836]:
df_features.isnull().sum(axis = 0)

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

- express missing values as %

Options:
- modelling for imputation (withput using our target variable!)
- or use prediction models that don't care about missing values
- or impute so that the overall distribution stays the same (based on statistics of this data set)
- or impute using a hypothesis (e.g. people who have missing values don't have health insurance?)
- for imputation we can try several approaches and see what gives best results :)

- remember--when we impute, we want to base assumptions based on the test set of a train-test split:
    - if we do multiple models, test-train split for each
    - create functions for imputation

- remember modelling for understanding (EDA style) =/= modelling for prediction

library for visualising missing values:
https://github.com/ResidentMario/missingno

In [837]:
# We are concatenating both dataframes into one 

df = pd.merge(df_target, df_features, on=['respondent_id'])

We will drop the following columns for our first iteration:    
- health_insurance, employment_industry, employment_occupation, income_poverty, marital_status, employment_status

What are the values in the features that have a lot of missing data?

In [838]:
df_features.health_insurance.value_counts()

1.0    12697
0.0     1736
Name: health_insurance, dtype: int64

Binary variable; 12% do not have health insurance, the rest do

In [839]:
df_features.employment_industry.value_counts()

fcxhlnwr    2468
wxleyezf    1804
ldnlellj    1231
pxcmvdjn    1037
atmlpfrs     926
arjwrbjb     871
xicduogh     851
mfikgejo     614
vjjrobsf     527
rucpziij     523
xqicxuve     511
saaquncn     338
cfqqtusy     325
nduyfdeo     286
mcubkhph     275
wlfvacwt     215
dotnnunm     201
haxffmxo     148
msuufmds     124
phxvnwax      89
qnlwzans      13
Name: employment_industry, dtype: int64

Anonymised variable with 21 values

In [840]:
df_features.employment_occupation.value_counts()

xtkaffoo    1778
mxkfnird    1509
emcorrxb    1270
cmhcxjea    1247
xgwztkwe    1082
hfxkjkmi     766
qxajmpny     548
xqwwgdyp     485
kldqjyjy     469
uqqtjvyb     452
tfqavkke     388
ukymxvdu     372
vlluhbov     354
oijqvulv     344
ccgxvspp     341
bxpfxfdn     331
haliazsg     296
rcertsgn     276
xzmlyyjv     248
dlvbwzss     227
hodpvpew     208
dcjcmpih     148
pvmttkik      98
Name: employment_occupation, dtype: int64

Anonymised variable with 23 values

# Data cleaning

Dropping of features with too many missing values:

In [841]:
col_drop = ['health_insurance', 'employment_industry', 'employment_occupation', 'income_poverty', 'marital_status', 'employment_status']

df.drop(col_drop, axis=1, inplace=True)

Dropping of all rows with null values:

In [842]:
df.dropna(inplace=True)

Check that all null values have been dropped:

In [843]:
df.isnull().sum(axis = 0)

respondent_id                  0
h1n1_vaccine                   0
seasonal_vaccine               0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
rent_or_own                    0
hhs_geo_region                 0
census_msa                     0
household_

In [844]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21853 entries, 0 to 26706
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                21853 non-null  int64  
 1   h1n1_vaccine                 21853 non-null  int64  
 2   seasonal_vaccine             21853 non-null  int64  
 3   h1n1_concern                 21853 non-null  float64
 4   h1n1_knowledge               21853 non-null  float64
 5   behavioral_antiviral_meds    21853 non-null  float64
 6   behavioral_avoidance         21853 non-null  float64
 7   behavioral_face_mask         21853 non-null  float64
 8   behavioral_wash_hands        21853 non-null  float64
 9   behavioral_large_gatherings  21853 non-null  float64
 10  behavioral_outside_home      21853 non-null  float64
 11  behavioral_touch_face        21853 non-null  float64
 12  doctor_recc_h1n1             21853 non-null  float64
 13  doctor_recc_seas

In [845]:
df.reset_index(inplace=True)

We dropped all rows with missing values
- Maybe later on, we will want to refine this approach.

In [846]:
 # We are looking for unique values in order to identify whether we have duplicates

df['respondent_id'].nunique()

21853

All values are unique, no duplicates. 

## EDA

In [847]:
# checking for balance in data

print(df.h1n1_vaccine.value_counts())
print(df.seasonal_vaccine.value_counts())

0    16906
1     4947
Name: h1n1_vaccine, dtype: int64
0    11371
1    10482
Name: seasonal_vaccine, dtype: int64


23% vaccination rate for H1N1 and 52% vaccination rate for seasonal flu
The H1N1 vaccine outcome appears to be unbalanced (almost 17k vs 5k) (read lit--what is considered unbalanced?)
The seasonal vaccine outcome appears to be fairly balanced

We may want to deal with the lack of balance later

In [848]:
# Understanding observations and rows

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        21853 non-null  int64  
 1   respondent_id                21853 non-null  int64  
 2   h1n1_vaccine                 21853 non-null  int64  
 3   seasonal_vaccine             21853 non-null  int64  
 4   h1n1_concern                 21853 non-null  float64
 5   h1n1_knowledge               21853 non-null  float64
 6   behavioral_antiviral_meds    21853 non-null  float64
 7   behavioral_avoidance         21853 non-null  float64
 8   behavioral_face_mask         21853 non-null  float64
 9   behavioral_wash_hands        21853 non-null  float64
 10  behavioral_large_gatherings  21853 non-null  float64
 11  behavioral_outside_home      21853 non-null  float64
 12  behavioral_touch_face        21853 non-null  float64
 13  doctor_recc_h1n1

In [849]:
#columns to drop because they're little use
col_drop = ['index', 'respondent_id']

df.drop(col_drop, axis=1, inplace=True)

- our target variables are h1n1_vaccine and seasonal_vaccine
- at the moment we are working with 28 feature variables--all categorical (refer to challenge documentation for description; we will need to transfer this info to the README)
- seven of the variables are strings--we will convert these to numeric encoding so we can look at correlations in Profiler
household_adults and household_children are 'top-coded' up to 3--that means that household with 3+ adults (or children) will fall into the '3' group
- 'hhs_geo_region' is an anonymised string
- we should remember that the current column names (which would be used as labels in the graphs) are not really human-readable--we need to keep this in mind when we're making plots (either rename the columns beforehand, or include a plotting command to change the labels)

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

Conversion of string variables to numeric (so these variables het displayed in Profiler properly) vie manual numeric encoding:

In [850]:
#a separate dataframe is made for the Profiler; the original dataframe will be retained for one-hot encoding (so the column headings we get during one-hot encoding remain meaningful)
df["age_group"].value_counts()

65+ Years        5393
55 - 64 Years    4655
45 - 54 Years    4390
18 - 34 Years    4277
35 - 44 Years    3138
Name: age_group, dtype: int64

In [851]:
df["education"].value_counts()

College Graduate    8839
Some College        6123
12 Years            4963
< 12 Years          1928
Name: education, dtype: int64

In [852]:
df["race"].value_counts()

White                17485
Black                 1670
Hispanic              1428
Other or Multiple     1270
Name: race, dtype: int64

In [853]:
df["sex"].value_counts()

Female    13105
Male       8748
Name: sex, dtype: int64

In [854]:
df["rent_or_own"].value_counts()

Own     16647
Rent     5206
Name: rent_or_own, dtype: int64

In [855]:
df["hhs_geo_region"].value_counts()

lzgpxyit    3481
fpwskwrf    2626
qufhixun    2588
bhuqouqj    2373
oxchjgsf    2356
kbazzjca    2299
mlyzmhmf    1832
atmpeygn    1694
lrircsnp    1686
dqpwygqj     918
Name: hhs_geo_region, dtype: int64

In [856]:
df["census_msa"].value_counts()

MSA, Not Principle  City    9558
MSA, Principle City         6362
Non-MSA                     5933
Name: census_msa, dtype: int64

In [857]:
cleanup = {"age_group": {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4,
                                  "65+ Years": 5},
            "education": {"< 12 Years": 1, "12 Years": 2, "Some College": 3, "College Graduate": 4},
            "race": {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
            "sex" : {"Female": 1, "Male": 2},
            "rent_or_own" : {"Own": 1, "Rent": 2},
            "hhs_geo_region" : {"lzgpxyit": 1, "fpwskwrf": 2, "qufhixun": 3, "bhuqouqj": 4, "oxchjgsf": 5, "kbazzjca": 6, "mlyzmhmf": 7, "atmpeygn": 8, "lrircsnp": 9, "dqpwygqj": 10},
            "census_msa" : {"MSA, Not Principle  City": 1, "MSA, Principle City": 2, "Non-MSA": 3}
                                  }

In [858]:
df_for_profiler = df.replace(cleanup)
df_for_profiler.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,opinion_seas_sick_from_vacc,age_group,education,race,sex,rent_or_own,hhs_geo_region,census_msa,household_adults,household_children
0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,4,1,1,1,1,5,3,0.0,0.0
1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,...,4.0,2,2,1,2,2,4,1,0.0,0.0
2,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,5,2,1,1,2,9,2,0.0,0.0
3,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,4.0,3,3,1,1,1,3,1,1.0,0.0
4,0,0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,4.0,5,2,1,2,1,8,2,2.0,3.0


Run Profiler to explore the data:

In [859]:
#import Profiler
#before opening VS Code, run this command in the terminal: pip install pandas-profiling==2.11.0
from pandas_profiling import ProfileReport

In [860]:
profile = ProfileReport(df_for_profiler, title="Pandas Profiling Report", explorative=True)

In [861]:
#profile

**Possible multicollinearity (between features) based on heatmap:**
- behavioral_large_gatherings vs behavioral_outside_home
- doctor_recc_h1n1 vs doctor_recc_seasonal
- opinion_h1n1_risk vs opinion_seas_risk
- household_children vs age_group

**Possible outliers and features to be aware of:**
- behavioral_antiviral_meds : very unbalanced between categories; the people taking anviral meds could have something else going on (e.g. already sick, or worried about getting flu and taking meds profilactically)--be careful about this variable
- behavioral_face_mask
- behavioral_wash_hands
-child_under_6_months
- health_worker
- opinion_h1n1_vacc_effective and opinion_seasonal_vacc_effective (1.0 group)
- race (not many non-white respondents)
- household_adults and household_children (3.0 groups pretty small)

## Creating Pipelines

In [862]:
# Pipeline for categorical features
#CHECK HERE--ARE WE DROPPING THE FIRST COLUMN TO PREVENT MULTICOLINEARITY?
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [863]:
cat_features = list(df.columns)


Removal of target variables from cat_features list:

In [864]:
cat_features.remove('h1n1_vaccine')

In [865]:
cat_features.remove('seasonal_vaccine')

Rename the features and target to 'X' and 'y', to make the test-train split easier:

In [866]:
y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [867]:
y = y.to_numpy()
y

array([[0, 0],
       [0, 1],
       [0, 1],
       ...,
       [0, 0],
       [0, 1],
       [0, 0]])

In [868]:

#NB: the H1N1 vaccine and seasonal vaccine are left in, otherwise the pipeline doesn't run properly
#X = df

#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [869]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [870]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()#(penalty="l2", C=1)
)


In [871]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [872]:

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [873]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (17482, 29)
X_test shape: (4371, 29)
y_train shape: (17482, 2)
y_test shape: (4371, 2)


In [874]:
full_pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

In [875]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

In [876]:
preds = full_pipeline.predict(X_test)


Model evaluation

In [877]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], preds[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], preds[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], preds[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], preds[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], preds[:, 0])))

Accuracy: 0.83
Recall: 0.44
Precision: 0.67
F1: 0.54
ROC: 0.69


In [878]:
# Evaluation Metrices for Seasonal Flu Vaccines
print("train data: {:.2f}".format(accuracy_score(y_test[:, 1], preds[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], preds[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], preds[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], preds[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], preds[:, 1])))

train data: 0.78
Recall: 0.74
Precision: 0.79
F1: 0.76
ROC: 0.78


In [879]:
y_test[:, 0]

array([1, 0, 0, ..., 0, 0, 0])

In [880]:
y_test

array([[1, 1],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 1],
       [0, 0]])

ROC is chosen for the following reasons:
1. curve consideres both -ves and +ves 
2. AUC_score tells how well model distinquishes between -ves and +ves 
3. Both outcomes are valuable because there is nor preference for either
4. For further reading https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc


The goal for the AUC_score is pegged at 0.8 based of the challenge data and the 
benchmarks reached i the competition https://www.researchgate.net/post/What-is-the-value-of-the-area-under-the-roc-curve-AUC-to-conclude-that-a-classifier-is-excellent

## BASELINE MODEL RESULTS

In [881]:
dummy_classifier = DummyClassifier()
dummy_classifier.fit(X_train, y_train)


DummyClassifier()

In [882]:
dummy_train_pred = dummy_classifier.predict(X_train)
dummy_test_pred = dummy_classifier.predict(X_test)

In [883]:
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], dummy_test_pred[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], dummy_test_pred[:, 1])))

ROC: 0.50
ROC: 0.50


# Single Label Modelling

In [884]:
# pipeline for the single label

full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
    
])

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [885]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only

In [886]:
y = y.to_numpy()
y

array([0, 0, 0, ..., 0, 0, 0])

In [887]:
#NB: the H1N1 vaccine and seasonal vaccine are left in, otherwise the pipeline doesn't run properly
#X = df

#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
#X = df.drop(columns=['seasonal_vaccine'])


In [888]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for h1n1_vaccine

In [889]:
full_pipeline_1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

In [890]:
preds = full_pipeline_1.predict(X_test)

In [891]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, preds)))

Accuracy: 0.83
Recall: 0.45
Precision: 0.67
F1: 0.54
ROC: 0.69


## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [892]:
y = df['seasonal_vaccine'].copy() # for seasonal_vaccine only

In [893]:
y = y.to_numpy()
y

array([0, 1, 1, ..., 0, 1, 0])

In [894]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for seasonal_vaccine

In [895]:
full_pipeline_1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

In [896]:
preds = full_pipeline_1.predict(X_test)

In [897]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, preds)))

Accuracy: 0.79
Recall: 0.76
Precision: 0.79
F1: 0.78
ROC: 0.79


## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [898]:
cat_features_new = list(df.columns)

In [899]:
cat_features_new.remove('seasonal_vaccine')

In [900]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X = df.drop(columns=['seasonal_vaccine'])

In [901]:
y = df['seasonal_vaccine'].copy()
y

0        0
1        1
2        1
3        0
4        0
        ..
21848    0
21849    0
21850    0
21851    1
21852    0
Name: seasonal_vaccine, Length: 21853, dtype: int64

In [902]:
y = y.to_numpy()
y

array([0, 1, 1, ..., 0, 1, 0])

In [903]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features_new)
])

In [904]:
full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
    
])

In [905]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for Seasonal_vaccine

In [906]:
full_pipeline_1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touc

In [907]:
preds = full_pipeline_1.predict(X_test)

In [908]:
# Evaluation Metrices for Seasonal Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, preds)))

Accuracy: 0.81
Recall: 0.79
Precision: 0.82
F1: 0.80
ROC: 0.81


## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [909]:
cat_features_new = list(df.columns)


In [910]:
cat_features_new.remove('h1n1_vaccine')

In [911]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X = df.drop(columns=['h1n1_vaccine'])

In [912]:
y = df['h1n1_vaccine'].copy()
y

0        0
1        0
2        0
3        0
4        0
        ..
21848    0
21849    0
21850    0
21851    0
21852    0
Name: h1n1_vaccine, Length: 21853, dtype: int64

In [913]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features_new)
])

In [914]:
full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
    
])

In [915]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for H1N1_vaccine

In [916]:
full_pipeline_1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['seasonal_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_

In [917]:
preds = full_pipeline_1.predict(X_test)

In [918]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, preds)))

Accuracy: 0.84
Recall: 0.54
Precision: 0.71
F1: 0.61
ROC: 0.74


# Modelling Algorithms

In addition to Logistic regression, we are trying four different models to compare performance in terms of predicting the Vaccine Intake:

- K nearest neighbours
- Random Forest
- Support Vector Machine
- Naive Bayes

Instantiate the models:

In [919]:
knn_model = KNeighborsClassifier()
rand_forst_model = RandomForestClassifier()
#svm_model = SVC() 



Create Pipeline for each:

In [920]:
# for KNN
estimators_knn= MultiOutputClassifier(
    estimator=knn_model
)

# for Random Forest
estimators_rand_forst= MultiOutputClassifier(
    estimator=rand_forst_model
)


In [946]:
cat_features = list(df.columns)
cat_features.remove('h1n1_vaccine')
cat_features.remove('seasonal_vaccine')
#cat_features

In [947]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [949]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators_knn),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators_rand_forst),
])

In [950]:
y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()
y

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,0,1
2,0,1
3,0,0
4,0,0
...,...,...
21848,0,0
21849,0,0
21850,0,0
21851,0,1


In [951]:
y = y.to_numpy()
y


array([[0, 0],
       [0, 1],
       [0, 1],
       ...,
       [0, 0],
       [0, 1],
       [0, 0]])

In [961]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [962]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

Fit the data:

In [963]:
#decs_tree_model.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

Get predictions:

In [989]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)


In [990]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

### Evaluating model performance for Multilabel

KNN:

In [966]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], knn_test_pred[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], knn_test_pred[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], knn_test_pred[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], knn_test_pred[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], knn_test_pred[:, 0])))

Accuracy: 0.80
Recall: 0.34
Precision: 0.58
F1: 0.43
ROC: 0.64


In [967]:
# Evaluation Metrices for Seasonal Flu Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], knn_test_pred[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], knn_test_pred[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], knn_test_pred[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], knn_test_pred[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], knn_test_pred[:, 1])))

Accuracy: 0.72
Recall: 0.67
Precision: 0.72
F1: 0.70
ROC: 0.72


Random Forest:

In [968]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], rand_forst_test_pred[:, 0])))

Accuracy: 0.83
Recall: 0.40
Precision: 0.70
F1: 0.51
ROC: 0.68


In [969]:
# Evaluation Metrices for Seasonal Flu Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], rand_forst_test_pred[:, 1])))

Accuracy: 0.77
Recall: 0.74
Precision: 0.77
F1: 0.75
ROC: 0.77


# Single Label Modelling of the four other algorithms

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [997]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only
y = y.to_numpy()
y

array([0, 0, 0, ..., 0, 0, 0])

In [995]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for h1n1_vaccine

### Pipeline for the single label

In [986]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

In [987]:
#decs_tree_model.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

In [988]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [991]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

### Evaluating model performance for Multilabel

KNN:

In [992]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_pred)))

Accuracy: 0.80
Recall: 0.34
Precision: 0.59
F1: 0.43
ROC: 0.64


Random Forest:

In [993]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))

Accuracy: 0.82
Recall: 0.40
Precision: 0.69
F1: 0.51
ROC: 0.67


## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [None]:
y = df['seasonal_vaccine'].copy() # seasonal_vaccine only
y = y.to_numpy()
y

In [994]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for seasonal_vaccine

### Fitting Pipeline for the single label 

In [998]:
#decs_tree_model.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                   'doctor

In [999]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [1000]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

KNN:

In [1002]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_pred)))

Accuracy: 0.80
Recall: 0.34
Precision: 0.59
F1: 0.43
ROC: 0.64


Random Forest:

In [1003]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))

Accuracy: 0.82
Recall: 0.40
Precision: 0.68
F1: 0.50
ROC: 0.67


## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [1054]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only
y = y.to_numpy()
y

array([0, 0, 0, ..., 0, 0, 0])

In [1055]:
cat_features = list(df.columns)
cat_features.remove('h1n1_vaccine')

#cat_features

In [1057]:
#NB: dropping the 'h1n1_vaccine' column
X = df.drop(columns=['h1n1_vaccine'])

In [1058]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [1059]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [1060]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

In [1061]:
#decs_tree_model.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['seasonal_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_

In [1062]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [1063]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

KNN:

In [1064]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_pred)))

Accuracy: 0.80
Recall: 0.40
Precision: 0.60
F1: 0.48
ROC: 0.66


Random Forest:

In [1065]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))

Accuracy: 0.84
Recall: 0.50
Precision: 0.72
F1: 0.59
ROC: 0.72


## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [1040]:
y = df['seasonal_vaccine'].copy() # seasonal_vaccine only
y = y.to_numpy()
y

array([0, 1, 1, ..., 0, 1, 0])

In [1041]:
cat_features = list(df.columns)

cat_features.remove('seasonal_vaccine')
#cat_features

In [1042]:
#NB: dropping the 'seasonal_vaccine' column
X = df.drop(columns=['seasonal_vaccine'])

In [1043]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [1044]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [1045]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

In [1046]:
#decs_tree_model.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['h1n1_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touc

In [1047]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [1048]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

KNN:

In [1049]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_pred)))

Accuracy: 0.75
Recall: 0.71
Precision: 0.75
F1: 0.73
ROC: 0.75


Random Forest:

In [1050]:
# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
print("ROC: {:.2f}".format(roc_auc_score(y_test, rand_forst_test_pred)))

Accuracy: 0.81
Recall: 0.80
Precision: 0.80
F1: 0.80
ROC: 0.81


The dummy classifier predicts everything to belong to the same class and thus has no discriminatory ability (between negative and positive class). Therefore, the AUC of 0.5 is expected.

## FEATURE IMPORTANCE (STILL A WORK IN PROGRESS)

In [1052]:
from sklearn.feature_selection import RFE

### Trial with coef_ method from logistic regresion

In [1053]:
for clf in full_pipeline.steps[1][1].estimators_: #this part is necessary to go into the relevant step of the pipeline and get the logistic regression estimator
    print(clf.coef_)

      

    #for i,v in enumerate(clf.coef_[0]):
    #    print(i,v)
#grid_fit.best_estimator_.feature_importances_})
#print(feat_impts)
#importance = np.mean(feat_impts, axis=0)

#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))

[[-0.10357361 -0.16714731 -0.40651462  0.07085558  0.22729183  0.1573705
  -0.06095467  0.1647271   0.00332115 -0.23961287 -0.04880651  0.03207374
   1.99824315 -0.52051828  0.1820238   0.30417929  0.87285368 -0.12197159
   0.41472292  0.92064823  1.80182327  0.47073037  0.77778782  1.22196088
   1.56883148 -0.24939592 -0.48901931 -0.07287853 -0.10387168 -0.18307575
   0.4796205   0.03279002  0.16292894  0.4676016   0.64920137  0.68383914
   0.77427248 -0.11498134 -0.64939288 -0.12010861 -0.38608998 -0.00453318
   0.05347845  0.3672417   0.51079646 -0.22750724  0.16724167  0.02023139
   0.19683534  0.46612727  0.34602656  0.12714099  0.01542347 -0.11560047
  -0.23021309 -0.12981725 -0.03246989 -0.06401881 -0.16827476  0.07824455
   0.02153747 -0.00232839  0.03689621  0.05773653  0.13752812  0.0897294
  -0.02399786  0.14312771 -0.04949641 -0.0533244 ]]
[[ 0.01670619  0.0393174  -0.08075905  0.27998922  0.47864558  0.04363249
  -0.03454198  0.04841339  0.04244148 -0.03141068 -0.04117996 

The output gives us coefficients for features, but without knowing what features these coefficients belong to, this output is quite meaningless

### Feature importance extracted using ELI5  
https://towardsdatascience.com/extracting-feature-importances-from-scikit-learn-pipelines-18c79b4ae09a

In [936]:
#pip install eli5 in external terminal
import eli5

ModuleNotFoundError: No module named 'eli5'

In [None]:
full_pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('cat',
                                   Pipeline(steps=[('1hot',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['h1n1_concern', 'h1n1_knowledge',
                                    'behavioral_antiviral_meds',
                                    'behavioral_avoidance',
                                    'behavioral_face_mask',
                                    'behavioral_wash_hands',
                                    'behavioral_large_gatherings',
                                    'behavioral_outside_home',
                                    'behavioral_touch_face', 'doctor_recc_h1n1',
                                    'doctor_recc_se...l',
                                    'chronic_med_condition',
                                    'child_under_6_months', 'health_worker',
                                    'opinion_h1n1_vacc_effecti

In [None]:
onehot_columns = list(full_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['1hot'].get_feature_names(input_features=cat_features))
#numeric_features_list = list(numeric_features)
#numeric_features_list.extend(onehot_columns)

In [None]:
for clf in full_pipeline.steps[1][1].estimators_:
    print(eli5.explain_weights(clf, feature_names=onehot_columns))

Explanation(estimator='LogisticRegression()', description="\nFeatures with largest coefficients.\nCaveats:\n1. Be careful with features which are not\n   independent - weights don't show their importance.\n2. If scale of input features is different then scale of coefficients\n   will also be different, making direct comparison between coefficient values\n   incorrect.\n3. Depending on regularization, rare features sometimes may have high\n   coefficients; this doesn't mean they contribute much to the\n   classification result for most examples.\n", error=None, method='linear model', is_regression=False, targets=[TargetExplanation(target=1, feature_weights=FeatureWeights(pos=[FeatureWeight(feature='opinion_h1n1_vacc_effective_5.0', weight=1.2019643327884548, std=None, value=None), FeatureWeight(feature='doctor_recc_h1n1_1.0', weight=0.9526998320299309, std=None, value=None), FeatureWeight(feature='opinion_h1n1_risk_5.0', weight=0.7449057973799844, std=None, value=None), FeatureWeight(fe

Here we get the weights of the features and the feature name--but it looks fairly unreadable. We should be able to get the visual table from ELI5 with the ranking of the features

### Trying using permutation_importance (not really working for now)
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
for clf in full_pipeline.steps[1][1].estimators_:
    print(permutation_importance(clf, X_test, y_test, random_state=RSEED))

ValueError: could not convert string to float: '45 - 54 Years'