Notes on MLFlow:
- 'Run name' field: model name, type of output (multilabel vs unilabel), which vaccine (for multiclass only)
- 'Parameters' field: methods applied for data (data cleaning, data balancing, hyperparameters)--insert feature engineering info here, if relevant?
- 'Tags' field: details about the features used for the run (is one of the vaccines in the features?)

In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings

#mlflow import
import mlflow
from modeling.config import EXPERIMENT_NAME_multilabel, EXPERIMENT_NAME_h1n1, EXPERIMENT_NAME_seasonal, EXPERIMENT_NAME_multiclass
# Read the MLflow tracking URI from the local config file. The context manager
# closes the file handle as soon as the value has been read (the bare
# open(...).read() form left the handle open until garbage collection).
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

RSEED = 42

# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

!pip install plotly
import plotly.express as px


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm


warnings.filterwarnings('ignore')




In [2]:
# Load the training features (one row per respondent) from the challenge CSV.
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [3]:
df_features.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
# Load the training labels (h1n1_vaccine and seasonal_vaccine per respondent) from the challenge CSV.
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [5]:
# Get info for the target
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


In [6]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [7]:
df_features.isnull().sum(axis = 0)

respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

- express missing values as %

Options:
- modelling for imputation (without using our target variable!)
- or use prediction models that don't care about missing values
- or impute so that the overall distribution stays the same (based on statistics of this data set)
- or impute using a hypothesis (e.g. people who have missing values don't have health insurance?)
- for imputation we can try several approaches and see what gives best results :)

- remember--when we impute, the imputation values should be derived from the training set of a train-test split only (never from the test set):
    - if we do multiple models, test-train split for each
    - create functions for imputation

- remember modelling for understanding (EDA style) =/= modelling for prediction

library for visualising missing values:
https://github.com/ResidentMario/missingno

In [8]:
# Join targets and features into one dataframe on the respondent ID (a key-based
# merge, not a concatenation).
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying rows.

df = pd.merge(df_target, df_features, on=['respondent_id'], validate='one_to_one')

We will drop the following columns for our first iteration:    
- health_insurance, employment_industry, employment_occupation, income_poverty, marital_status, employment_status

What are the values in the features that have a lot of missing data?

In [9]:
df_features.health_insurance.value_counts()

1.0    12697
0.0     1736
Name: health_insurance, dtype: int64

Binary variable; among the respondents who answered, 12% do not have health insurance and 88% do (12,274 values are missing)

In [10]:
df_features.employment_industry.value_counts()

fcxhlnwr    2468
wxleyezf    1804
ldnlellj    1231
pxcmvdjn    1037
atmlpfrs     926
arjwrbjb     871
xicduogh     851
mfikgejo     614
vjjrobsf     527
rucpziij     523
xqicxuve     511
saaquncn     338
cfqqtusy     325
nduyfdeo     286
mcubkhph     275
wlfvacwt     215
dotnnunm     201
haxffmxo     148
msuufmds     124
phxvnwax      89
qnlwzans      13
Name: employment_industry, dtype: int64

Anonymised variable with 21 values

In [11]:
df_features.employment_occupation.value_counts()

xtkaffoo    1778
mxkfnird    1509
emcorrxb    1270
cmhcxjea    1247
xgwztkwe    1082
hfxkjkmi     766
qxajmpny     548
xqwwgdyp     485
kldqjyjy     469
uqqtjvyb     452
tfqavkke     388
ukymxvdu     372
vlluhbov     354
oijqvulv     344
ccgxvspp     341
bxpfxfdn     331
haliazsg     296
rcertsgn     276
xzmlyyjv     248
dlvbwzss     227
hodpvpew     208
dcjcmpih     148
pvmttkik      98
Name: employment_occupation, dtype: int64

Anonymised variable with 23 values

# Data cleaning

Dropping of features with too many missing values:

In [12]:
# Features excluded from the first modelling iteration.
col_drop = [
    'health_insurance',
    'employment_industry',
    'employment_occupation',
    'income_poverty',
    'marital_status',
    'employment_status',
]

# Rebind df to the reduced frame rather than mutating it in place.
df = df.drop(columns=col_drop)

Dropping of all rows with null values:

In [13]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna()

Check that all null values have been dropped:

In [14]:
df.isnull().sum(axis = 0)

respondent_id                  0
h1n1_vaccine                   0
seasonal_vaccine               0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
rent_or_own                    0
hhs_geo_region                 0
census_msa                     0
household_

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21853 entries, 0 to 26706
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                21853 non-null  int64  
 1   h1n1_vaccine                 21853 non-null  int64  
 2   seasonal_vaccine             21853 non-null  int64  
 3   h1n1_concern                 21853 non-null  float64
 4   h1n1_knowledge               21853 non-null  float64
 5   behavioral_antiviral_meds    21853 non-null  float64
 6   behavioral_avoidance         21853 non-null  float64
 7   behavioral_face_mask         21853 non-null  float64
 8   behavioral_wash_hands        21853 non-null  float64
 9   behavioral_large_gatherings  21853 non-null  float64
 10  behavioral_outside_home      21853 non-null  float64
 11  behavioral_touch_face        21853 non-null  float64
 12  doctor_recc_h1n1             21853 non-null  float64
 13  doctor_recc_seas

In [16]:
# Renumber the rows after the dropna; the previous index is kept as a new 'index' column.
df = df.reset_index()

We dropped all rows with missing values
- Maybe later on, we will want to refine this approach.

In [17]:
 # We are looking for unique values in order to identify whether we have duplicates

df['respondent_id'].nunique()

21853

All values are unique, no duplicates. 

## EDA

In [18]:
# Class balance of the two targets: number of respondents per outcome (0 / 1)

print(df['h1n1_vaccine'].value_counts())
print(df['seasonal_vaccine'].value_counts())

0    16906
1     4947
Name: h1n1_vaccine, dtype: int64
0    11371
1    10482
Name: seasonal_vaccine, dtype: int64


23% vaccination rate for H1N1 (4,947 of 21,853) and 48% vaccination rate for seasonal flu (10,482 of 21,853)
The H1N1 vaccine outcome appears to be unbalanced (almost 17k vs 5k) (read lit--what is considered unbalanced?)
The seasonal vaccine outcome appears to be fairly balanced

We may want to deal with the lack of balance later

In [19]:
# Understanding observations and rows

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21853 entries, 0 to 21852
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        21853 non-null  int64  
 1   respondent_id                21853 non-null  int64  
 2   h1n1_vaccine                 21853 non-null  int64  
 3   seasonal_vaccine             21853 non-null  int64  
 4   h1n1_concern                 21853 non-null  float64
 5   h1n1_knowledge               21853 non-null  float64
 6   behavioral_antiviral_meds    21853 non-null  float64
 7   behavioral_avoidance         21853 non-null  float64
 8   behavioral_face_mask         21853 non-null  float64
 9   behavioral_wash_hands        21853 non-null  float64
 10  behavioral_large_gatherings  21853 non-null  float64
 11  behavioral_outside_home      21853 non-null  float64
 12  behavioral_touch_face        21853 non-null  float64
 13  doctor_recc_h1n1

In [20]:
# Identifier-like columns that carry no predictive information
col_drop = [
    'index',
    'respondent_id',
]

df = df.drop(columns=col_drop)

- our target variables are h1n1_vaccine and seasonal_vaccine
- at the moment we are working with 29 feature variables--all categorical (refer to challenge documentation for description; we will need to transfer this info to the README)
- seven of the variables are strings--we will convert these to numeric encoding so we can look at correlations in Profiler
- household_adults and household_children are 'top-coded' up to 3--that means that households with 3+ adults (or children) will fall into the '3' group
- 'hhs_geo_region' is an anonymised string
- we should remember that the current column names (which would be used as labels in the graphs) are not really human-readable--we need to keep this in mind when we're making plots (either rename the columns beforehand, or include a plotting command to change the labels)

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

Conversion of string variables to numeric (so these variables get displayed in Profiler properly) via manual numeric encoding:

In [21]:
#a separate dataframe is made for the Profiler; the original dataframe will be retained for one-hot encoding (so the column headings we get during one-hot encoding remain meaningful)
df["age_group"].value_counts()

65+ Years        5393
55 - 64 Years    4655
45 - 54 Years    4390
18 - 34 Years    4277
35 - 44 Years    3138
Name: age_group, dtype: int64

In [22]:
df["education"].value_counts()

College Graduate    8839
Some College        6123
12 Years            4963
< 12 Years          1928
Name: education, dtype: int64

In [23]:
df["race"].value_counts()

White                17485
Black                 1670
Hispanic              1428
Other or Multiple     1270
Name: race, dtype: int64

In [24]:
df["sex"].value_counts()

Female    13105
Male       8748
Name: sex, dtype: int64

In [25]:
df["rent_or_own"].value_counts()

Own     16647
Rent     5206
Name: rent_or_own, dtype: int64

In [26]:
df["hhs_geo_region"].value_counts()

lzgpxyit    3481
fpwskwrf    2626
qufhixun    2588
bhuqouqj    2373
oxchjgsf    2356
kbazzjca    2299
mlyzmhmf    1832
atmpeygn    1694
lrircsnp    1686
dqpwygqj     918
Name: hhs_geo_region, dtype: int64

In [27]:
df["census_msa"].value_counts()

MSA, Not Principle  City    9558
MSA, Principle City         6362
Non-MSA                     5933
Name: census_msa, dtype: int64

In [28]:
# Manual numeric encoding for the string columns: for every column the labels are
# listed in the order of their integer codes, and enumerate() assigns 1, 2, 3, ...
cleanup = {
    column: {label: code for code, label in enumerate(labels, start=1)}
    for column, labels in (
        ("age_group", ("18 - 34 Years", "35 - 44 Years", "45 - 54 Years", "55 - 64 Years", "65+ Years")),
        ("education", ("< 12 Years", "12 Years", "Some College", "College Graduate")),
        ("race", ("White", "Black", "Hispanic", "Other or Multiple")),
        ("sex", ("Female", "Male")),
        ("rent_or_own", ("Own", "Rent")),
        ("hhs_geo_region", ("lzgpxyit", "fpwskwrf", "qufhixun", "bhuqouqj", "oxchjgsf",
                            "kbazzjca", "mlyzmhmf", "atmpeygn", "lrircsnp", "dqpwygqj")),
        ("census_msa", ("MSA, Not Principle  City", "MSA, Principle City", "Non-MSA")),
    )
}

In [29]:
# Build a numerically encoded copy for the Profiler by applying the `cleanup` mapping;
# `df` itself keeps its string categories for the one-hot encoding later on.
df_for_profiler = df.replace(cleanup)
df_for_profiler.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,...,opinion_seas_sick_from_vacc,age_group,education,race,sex,rent_or_own,hhs_geo_region,census_msa,household_adults,household_children
0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,4,1,1,1,1,5,3,0.0,0.0
1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,...,4.0,2,2,1,2,2,4,1,0.0,0.0
2,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,5,2,1,1,2,9,2,0.0,0.0
3,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,4.0,3,3,1,1,1,3,1,1.0,0.0
4,0,0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,4.0,5,2,1,2,1,8,2,2.0,3.0


Run Profiler to explore the data:

In [30]:
#import Profiler
#!pip install pandas-profiling==2.11.0
#from pandas_profiling import ProfileReport

In [31]:
#profile = ProfileReport(df_for_profiler, title="Pandas Profiling Report", explorative=True)

In [32]:
#profile

**Possible multicollinearity (between features) based on heatmap:**
- behavioral_large_gatherings vs behavioral_outside_home
- doctor_recc_h1n1 vs doctor_recc_seasonal
- opinion_h1n1_risk vs opinion_seas_risk
- household_children vs age_group

**Possible outliers and features to be aware of:**
- behavioral_antiviral_meds : very unbalanced between categories; the people taking antiviral meds could have something else going on (e.g. already sick, or worried about getting flu and taking meds prophylactically)--be careful about this variable
- behavioral_face_mask
- behavioral_wash_hands
- child_under_6_months
- health_worker
- opinion_h1n1_vacc_effective and opinion_seas_vacc_effective (1.0 group)
- race (not many non-white respondents)
- household_adults and household_children (3.0 groups pretty small)

# Set up for modelling (stays the same for all experiments)

Set up of pipeline preprocessor:

In [33]:
# Preprocessing applied to every categorical feature (shared by all experiments):
# one-hot encode, drop the first level of each feature, and raise on categories
# that were not seen during fitting.
cat_pipeline = Pipeline(
    steps=[
        ('1hot', OneHotEncoder(drop='first', handle_unknown='error')),
    ]
)

In [34]:
# All column names of df (targets included) as a plain Python list.
cat_features = df.columns.tolist()

Instantiating the models:

In [35]:
# Base estimators shared by every experiment in this notebook.

# for Logistic Regression
logreg = LogisticRegression()

# for KNN
knn = KNeighborsClassifier()

# for Random Forest
# random_state pins the forest's random sampling so that metrics are reproducible
# from one run of the notebook to the next.
rand_forst = RandomForestClassifier(random_state=RSEED)

# for SVM
# Built from the directly imported SVC class rather than `svm.SVC`: the result is
# bound to the name `svm`, which shadows the imported `sklearn.svm` module, so the
# `svm.SVC(...)` form raised AttributeError when this cell was executed a second time.
svm = SVC(kernel='rbf')

# TheFluShot_multilabel: Multilabel prediction (both vaccinations)

Removal of target variables from cat_features list (this needs to be adjusted for each dataset):

In [36]:
# Independent copy of the column list, so removing the targets leaves cat_features intact.
cat_features_no_vacc = list(cat_features)

In [37]:
cat_features_no_vacc.remove('h1n1_vaccine')

In [38]:
cat_features_no_vacc.remove('seasonal_vaccine')

In [39]:
cat_features

['h1n1_vaccine',
 'seasonal_vaccine',
 'h1n1_concern',
 'h1n1_knowledge',
 'behavioral_antiviral_meds',
 'behavioral_avoidance',
 'behavioral_face_mask',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face',
 'doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'child_under_6_months',
 'health_worker',
 'opinion_h1n1_vacc_effective',
 'opinion_h1n1_risk',
 'opinion_h1n1_sick_from_vacc',
 'opinion_seas_vacc_effective',
 'opinion_seas_risk',
 'opinion_seas_sick_from_vacc',
 'age_group',
 'education',
 'race',
 'sex',
 'rent_or_own',
 'hhs_geo_region',
 'census_msa',
 'household_adults',
 'household_children']

Rename the features and target to 'X' and 'y', to make the test-train split easier:

In [40]:
# Two-column target frame: h1n1_vaccine first, seasonal_vaccine second.
target_columns = ['h1n1_vaccine', 'seasonal_vaccine']
y_both_vacc = df.loc[:, target_columns].copy()

In [41]:
# Convert the target frame to a NumPy array. The column order follows the selection
# above: column 0 is h1n1_vaccine and column 1 is seasonal_vaccine, which is what the
# [:, 0] / [:, 1] indexing in the evaluation cells relies on.
y_both_vacc = y_both_vacc.to_numpy()
y_both_vacc

array([[0, 0],
       [0, 1],
       [0, 1],
       ...,
       [0, 0],
       [0, 1],
       [0, 0]])

In [42]:
# Feature matrix: every column of df except the two vaccine targets.
X_no_vacc = df.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)

Performing test-train split (the same data can be used for each model in multilabelling):

In [43]:

# 80/20 split, stratified on the pair of targets and seeded with RSEED.
(
    X_no_vacc_train,
    X_no_vacc_test,
    y_both_vacc_train,
    y_both_vacc_test,
) = train_test_split(
    X_no_vacc,
    y_both_vacc,
    test_size=0.2,
    random_state=RSEED,
    stratify=y_both_vacc,
)

In [44]:
# Report the dimensions of each part of the split.
print(f'X_no_vacc_train shape: {X_no_vacc_train.shape}')
print(f'X_no_vacc_test shape: {X_no_vacc_test.shape}')
print(f'y_both_vacc_train: {y_both_vacc_train.shape}')
print(f'y_both_vacc_test: {y_both_vacc_test.shape}')

X_no_vacc_train shape: (17482, 29)
X_no_vacc_test shape: (4371, 29)
y_both_vacc_train: (17482, 2)
y_both_vacc_test: (4371, 2)


Setting up the preprocessor (the same one can be used for each modelling in multilabelling):

In [45]:
# Apply the categorical pipeline to every feature column (targets excluded).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, cat_features_no_vacc),
    ]
)

Setting up the multilabel estimators for each model:

In [46]:
# Wrap each base estimator in a MultiOutputClassifier so it is trained on both
# vaccine targets.

# logistic regression
multilabel_est_logreg = MultiOutputClassifier(logreg)

# KNN
multilabel_est_knn = MultiOutputClassifier(knn)

# Random Forest
multilabel_est_rand_forst = MultiOutputClassifier(rand_forst)

# SVM
multilabel_est_SVC = MultiOutputClassifier(svm)

Setting up the pipeline for each model:

In [47]:
# One pipeline per model: the common preprocessor followed by that model's
# multilabel estimator.

# logreg
logreg_multilabel_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", multilabel_est_logreg),
])

# KNN
knn_multilabel_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", multilabel_est_knn),
])

# Random Forest
rand_forst_multilabel_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", multilabel_est_rand_forst),
])

# SVM
svm_multilabel_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", multilabel_est_SVC),
])

Training the models:

In [48]:
# Fit every multilabel pipeline on the same training split
# (X_no_vacc_train / y_both_vacc_train) and keep the result of each fit() call
# under a model-specific name for the prediction cells below.

# for logreg
logreg_multilabel = logreg_multilabel_pipeline.fit(X_no_vacc_train,  y_both_vacc_train)

# for KNN
knn_multilabel = knn_multilabel_pipeline.fit(X_no_vacc_train,  y_both_vacc_train)

# for Random Forest
rand_forst_multilabel = rand_forst_multilabel_pipeline.fit(X_no_vacc_train,  y_both_vacc_train)

#for SVM
svm_multilabel = svm_multilabel_pipeline.fit(X_no_vacc_train,  y_both_vacc_train)

In [49]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

Making predictions based on train and test data:

In [50]:
# Predict on both the training and the test features with every fitted model.
# The evaluation cells index these arrays by column: [:, 0] for the H1N1 vaccine
# and [:, 1] for the seasonal vaccine.
# NOTE: these come from predict(), so they are class predictions rather than
# probability scores.

# for logreg
logreg_multilabel_trainpreds = logreg_multilabel.predict(X_no_vacc_train)
logreg_multilabel_testpreds = logreg_multilabel.predict(X_no_vacc_test)

# for KNN
knn_multilabel_trainpreds = knn_multilabel.predict(X_no_vacc_train)
knn_multilabel_testpreds = knn_multilabel.predict(X_no_vacc_test)

# for Random Forest
rand_forst_multilabel_trainpreds = rand_forst_multilabel.predict(X_no_vacc_train)
rand_forst_multilabel_testpreds = rand_forst_multilabel.predict(X_no_vacc_test)

# for SVM
svm_multilabel_trainpreds = svm_multilabel.predict(X_no_vacc_train)
svm_multilabel_testpreds = svm_multilabel.predict(X_no_vacc_test)

### Model evaluation

#### Train data

In [51]:
# Logreg--Train data evaluation metrics for the H1N1 vaccine (target column 0)
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    h1n1_logreg_multilabel_train_acc,
    h1n1_logreg_multilabel_train_recall,
    h1n1_logreg_multilabel_train_precision,
    h1n1_logreg_multilabel_train_f1,
    h1n1_logreg_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 0], logreg_multilabel_trainpreds[:, 0])))

In [52]:
# Logreg--Train data evaluation metrics for the seasonal vaccine (target column 1)
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    seasonal_logreg_multilabel_train_acc,
    seasonal_logreg_multilabel_train_recall,
    seasonal_logreg_multilabel_train_precision,
    seasonal_logreg_multilabel_train_f1,
    seasonal_logreg_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 1], logreg_multilabel_trainpreds[:, 1])))

In [53]:
# KNN--Train data evaluation metrics for the H1N1 vaccine (target column 0)
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    h1n1_knn_multilabel_train_acc,
    h1n1_knn_multilabel_train_recall,
    h1n1_knn_multilabel_train_precision,
    h1n1_knn_multilabel_train_f1,
    h1n1_knn_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 0], knn_multilabel_trainpreds[:, 0])))

In [54]:
# KNN--Train data evaluation metrics for the seasonal vaccine (target column 1)
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    seasonal_knn_multilabel_train_acc,
    seasonal_knn_multilabel_train_recall,
    seasonal_knn_multilabel_train_precision,
    seasonal_knn_multilabel_train_f1,
    seasonal_knn_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 1], knn_multilabel_trainpreds[:, 1])))

In [55]:
# Random Forest--Train data evaluation metrics for the H1N1 vaccine (target column 0)
# TODO: these results are slightly off from the previous notebook (by 0.01). Check whether
# the variation comes from the test-train split (export the split as CSV and try again)
# or from the random forest itself (different starting stump each time?).
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    h1n1_rand_forst_multilabel_train_acc,
    h1n1_rand_forst_multilabel_train_recall,
    h1n1_rand_forst_multilabel_train_precision,
    h1n1_rand_forst_multilabel_train_f1,
    h1n1_rand_forst_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 0], rand_forst_multilabel_trainpreds[:, 0])))

In [56]:
# Random Forest--Train data evaluation metrics for the seasonal vaccine (target column 1)
# TODO: these results are slightly off from the previous notebook (by 0.01). Check whether
# the variation comes from the test-train split (export the split as CSV and try again)
# or from the random forest itself (different starting stump each time?).
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    seasonal_rand_forst_multilabel_train_acc,
    seasonal_rand_forst_multilabel_train_recall,
    seasonal_rand_forst_multilabel_train_precision,
    seasonal_rand_forst_multilabel_train_f1,
    seasonal_rand_forst_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 1], rand_forst_multilabel_trainpreds[:, 1])))

In [57]:
# SVM--Train data evaluation metrics for the H1N1 vaccine (target column 0)
# NOTE: the ROC AUC is computed from the class predictions of .predict(), not from
# probability scores.
(
    h1n1_svm_multilabel_train_acc,
    h1n1_svm_multilabel_train_recall,
    h1n1_svm_multilabel_train_precision,
    h1n1_svm_multilabel_train_f1,
    h1n1_svm_multilabel_train_roc,
) = (
    score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])
    for score in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_train[:, 0], svm_multilabel_trainpreds[:, 0])))

In [58]:
# SVM (multilabel model): training-split metrics for the seasonal target (column 1).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    seasonal_svm_multilabel_train_acc,
    seasonal_svm_multilabel_train_recall,
    seasonal_svm_multilabel_train_precision,
    seasonal_svm_multilabel_train_f1,
    seasonal_svm_multilabel_train_roc,
) = (
    scorer(y_both_vacc_train[:, 1], svm_multilabel_trainpreds[:, 1])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#### Test data

In [59]:
# Logistic regression (multilabel model): test-split metrics for the H1N1 target (column 0).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    h1n1_logreg_multilabel_test_acc,
    h1n1_logreg_multilabel_test_recall,
    h1n1_logreg_multilabel_test_precision,
    h1n1_logreg_multilabel_test_f1,
    h1n1_logreg_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 0], logreg_multilabel_testpreds[:, 0])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [60]:
# Logistic regression (multilabel model): test-split metrics for the seasonal target (column 1).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    seasonal_logreg_multilabel_test_acc,
    seasonal_logreg_multilabel_test_recall,
    seasonal_logreg_multilabel_test_precision,
    seasonal_logreg_multilabel_test_f1,
    seasonal_logreg_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 1], logreg_multilabel_testpreds[:, 1])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [61]:
# KNN (multilabel model): test-split metrics for the H1N1 target (column 0).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    h1n1_knn_multilabel_test_acc,
    h1n1_knn_multilabel_test_recall,
    h1n1_knn_multilabel_test_precision,
    h1n1_knn_multilabel_test_f1,
    h1n1_knn_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 0], knn_multilabel_testpreds[:, 0])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [62]:
# KNN (multilabel model): test-split metrics for the seasonal target (column 1).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    seasonal_knn_multilabel_test_acc,
    seasonal_knn_multilabel_test_recall,
    seasonal_knn_multilabel_test_precision,
    seasonal_knn_multilabel_test_f1,
    seasonal_knn_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 1], knn_multilabel_testpreds[:, 1])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [63]:
# Random forest (multilabel model): test-split metrics for the H1N1 target (column 0).
# TODO: these results are slightly off (by ~0.01) from the previous notebook. Check
# whether the variation comes from the train/test split (export the split as CSV and
# retry) or from the random forest itself starting differently on each run
# (presumably no fixed random_state on the estimator -- verify).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy.
(
    h1n1_rand_forst_multilabel_test_acc,
    h1n1_rand_forst_multilabel_test_recall,
    h1n1_rand_forst_multilabel_test_precision,
    h1n1_rand_forst_multilabel_test_f1,
    h1n1_rand_forst_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 0], rand_forst_multilabel_testpreds[:, 0])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [64]:
# Random forest (multilabel model): test-split metrics for the seasonal target (column 1).
# TODO: these results are slightly off (by ~0.01) from the previous notebook. Check
# whether the variation comes from the train/test split (export the split as CSV and
# retry) or from the random forest itself starting differently on each run
# (presumably no fixed random_state on the estimator -- verify).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy.
(
    seasonal_rand_forst_multilabel_test_acc,
    seasonal_rand_forst_multilabel_test_recall,
    seasonal_rand_forst_multilabel_test_precision,
    seasonal_rand_forst_multilabel_test_f1,
    seasonal_rand_forst_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 1], rand_forst_multilabel_testpreds[:, 1])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [65]:
# SVM (multilabel model): test-split metrics for the H1N1 target (column 0).
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
(
    h1n1_svm_multilabel_test_acc,
    h1n1_svm_multilabel_test_recall,
    h1n1_svm_multilabel_test_precision,
    h1n1_svm_multilabel_test_f1,
    h1n1_svm_multilabel_test_roc,
) = (
    scorer(y_both_vacc_test[:, 0], svm_multilabel_testpreds[:, 0])
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [66]:
# SVM--Test data evaluation metrics for seasonal vaccines (column 1 of the multilabel target)
# NOTE(review): roc_auc_score gets the same array as accuracy_score, so it presumably
# holds hard class labels; on hard labels ROC AUC reduces to balanced accuracy --
# confirm that probabilities/decision scores were not intended.
seasonal_svm_multilabel_test_acc = accuracy_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])
seasonal_svm_multilabel_test_recall = recall_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])
seasonal_svm_multilabel_test_precision = precision_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])
seasonal_svm_multilabel_test_f1 = f1_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])
seasonal_svm_multilabel_test_roc = roc_auc_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])

# Optional quick look at the values (kept disabled); uses the SVM predictions, matching the code above.
#print("Accuracy: {:.2f}".format(accuracy_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])))
#print("Recall: {:.2f}".format(recall_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])))
#print("Precision: {:.2f}".format(precision_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])))
#print("F1: {:.2f}".format(f1_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])))
#print("ROC: {:.2f}".format(roc_auc_score(y_both_vacc_test[:, 1], svm_multilabel_testpreds[:, 1])))

## Tracking the model with MLFlow

### H1N1 vaccine output

#### Logistic regression

In [67]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)  # adjust for each experiment
name = "logreg_multilabel_h1n1"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [68]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: 729a56eea9814086a02a5b290c445eef


In [69]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [70]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (h1n1_logreg_multilabel_train_roc, h1n1_logreg_multilabel_test_roc),
    "accuracy": (h1n1_logreg_multilabel_train_acc, h1n1_logreg_multilabel_test_acc),
    "recall": (h1n1_logreg_multilabel_train_recall, h1n1_logreg_multilabel_test_recall),
    "precision": (h1n1_logreg_multilabel_train_precision, h1n1_logreg_multilabel_test_precision),
    "f1": (h1n1_logreg_multilabel_train_f1, h1n1_logreg_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [71]:
# Read the run back from the tracking server to inspect what was logged.
mlflow.get_run(run.info.run_id)

<Run: data=<RunData: metrics={'test -ROC': 0.6908088321028654,
 'test -accuracy': 0.825440402653855,
 'test -f1': 0.5356055995130858,
 'test -precision': 0.672782874617737,
 'test -recall': 0.4448938321536906,
 'train -ROC': 0.7053562554709693,
 'train -accuracy': 0.833485871181787,
 'train -f1': 0.5616624002409276,
 'train -precision': 0.6951174058889303,
 'train -recall': 0.4711975745325922}, params={'Data balancing': 'None',
 'Data cleaning': 'Drop all nulls',
 'Hyperparameters': 'None'}, tags={'Vaccines in features': 'None',
 'mlflow.runName': 'logreg_multilabel_h1n1',
 'mlflow.source.git.commit': '4d215498c00f64a529b2e337703b17da7ed152e1',
 'mlflow.source.name': '/Users/christinarudolf/Documents/neuefische_ds/TheFluShot/.venv/lib/python3.8/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'christinarudolf'}>, info=<RunInfo: artifact_uri='s3://neuefische-mlflow/mlflow-artifacts/24/729a56eea9814086a02a5b290c445eef/artifacts', end_time=162703240586

#### KNN

In [72]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "knn_multilabel_h1n1"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [73]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: 56850378a81a46b8b0664cdf0c1aa89c


In [74]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [75]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (h1n1_knn_multilabel_train_roc, h1n1_knn_multilabel_test_roc),
    "accuracy": (h1n1_knn_multilabel_train_acc, h1n1_knn_multilabel_test_acc),
    "recall": (h1n1_knn_multilabel_train_recall, h1n1_knn_multilabel_test_recall),
    "precision": (h1n1_knn_multilabel_train_precision, h1n1_knn_multilabel_test_precision),
    "f1": (h1n1_knn_multilabel_train_f1, h1n1_knn_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [76]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [77]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "rand_forst_multilabel_h1n1"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [78]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: d28944cba4ff481ab0d0ab9d62087f4d


In [79]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [80]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (h1n1_rand_forst_multilabel_train_roc, h1n1_rand_forst_multilabel_test_roc),
    "accuracy": (h1n1_rand_forst_multilabel_train_acc, h1n1_rand_forst_multilabel_test_acc),
    "recall": (h1n1_rand_forst_multilabel_train_recall, h1n1_rand_forst_multilabel_test_recall),
    "precision": (h1n1_rand_forst_multilabel_train_precision, h1n1_rand_forst_multilabel_test_precision),
    "f1": (h1n1_rand_forst_multilabel_train_f1, h1n1_rand_forst_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [81]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [82]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "svm_multilabel_h1n1"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [83]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: cefe0a77981840d7b3202b14e2292328


In [84]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [85]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (h1n1_svm_multilabel_train_roc, h1n1_svm_multilabel_test_roc),
    "accuracy": (h1n1_svm_multilabel_train_acc, h1n1_svm_multilabel_test_acc),
    "recall": (h1n1_svm_multilabel_train_recall, h1n1_svm_multilabel_test_recall),
    "precision": (h1n1_svm_multilabel_train_precision, h1n1_svm_multilabel_test_precision),
    "f1": (h1n1_svm_multilabel_train_f1, h1n1_svm_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [86]:
#mlflow.get_run(run_id=run.info.run_id)

### Seasonal vaccine output

#### Logistic regression

In [87]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "logreg_multilabel_seasonal"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [88]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: 6bcacfa13e25436d90c7eeb215d17529


In [89]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [90]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (seasonal_logreg_multilabel_train_roc, seasonal_logreg_multilabel_test_roc),
    "accuracy": (seasonal_logreg_multilabel_train_acc, seasonal_logreg_multilabel_test_acc),
    "recall": (seasonal_logreg_multilabel_train_recall, seasonal_logreg_multilabel_test_recall),
    "precision": (seasonal_logreg_multilabel_train_precision, seasonal_logreg_multilabel_test_precision),
    "f1": (seasonal_logreg_multilabel_train_f1, seasonal_logreg_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [91]:
#mlflow.get_run(run_id=run.info.run_id)

#### KNN

In [92]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "knn_multilabel_seasonal"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [93]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: 200f3615147a46fca7470af1d1993900


In [94]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [95]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (seasonal_knn_multilabel_train_roc, seasonal_knn_multilabel_test_roc),
    "accuracy": (seasonal_knn_multilabel_train_acc, seasonal_knn_multilabel_test_acc),
    "recall": (seasonal_knn_multilabel_train_recall, seasonal_knn_multilabel_test_recall),
    "precision": (seasonal_knn_multilabel_train_precision, seasonal_knn_multilabel_test_precision),
    "f1": (seasonal_knn_multilabel_train_f1, seasonal_knn_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [96]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [97]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "rand_forst_multilabel_seasonal"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [98]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: 5c97e669ffd24b27adefb7144d080d01


In [99]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [100]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (seasonal_rand_forst_multilabel_train_roc, seasonal_rand_forst_multilabel_test_roc),
    "accuracy": (seasonal_rand_forst_multilabel_train_acc, seasonal_rand_forst_multilabel_test_acc),
    "recall": (seasonal_rand_forst_multilabel_train_recall, seasonal_rand_forst_multilabel_test_recall),
    "precision": (seasonal_rand_forst_multilabel_train_precision, seasonal_rand_forst_multilabel_test_precision),
    "f1": (seasonal_rand_forst_multilabel_train_f1, seasonal_rand_forst_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [101]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [102]:
# Connect to the tracking server, select the experiment and open the run.
# The run stays open until mlflow.end_run() is called in the metrics-logging cell.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multilabel)
name = "svm_multilabel_seasonal"  # run name: model used, output type, vaccine
# start_run() returns the run it has just made active, so keep that handle.
run = mlflow.start_run(run_name=name)

In [103]:
# Show the id of the run that is currently open.
print(f"Active run_id: {run.info.run_id}")

Active run_id: f797b63dc0624a8ca34f6e919d7e166c


In [104]:
# Run parameters: everything needed to re-run the experiment (engineered features,
# model hyperparameters, ...). Adjust as needed for each run.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [105]:
# Record the parameters, the feature tag and every train/test metric, then close the run.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")  # tag: is one of the vaccines among the features?
# metric label -> (train value, test value); logged in this order, train before test
metric_pairs = {
    "ROC": (seasonal_svm_multilabel_train_roc, seasonal_svm_multilabel_test_roc),
    "accuracy": (seasonal_svm_multilabel_train_acc, seasonal_svm_multilabel_test_acc),
    "recall": (seasonal_svm_multilabel_train_recall, seasonal_svm_multilabel_test_recall),
    "precision": (seasonal_svm_multilabel_train_precision, seasonal_svm_multilabel_test_precision),
    "f1": (seasonal_svm_multilabel_train_f1, seasonal_svm_multilabel_test_f1),
}
for metric_label, (train_value, test_value) in metric_pairs.items():
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# No artifact or model is logged for this run (log_artifact / log_model are not called).
mlflow.end_run()

In [106]:
#mlflow.get_run(run_id=run.info.run_id)

# TheFluShot_H1N1: Single Label Modelling, output H1N1 vaccine

## Single Label Modelling, output H1N1 vaccine -> Seasonal Flu Vaccine not in features

The cat_features_no_vacc and X_no_vacc variables and the preprocessor remain the same from the multilabel modelling:

Set up the target variable:

In [107]:
# Target for the single-label models: an independent one-column frame holding h1n1_vaccine.
y_h1n1_vacc = df.loc[:, ['h1n1_vaccine']].copy()

In [108]:
# Convert the H1N1 target to a NumPy array of shape (n_samples, 1).
# It is built directly from `df` rather than by reassigning y_h1n1_vacc from itself,
# so re-running this cell no longer fails with AttributeError (an ndarray has no
# .to_numpy()). copy=True keeps the array independent of `df`, as the earlier
# .copy() did.
y_h1n1_vacc = df[['h1n1_vaccine']].to_numpy(copy=True)
y_h1n1_vacc

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

Performing test-train split:

In [109]:
# 80/20 train/test split for the H1N1-only target, stratified on the label and seeded with RSEED.
# NOTE(review): this (re)binds X_no_vacc_train / X_no_vacc_test. If the multilabel
# section used the same names for its own split, that split is overwritten here --
# confirm nothing downstream still expects the earlier one.
(
    X_no_vacc_train,
    X_no_vacc_test,
    y_h1n1_vacc_train,
    y_h1n1_vacc_test,
) = train_test_split(
    X_no_vacc,
    y_h1n1_vacc,
    test_size=0.2,
    random_state=RSEED,
    stratify=y_h1n1_vacc,
)

In [110]:
# Sanity check: report the shape of every piece of the split.
print(f"X_no_vacc_train shape: {X_no_vacc_train.shape}")
print(f"X_no_vacc_test shape: {X_no_vacc_test.shape}")
print(f"y_h1n1_vacc_train: {y_h1n1_vacc_train.shape}")
print(f"y_h1n1_vacc_test: {y_h1n1_vacc_test.shape}")

X_no_vacc_train shape: (17482, 29)
X_no_vacc_test shape: (4371, 29)
y_h1n1_vacc_train: (17482, 1)
y_h1n1_vacc_test: (4371, 1)


Setting up the pipeline for each model:

In [111]:
# One pipeline per classifier (logreg, KNN, random forest, SVM): the preprocessor
# followed by the estimator.
# NOTE(review): all four pipelines hold the *same* preprocessor instance, so fitting
# any one of them refits that shared object -- confirm this is acceptable, or clone
# it per pipeline.
# NOTE(review): `svm` is imported as the sklearn.svm module at the top of the
# notebook; presumably it is rebound to an SVC estimator before this cell -- verify.
(
    logreg_unilabel_pipeline,
    knn_unilabel_pipeline,
    rand_forst_unilabel_pipeline,
    svm_unilabel_pipeline,
) = (
    Pipeline([("preprocessor", preprocessor), ("estimators", estimator)])
    for estimator in (logreg, knn, rand_forst, svm)
)

Training the models:

In [112]:
# Fit each single-label pipeline on the H1N1 training split.
#
# y_h1n1_vacc_train is an (n_samples, 1) column vector (see the shape printout above).
# scikit-learn classifiers expect a 1-D target and otherwise emit a
# DataConversionWarning, which is hidden here because warnings are globally ignored.
# Passing the flattened view makes the expected shape explicit without changing
# the fitted models.
#
# Pipeline.fit returns the pipeline itself, so each *_no_vacc name is an alias of the
# corresponding, now fitted, *_pipeline object.

# for logreg
logreg_unilabel_no_vacc = logreg_unilabel_pipeline.fit(X_no_vacc_train, y_h1n1_vacc_train.ravel())

# for KNN
knn_unilabel_no_vacc = knn_unilabel_pipeline.fit(X_no_vacc_train, y_h1n1_vacc_train.ravel())

# for Random Forest
rand_forst_unilabel_no_vacc = rand_forst_unilabel_pipeline.fit(X_no_vacc_train, y_h1n1_vacc_train.ravel())

# for SVM
svm_unilabel_no_vacc = svm_unilabel_pipeline.fit(X_no_vacc_train, y_h1n1_vacc_train.ravel())

In [113]:
# TODO: decide whether cross-validated predictions (cross_val_predict, commented out below) should be used here.


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

Making predictions based on train and test data:

In [114]:
# Hard class predictions of every fitted single-label pipeline, on the training
# split first and the test split second.

# logistic regression
logreg_unilabel_no_vacc_trainpreds, logreg_unilabel_no_vacc_testpreds = (
    logreg_unilabel_no_vacc.predict(features) for features in (X_no_vacc_train, X_no_vacc_test)
)

# KNN
knn_unilabel_no_vacc_trainpreds, knn_unilabel_no_vacc_testpreds = (
    knn_unilabel_no_vacc.predict(features) for features in (X_no_vacc_train, X_no_vacc_test)
)

# random forest
rand_forst_unilabel_no_vacc_trainpreds, rand_forst_unilabel_no_vacc_testpreds = (
    rand_forst_unilabel_no_vacc.predict(features) for features in (X_no_vacc_train, X_no_vacc_test)
)

# SVM
svm_unilabel_no_vacc_trainpreds, svm_unilabel_no_vacc_testpreds = (
    svm_unilabel_no_vacc.predict(features) for features in (X_no_vacc_train, X_no_vacc_test)
)

### Model evaluation

#### Train data

In [115]:
# Logistic regression (single-label H1N1 model): training-split metrics.
# NOTE(review): the predictions come from .predict(), i.e. hard class labels, so
# roc_auc_score here reduces to balanced accuracy -- confirm that
# probabilities/decision scores were not intended.
(
    h1n1_logreg_unilabel_no_vacc_train_acc,
    h1n1_logreg_unilabel_no_vacc_train_recall,
    h1n1_logreg_unilabel_no_vacc_train_precision,
    h1n1_logreg_unilabel_no_vacc_train_f1,
    h1n1_logreg_unilabel_no_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, logreg_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [116]:
# KNN (single-label H1N1 model): training-split metrics.
# NOTE(review): the predictions come from .predict(), i.e. hard class labels, so
# roc_auc_score here reduces to balanced accuracy -- confirm that
# probabilities/decision scores were not intended.
(
    h1n1_knn_unilabel_no_vacc_train_acc,
    h1n1_knn_unilabel_no_vacc_train_recall,
    h1n1_knn_unilabel_no_vacc_train_precision,
    h1n1_knn_unilabel_no_vacc_train_f1,
    h1n1_knn_unilabel_no_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, knn_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [117]:
# Random forest (single-label H1N1 model): training-split metrics.
# TODO: check whether these metrics change compared to previous results.
# NOTE(review): the predictions come from .predict(), i.e. hard class labels, so
# roc_auc_score here reduces to balanced accuracy -- confirm that
# probabilities/decision scores were not intended.
(
    h1n1_rand_forst_unilabel_no_vacc_train_acc,
    h1n1_rand_forst_unilabel_no_vacc_train_recall,
    h1n1_rand_forst_unilabel_no_vacc_train_precision,
    h1n1_rand_forst_unilabel_no_vacc_train_f1,
    h1n1_rand_forst_unilabel_no_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, rand_forst_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [118]:
# SVM -- train-set evaluation metrics (H1N1 target, no vaccine columns in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): roc_auc_score gets the same *_trainpreds array as the other scorers; if it
# holds hard class labels (e.g. from .predict()), the AUC rests on a single threshold -- confirm.
(
    h1n1_svm_unilabel_no_vacc_train_acc,
    h1n1_svm_unilabel_no_vacc_train_recall,
    h1n1_svm_unilabel_no_vacc_train_precision,
    h1n1_svm_unilabel_no_vacc_train_f1,
    h1n1_svm_unilabel_no_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, svm_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#### Test data

In [119]:
# Logistic regression -- test-set evaluation metrics (H1N1 target, no vaccine columns in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): roc_auc_score gets the same *_testpreds array as the other scorers; if it
# holds hard class labels (e.g. from .predict()), the AUC rests on a single threshold -- confirm.
(
    h1n1_logreg_unilabel_no_vacc_test_acc,
    h1n1_logreg_unilabel_no_vacc_test_recall,
    h1n1_logreg_unilabel_no_vacc_test_precision,
    h1n1_logreg_unilabel_no_vacc_test_f1,
    h1n1_logreg_unilabel_no_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, logreg_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [120]:
# KNN -- test-set evaluation metrics (H1N1 target, no vaccine columns in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): roc_auc_score gets the same *_testpreds array as the other scorers; if it
# holds hard class labels (e.g. from .predict()), the AUC rests on a single threshold -- confirm.
(
    h1n1_knn_unilabel_no_vacc_test_acc,
    h1n1_knn_unilabel_no_vacc_test_recall,
    h1n1_knn_unilabel_no_vacc_test_precision,
    h1n1_knn_unilabel_no_vacc_test_f1,
    h1n1_knn_unilabel_no_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, knn_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [121]:
# Random forest -- test-set evaluation metrics (H1N1 target, no vaccine columns in the features).
# TODO: check whether these metrics change compared to previous results.
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): roc_auc_score gets the same *_testpreds array as the other scorers; if it
# holds hard class labels (e.g. from .predict()), the AUC rests on a single threshold -- confirm.
(
    h1n1_rand_forst_unilabel_no_vacc_test_acc,
    h1n1_rand_forst_unilabel_no_vacc_test_recall,
    h1n1_rand_forst_unilabel_no_vacc_test_precision,
    h1n1_rand_forst_unilabel_no_vacc_test_f1,
    h1n1_rand_forst_unilabel_no_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, rand_forst_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [122]:
# SVM -- test-set evaluation metrics (H1N1 target, no vaccine columns in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): roc_auc_score gets the same *_testpreds array as the other scorers; if it
# holds hard class labels (e.g. from .predict()), the AUC rests on a single threshold -- confirm.
(
    h1n1_svm_unilabel_no_vacc_test_acc,
    h1n1_svm_unilabel_no_vacc_test_recall,
    h1n1_svm_unilabel_no_vacc_test_precision,
    h1n1_svm_unilabel_no_vacc_test_f1,
    h1n1_svm_unilabel_no_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, svm_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Tracking the model with MLFlow

### H1N1 vaccine output

#### Logistic regression

In [123]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
# The experiment name has to be adjusted for each experiment.
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'logreg_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [124]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 34a92dec6ec84aa5850d4a1ff7f82709


In [125]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [126]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "None")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_logreg_unilabel_no_vacc_train_roc, h1n1_logreg_unilabel_no_vacc_test_roc),
    ("accuracy", h1n1_logreg_unilabel_no_vacc_train_acc, h1n1_logreg_unilabel_no_vacc_test_acc),
    ("recall", h1n1_logreg_unilabel_no_vacc_train_recall, h1n1_logreg_unilabel_no_vacc_test_recall),
    ("precision", h1n1_logreg_unilabel_no_vacc_train_precision, h1n1_logreg_unilabel_no_vacc_test_precision),
    ("f1", h1n1_logreg_unilabel_no_vacc_train_f1, h1n1_logreg_unilabel_no_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [127]:
#mlflow.get_run(run_id=run.info.run_id)

#### KNN

In [128]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'knn_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [129]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 504f12cbfe6146009549b907d0d7fc0f


In [130]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [131]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "None")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_knn_unilabel_no_vacc_train_roc, h1n1_knn_unilabel_no_vacc_test_roc),
    ("accuracy", h1n1_knn_unilabel_no_vacc_train_acc, h1n1_knn_unilabel_no_vacc_test_acc),
    ("recall", h1n1_knn_unilabel_no_vacc_train_recall, h1n1_knn_unilabel_no_vacc_test_recall),
    ("precision", h1n1_knn_unilabel_no_vacc_train_precision, h1n1_knn_unilabel_no_vacc_test_precision),
    ("f1", h1n1_knn_unilabel_no_vacc_train_f1, h1n1_knn_unilabel_no_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [132]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [133]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'rand_forst_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [134]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 9edf7207fd0a4a90996538735a4d6789


In [135]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [136]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "None")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_rand_forst_unilabel_no_vacc_train_roc, h1n1_rand_forst_unilabel_no_vacc_test_roc),
    ("accuracy", h1n1_rand_forst_unilabel_no_vacc_train_acc, h1n1_rand_forst_unilabel_no_vacc_test_acc),
    ("recall", h1n1_rand_forst_unilabel_no_vacc_train_recall, h1n1_rand_forst_unilabel_no_vacc_test_recall),
    ("precision", h1n1_rand_forst_unilabel_no_vacc_train_precision, h1n1_rand_forst_unilabel_no_vacc_test_precision),
    ("f1", h1n1_rand_forst_unilabel_no_vacc_train_f1, h1n1_rand_forst_unilabel_no_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [137]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [138]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'svm_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [139]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: d532f870cd69491d9dd87a50b0a1d70b


In [140]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [141]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "None")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_svm_unilabel_no_vacc_train_roc, h1n1_svm_unilabel_no_vacc_test_roc),
    ("accuracy", h1n1_svm_unilabel_no_vacc_train_acc, h1n1_svm_unilabel_no_vacc_test_acc),
    ("recall", h1n1_svm_unilabel_no_vacc_train_recall, h1n1_svm_unilabel_no_vacc_test_recall),
    ("precision", h1n1_svm_unilabel_no_vacc_train_precision, h1n1_svm_unilabel_no_vacc_test_precision),
    ("f1", h1n1_svm_unilabel_no_vacc_train_f1, h1n1_svm_unilabel_no_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [142]:
#mlflow.get_run(run_id=run.info.run_id)

## Single Label Modelling, output H1N1 vaccine -> Seasonal Flu Vaccine is in features

The target `y_h1n1_vacc` is unchanged from the previous model; the feature matrix X and the `cat_features` list (used by the preprocessor) need to be adjusted:

In [143]:
# Work on a copy so that the original cat_features is left untouched when the
# H1N1 column is removed from cat_features_seas_vacc in the next cell.
cat_features_seas_vacc = cat_features.copy()

In [144]:
# Take the 'h1n1_vaccine' column out of the categorical feature list.
# Filtering instead of calling .remove() keeps this cell safe to re-run: a second
# .remove('h1n1_vaccine') on the already-trimmed list would raise ValueError.
cat_features_seas_vacc = [feature for feature in cat_features_seas_vacc if feature != 'h1n1_vaccine']

In [145]:
# NB: only the 'h1n1_vaccine' column is dropped here; 'seasonal_vaccine' stays in X
# and is used as a feature in this section.
X_seas_vacc = df.drop(columns=['h1n1_vaccine'])

In [146]:
# Display the feature columns to verify that 'h1n1_vaccine' is gone and 'seasonal_vaccine' is kept.
X_seas_vacc.columns

Index(['seasonal_vaccine', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'rent_or_own', 'hhs_geo_region',
       'census_msa', 'household_adults', 'household_children'],
      dtype='object')

Performing the train-test split (the same split is reused for every model in this section):

In [147]:
# Stratified 80/20 split on the H1N1 target, seeded with RSEED.
seas_vacc_split = train_test_split(
    X_seas_vacc,
    y_h1n1_vacc,
    test_size=0.2,
    stratify=y_h1n1_vacc,
    random_state=RSEED,
)
# train_test_split returns the parts in the order: X train, X test, y train, y test.
X_seas_vacc_train, X_seas_vacc_test, y_h1n1_vacc_train, y_h1n1_vacc_test = seas_vacc_split

In [148]:
# Sanity check: print the shape of every part of the split.
for shape_label, split_part in (
    ('X_seas_vacc_train shape:', X_seas_vacc_train),
    ('X_seas_vacc_test shape:', X_seas_vacc_test),
    ('y_h1n1_vacc_train:', y_h1n1_vacc_train),
    ('y_h1n1_vacc_test:', y_h1n1_vacc_test),
):
    print(shape_label, split_part.shape)

X_seas_vacc_train shape: (17482, 30)
X_seas_vacc_test shape: (4371, 30)
y_h1n1_vacc_train: (17482, 1)
y_h1n1_vacc_test: (4371, 1)


Preprocessor is adjusted:

In [149]:
# Apply the categorical pipeline to the feature list that still contains 'seasonal_vaccine'.
seas_vacc_transformers = [('cat', cat_pipeline, cat_features_seas_vacc)]
preprocessor_seas_vacc = ColumnTransformer(transformers=seas_vacc_transformers)

Pipeline is adjusted:

In [155]:
# One pipeline per model: the seasonal-vaccine preprocessor followed by the estimator.
#
# Pipeline.fit() fits the step objects it was given in place (it does not copy them).
# Passing the shared `logreg` / `knn` / `rand_forst` / `svm` objects directly would
# therefore refit those very objects and overwrite their fitted state for any other
# pipeline in the notebook that holds the same object. clone() gives each pipeline
# its own unfitted copy with identical hyperparameters.
from sklearn.base import clone  # local import: `clone` is not among the notebook's top-level imports

# for logreg
logreg_seas_vacc_unilabel_pipeline = Pipeline([
    ("preprocessor", preprocessor_seas_vacc),
    ("estimators", clone(logreg)),
])

# for KNN
knn_seas_vacc_unilabel_pipeline = Pipeline([
    ("preprocessor", preprocessor_seas_vacc),
    ("estimators", clone(knn)),
])

# for Random Forest
rand_forst_seas_vacc_unilabel_pipeline = Pipeline([
    ("preprocessor", preprocessor_seas_vacc),
    ("estimators", clone(rand_forst)),
])

# for SVM
svm_seas_vacc_unilabel_pipeline = Pipeline([
    ("preprocessor", preprocessor_seas_vacc),
    ("estimators", clone(svm)),
])

Training the models:

In [156]:
# Fit every pipeline on the same training split; the value returned by fit() is kept
# under the model's own name.
seas_vacc_train_data = (X_seas_vacc_train, y_h1n1_vacc_train)

# logistic regression
logreg_unilabel_seas_vacc = logreg_seas_vacc_unilabel_pipeline.fit(*seas_vacc_train_data)

# KNN
knn_unilabel_seas_vacc = knn_seas_vacc_unilabel_pipeline.fit(*seas_vacc_train_data)

# random forest
rand_forst_unilabel_seas_vacc = rand_forst_seas_vacc_unilabel_pipeline.fit(*seas_vacc_train_data)

# SVM
svm_unilabel_seas_vacc = svm_seas_vacc_unilabel_pipeline.fit(*seas_vacc_train_data)

In [None]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

Making predictions based on train and test data:

In [157]:
# Predict on both splits with every fitted model; each pair below is (train, test).
seas_vacc_feature_splits = (X_seas_vacc_train, X_seas_vacc_test)

# logistic regression
logreg_unilabel_seas_vacc_trainpreds, logreg_unilabel_seas_vacc_testpreds = (
    logreg_unilabel_seas_vacc.predict(features) for features in seas_vacc_feature_splits
)

# KNN
knn_unilabel_seas_vacc_trainpreds, knn_unilabel_seas_vacc_testpreds = (
    knn_unilabel_seas_vacc.predict(features) for features in seas_vacc_feature_splits
)

# random forest
rand_forst_unilabel_seas_vacc_trainpreds, rand_forst_unilabel_seas_vacc_testpreds = (
    rand_forst_unilabel_seas_vacc.predict(features) for features in seas_vacc_feature_splits
)

# SVM
svm_unilabel_seas_vacc_trainpreds, svm_unilabel_seas_vacc_testpreds = (
    svm_unilabel_seas_vacc.predict(features) for features in seas_vacc_feature_splits
)

### Model evaluation

#### Train data

In [158]:
# Logistic regression -- train-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_logreg_unilabel_seas_vacc_train_acc,
    h1n1_logreg_unilabel_seas_vacc_train_recall,
    h1n1_logreg_unilabel_seas_vacc_train_precision,
    h1n1_logreg_unilabel_seas_vacc_train_f1,
    h1n1_logreg_unilabel_seas_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, logreg_unilabel_seas_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [159]:
# KNN -- train-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_knn_unilabel_seas_vacc_train_acc,
    h1n1_knn_unilabel_seas_vacc_train_recall,
    h1n1_knn_unilabel_seas_vacc_train_precision,
    h1n1_knn_unilabel_seas_vacc_train_f1,
    h1n1_knn_unilabel_seas_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, knn_unilabel_seas_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [160]:
# Random forest -- train-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# TODO: check whether these metrics change compared to previous results.
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_rand_forst_unilabel_seas_vacc_train_acc,
    h1n1_rand_forst_unilabel_seas_vacc_train_recall,
    h1n1_rand_forst_unilabel_seas_vacc_train_precision,
    h1n1_rand_forst_unilabel_seas_vacc_train_f1,
    h1n1_rand_forst_unilabel_seas_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, rand_forst_unilabel_seas_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [161]:
# SVM -- train-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_svm_unilabel_seas_vacc_train_acc,
    h1n1_svm_unilabel_seas_vacc_train_recall,
    h1n1_svm_unilabel_seas_vacc_train_precision,
    h1n1_svm_unilabel_seas_vacc_train_f1,
    h1n1_svm_unilabel_seas_vacc_train_roc,
) = (
    scorer(y_h1n1_vacc_train, svm_unilabel_seas_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#### Test data

In [162]:
# Logistic regression -- test-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_logreg_unilabel_seas_vacc_test_acc,
    h1n1_logreg_unilabel_seas_vacc_test_recall,
    h1n1_logreg_unilabel_seas_vacc_test_precision,
    h1n1_logreg_unilabel_seas_vacc_test_f1,
    h1n1_logreg_unilabel_seas_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, logreg_unilabel_seas_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [163]:
# KNN -- test-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_knn_unilabel_seas_vacc_test_acc,
    h1n1_knn_unilabel_seas_vacc_test_recall,
    h1n1_knn_unilabel_seas_vacc_test_precision,
    h1n1_knn_unilabel_seas_vacc_test_f1,
    h1n1_knn_unilabel_seas_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, knn_unilabel_seas_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [164]:
# Random forest -- test-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# TODO: check whether these metrics change compared to previous results.
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_rand_forst_unilabel_seas_vacc_test_acc,
    h1n1_rand_forst_unilabel_seas_vacc_test_recall,
    h1n1_rand_forst_unilabel_seas_vacc_test_precision,
    h1n1_rand_forst_unilabel_seas_vacc_test_f1,
    h1n1_rand_forst_unilabel_seas_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, rand_forst_unilabel_seas_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [165]:
# SVM -- test-set evaluation metrics (H1N1 target, seasonal vaccine in the features).
# Each scorer is called with the same (true labels, predictions) pair; the scorers are
# listed in the same order as the variables they fill.
# NOTE(review): the predictions come from .predict(), i.e. class labels rather than scores,
# so this ROC AUC is computed from a single operating point -- confirm this is intended.
(
    h1n1_svm_unilabel_seas_vacc_test_acc,
    h1n1_svm_unilabel_seas_vacc_test_recall,
    h1n1_svm_unilabel_seas_vacc_test_precision,
    h1n1_svm_unilabel_seas_vacc_test_f1,
    h1n1_svm_unilabel_seas_vacc_test_roc,
) = (
    scorer(y_h1n1_vacc_test, svm_unilabel_seas_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Tracking the model with MLFlow

### H1N1 vaccine output

#### Logistic regression

In [166]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
# The experiment name has to be adjusted for each experiment.
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'logreg_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [167]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 0f5307b0a6504ac8b0ae80e1427892e0


In [168]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [169]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "Seasonal")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_logreg_unilabel_seas_vacc_train_roc, h1n1_logreg_unilabel_seas_vacc_test_roc),
    ("accuracy", h1n1_logreg_unilabel_seas_vacc_train_acc, h1n1_logreg_unilabel_seas_vacc_test_acc),
    ("recall", h1n1_logreg_unilabel_seas_vacc_train_recall, h1n1_logreg_unilabel_seas_vacc_test_recall),
    ("precision", h1n1_logreg_unilabel_seas_vacc_train_precision, h1n1_logreg_unilabel_seas_vacc_test_precision),
    ("f1", h1n1_logreg_unilabel_seas_vacc_train_f1, h1n1_logreg_unilabel_seas_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [170]:
#mlflow.get_run(run_id=run.info.run_id)

#### KNN

In [171]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'knn_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [172]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 8625e38c36be4f639df526499c91f974


In [173]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [174]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "Seasonal")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_knn_unilabel_seas_vacc_train_roc, h1n1_knn_unilabel_seas_vacc_test_roc),
    ("accuracy", h1n1_knn_unilabel_seas_vacc_train_acc, h1n1_knn_unilabel_seas_vacc_test_acc),
    ("recall", h1n1_knn_unilabel_seas_vacc_train_recall, h1n1_knn_unilabel_seas_vacc_test_recall),
    ("precision", h1n1_knn_unilabel_seas_vacc_train_precision, h1n1_knn_unilabel_seas_vacc_test_precision),
    ("f1", h1n1_knn_unilabel_seas_vacc_train_f1, h1n1_knn_unilabel_seas_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [175]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [176]:
# Connect to the MLflow tracking server, select the H1N1 experiment and open a new run.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_h1n1)
# Run name: name it after the model used.
name = 'rand_forst_unilabel_h1n1'
# start_run() hands back the run it has just made active, so keep that handle directly.
run = mlflow.start_run(run_name=name)

In [177]:
# Show the id of the run opened above so it can be looked up in the MLflow UI.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: d0dadf7a5b7f4b748a9889977b2cb503


In [178]:
# Run parameters: everything needed to re-run the experiment (feature engineering,
# model hyperparameters, etc.). Adjust as needed.
params = dict(
    [
        ("Data cleaning", "Drop all nulls"),
        ("Data balancing", "None"),
        ("Hyperparameters", "None"),
    ]
)

In [179]:
# Log the run's parameters, the feature tag and all train/test metrics, then close the run.
mlflow.log_params(params)
# Tag = data used: which vaccine columns were part of the features.
mlflow.set_tag("Vaccines in features", "Seasonal")
# One row per metric: (metric label, train value, test value).
metric_rows = (
    ("ROC", h1n1_rand_forst_unilabel_seas_vacc_train_roc, h1n1_rand_forst_unilabel_seas_vacc_test_roc),
    ("accuracy", h1n1_rand_forst_unilabel_seas_vacc_train_acc, h1n1_rand_forst_unilabel_seas_vacc_test_acc),
    ("recall", h1n1_rand_forst_unilabel_seas_vacc_train_recall, h1n1_rand_forst_unilabel_seas_vacc_test_recall),
    ("precision", h1n1_rand_forst_unilabel_seas_vacc_train_precision, h1n1_rand_forst_unilabel_seas_vacc_test_precision),
    ("f1", h1n1_rand_forst_unilabel_seas_vacc_train_f1, h1n1_rand_forst_unilabel_seas_vacc_test_f1),
)
for metric_label, train_value, test_value in metric_rows:
    mlflow.log_metric("train -" + metric_label, train_value)
    mlflow.log_metric("test -" + metric_label, test_value)
# Artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [180]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [181]:
# Connect to the tracking server and select the H1N1 experiment.
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME_h1n1)

# Run name: model used + output type + target vaccine.
name = 'svm_unilabel_h1n1'

# Open the run and keep a handle to it for the cells below.
mlflow.start_run(run_name=name)
run = mlflow.active_run()

In [182]:
# Show the id of the MLflow run currently held in `run`.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

Active run_id: 441de9f5927b40bba0a64ca3c409a1d5


In [183]:
# Run parameters: everything needed to re-run the experiment (data cleaning,
# balancing, feature engineering, model hyperparameters). Adjust as needed.
# Note: "None" is the literal string, not Python's None.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [184]:
# Record the run configuration; the tag describes which vaccine column is among the features.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "Seasonal")

# (label, train value, test value) for every metric; logged train-then-test per metric.
_metric_rows = (
    ("ROC", h1n1_svm_unilabel_seas_vacc_train_roc, h1n1_svm_unilabel_seas_vacc_test_roc),
    ("accuracy", h1n1_svm_unilabel_seas_vacc_train_acc, h1n1_svm_unilabel_seas_vacc_test_acc),
    ("recall", h1n1_svm_unilabel_seas_vacc_train_recall, h1n1_svm_unilabel_seas_vacc_test_recall),
    ("precision", h1n1_svm_unilabel_seas_vacc_train_precision, h1n1_svm_unilabel_seas_vacc_test_precision),
    ("f1", h1n1_svm_unilabel_seas_vacc_train_f1, h1n1_svm_unilabel_seas_vacc_test_f1),
)
for _metric_label, _train_value, _test_value in _metric_rows:
    mlflow.log_metric("train -" + _metric_label, _train_value)
    mlflow.log_metric("test -" + _metric_label, _test_value)

# Artifact / model logging is currently disabled.
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")

# Close the run opened in the setup cell.
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

# TheFluShot_seasonal: Single Label Modelling, output seasonal vaccine

## Single Label Modelling, output seasonal vaccine -> H1N1 Flu Vaccine not in features

The cat_features_no_vacc and X_no_vacc variables and the preprocessor remain the same from the multilabel modelling:

Set up the target variable:

In [None]:
# Target: an independent single-column copy of the seasonal vaccine label.
y_seas_vacc = df.loc[:, ['seasonal_vaccine']].copy()

In [None]:
# Convert the target to a flat 1-D NumPy label vector.
# - np.asarray accepts both the DataFrame and an already-converted array, so this
#   cell can be re-run safely (calling .to_numpy() on an ndarray raises AttributeError).
# - .ravel() turns the (n, 1) column into shape (n,), the form scikit-learn
#   classifiers expect for a single-output target.
y_seas_vacc = np.asarray(y_seas_vacc).ravel()
y_seas_vacc

Performing test-train split:

In [None]:
# Stratified 80/20 split on the seasonal vaccine target, seeded with RSEED.
(
    X_no_vacc_train,
    X_no_vacc_test,
    y_seas_vacc_train,
    y_seas_vacc_test,
) = train_test_split(
    X_no_vacc,
    y_seas_vacc,
    test_size=0.2,
    random_state=RSEED,
    stratify=y_seas_vacc,
)

In [None]:
# Sanity check: report the shape of each split.
for _label, _split in (
    ('X_no_vacc_train shape:', X_no_vacc_train),
    ('X_no_vacc_test shape:', X_no_vacc_test),
    ('y_seas_vacc_train:', y_seas_vacc_train),
    ('y_seas_vacc_test:', y_seas_vacc_test),
):
    print(_label, _split.shape)

Setting up the pipeline for each model:

In [None]:
def _unilabel_pipeline(estimator):
    """Wrap `estimator` behind the shared `preprocessor` in a two-step Pipeline.

    The same `preprocessor` instance is used by every pipeline built here.
    """
    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("estimators", estimator),
    ])


# One pipeline per model type.
logreg_unilabel_pipeline = _unilabel_pipeline(logreg)          # logistic regression
knn_unilabel_pipeline = _unilabel_pipeline(knn)                # KNN
rand_forst_unilabel_pipeline = _unilabel_pipeline(rand_forst)  # random forest
svm_unilabel_pipeline = _unilabel_pipeline(svm)                # SVM

Training the models:

In [None]:
# Fit each pipeline on the features without any vaccine column.
# Pipeline.fit returns the pipeline itself, so every *_no_vacc name below refers
# to the same object as its *_pipeline counterpart.

# Logistic regression
logreg_unilabel_pipeline.fit(X_no_vacc_train, y_seas_vacc_train)
logreg_unilabel_no_vacc = logreg_unilabel_pipeline

# KNN
knn_unilabel_pipeline.fit(X_no_vacc_train, y_seas_vacc_train)
knn_unilabel_no_vacc = knn_unilabel_pipeline

# Random forest
rand_forst_unilabel_pipeline.fit(X_no_vacc_train, y_seas_vacc_train)
rand_forst_unilabel_no_vacc = rand_forst_unilabel_pipeline

# SVM
svm_unilabel_pipeline.fit(X_no_vacc_train, y_seas_vacc_train)
svm_unilabel_no_vacc = svm_unilabel_pipeline

In [None]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

Making predictions based on train and test data:

In [None]:
# Class predictions on the train split, then the test split, for every model.

# Logistic regression
logreg_unilabel_no_vacc_trainpreds, logreg_unilabel_no_vacc_testpreds = (
    logreg_unilabel_no_vacc.predict(features)
    for features in (X_no_vacc_train, X_no_vacc_test)
)

# KNN
knn_unilabel_no_vacc_trainpreds, knn_unilabel_no_vacc_testpreds = (
    knn_unilabel_no_vacc.predict(features)
    for features in (X_no_vacc_train, X_no_vacc_test)
)

# Random forest
rand_forst_unilabel_no_vacc_trainpreds, rand_forst_unilabel_no_vacc_testpreds = (
    rand_forst_unilabel_no_vacc.predict(features)
    for features in (X_no_vacc_train, X_no_vacc_test)
)

# SVM
svm_unilabel_no_vacc_trainpreds, svm_unilabel_no_vacc_testpreds = (
    svm_unilabel_no_vacc.predict(features)
    for features in (X_no_vacc_train, X_no_vacc_test)
)

### Model evaluation

#### Train data

In [None]:
# Logistic regression: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_logreg_unilabel_no_vacc_train_acc,
    seas_logreg_unilabel_no_vacc_train_recall,
    seas_logreg_unilabel_no_vacc_train_precision,
    seas_logreg_unilabel_no_vacc_train_f1,
    seas_logreg_unilabel_no_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, logreg_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# KNN: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_knn_unilabel_no_vacc_train_acc,
    seas_knn_unilabel_no_vacc_train_recall,
    seas_knn_unilabel_no_vacc_train_precision,
    seas_knn_unilabel_no_vacc_train_f1,
    seas_knn_unilabel_no_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, knn_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# Random forest: evaluation metrics on the training split.
# TODO: check whether these metrics change compared to previous results.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_rand_forst_unilabel_no_vacc_train_acc,
    seas_rand_forst_unilabel_no_vacc_train_recall,
    seas_rand_forst_unilabel_no_vacc_train_precision,
    seas_rand_forst_unilabel_no_vacc_train_f1,
    seas_rand_forst_unilabel_no_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, rand_forst_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# SVM: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_svm_unilabel_no_vacc_train_acc,
    seas_svm_unilabel_no_vacc_train_recall,
    seas_svm_unilabel_no_vacc_train_precision,
    seas_svm_unilabel_no_vacc_train_f1,
    seas_svm_unilabel_no_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, svm_unilabel_no_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#### Test data

In [None]:
# Logistic regression: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_logreg_unilabel_no_vacc_test_acc,
    seas_logreg_unilabel_no_vacc_test_recall,
    seas_logreg_unilabel_no_vacc_test_precision,
    seas_logreg_unilabel_no_vacc_test_f1,
    seas_logreg_unilabel_no_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, logreg_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# KNN: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_knn_unilabel_no_vacc_test_acc,
    seas_knn_unilabel_no_vacc_test_recall,
    seas_knn_unilabel_no_vacc_test_precision,
    seas_knn_unilabel_no_vacc_test_f1,
    seas_knn_unilabel_no_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, knn_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# Random forest: evaluation metrics on the test split.
# TODO: check whether these metrics change compared to previous results.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_rand_forst_unilabel_no_vacc_test_acc,
    seas_rand_forst_unilabel_no_vacc_test_recall,
    seas_rand_forst_unilabel_no_vacc_test_precision,
    seas_rand_forst_unilabel_no_vacc_test_f1,
    seas_rand_forst_unilabel_no_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, rand_forst_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# SVM: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_svm_unilabel_no_vacc_test_acc,
    seas_svm_unilabel_no_vacc_test_recall,
    seas_svm_unilabel_no_vacc_test_precision,
    seas_svm_unilabel_no_vacc_test_f1,
    seas_svm_unilabel_no_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, svm_unilabel_no_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Tracking the model with MLFlow

### Seasonal vaccine output

#### Logistic regression

In [None]:
# Connect to the tracking server and select the seasonal-vaccine experiment
# (the experiment must be adjusted for each target).
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME_seasonal)

# Run name: model used + output type + target vaccine.
name = 'logreg_unilabel_seasonal'

# Open the run and keep a handle to it for the cells below.
mlflow.start_run(run_name=name)
run = mlflow.active_run()

In [None]:
# Show the id of the MLflow run currently held in `run`.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Run parameters: everything needed to re-run the experiment (data cleaning,
# balancing, feature engineering, model hyperparameters). Adjust as needed.
# Note: "None" is the literal string, not Python's None.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
# Record the run configuration; the tag describes which vaccine column is among the features.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")

# (label, train value, test value) for every metric; logged train-then-test per metric.
_metric_rows = (
    ("ROC", seas_logreg_unilabel_no_vacc_train_roc, seas_logreg_unilabel_no_vacc_test_roc),
    ("accuracy", seas_logreg_unilabel_no_vacc_train_acc, seas_logreg_unilabel_no_vacc_test_acc),
    ("recall", seas_logreg_unilabel_no_vacc_train_recall, seas_logreg_unilabel_no_vacc_test_recall),
    ("precision", seas_logreg_unilabel_no_vacc_train_precision, seas_logreg_unilabel_no_vacc_test_precision),
    ("f1", seas_logreg_unilabel_no_vacc_train_f1, seas_logreg_unilabel_no_vacc_test_f1),
)
for _metric_label, _train_value, _test_value in _metric_rows:
    mlflow.log_metric("train -" + _metric_label, _train_value)
    mlflow.log_metric("test -" + _metric_label, _test_value)

# Artifact / model logging is currently disabled.
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")

# Close the run opened in the setup cell.
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### KNN

In [None]:
# Connect to the tracking server and select the seasonal-vaccine experiment.
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME_seasonal)

# Run name: model used + output type + target vaccine.
name = 'knn_unilabel_seasonal'

# Open the run and keep a handle to it for the cells below.
mlflow.start_run(run_name=name)
run = mlflow.active_run()

In [None]:
# Show the id of the MLflow run currently held in `run`.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Run parameters: everything needed to re-run the experiment (data cleaning,
# balancing, feature engineering, model hyperparameters). Adjust as needed.
# Note: "None" is the literal string, not Python's None.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
# Record the run configuration; the tag describes which vaccine column is among the features.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")

# (label, train value, test value) for every metric; logged train-then-test per metric.
_metric_rows = (
    ("ROC", seas_knn_unilabel_no_vacc_train_roc, seas_knn_unilabel_no_vacc_test_roc),
    ("accuracy", seas_knn_unilabel_no_vacc_train_acc, seas_knn_unilabel_no_vacc_test_acc),
    ("recall", seas_knn_unilabel_no_vacc_train_recall, seas_knn_unilabel_no_vacc_test_recall),
    ("precision", seas_knn_unilabel_no_vacc_train_precision, seas_knn_unilabel_no_vacc_test_precision),
    ("f1", seas_knn_unilabel_no_vacc_train_f1, seas_knn_unilabel_no_vacc_test_f1),
)
for _metric_label, _train_value, _test_value in _metric_rows:
    mlflow.log_metric("train -" + _metric_label, _train_value)
    mlflow.log_metric("test -" + _metric_label, _test_value)

# Artifact / model logging is currently disabled.
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")

# Close the run opened in the setup cell.
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [None]:
# Connect to the tracking server and select the seasonal-vaccine experiment.
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME_seasonal)

# Run name: model used + output type + target vaccine.
name = 'rand_forst_unilabel_seasonal'

# Open the run and keep a handle to it for the cells below.
mlflow.start_run(run_name=name)
run = mlflow.active_run()

In [None]:
# Show the id of the MLflow run currently held in `run`.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Run parameters: everything needed to re-run the experiment (data cleaning,
# balancing, feature engineering, model hyperparameters). Adjust as needed.
# Note: "None" is the literal string, not Python's None.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
# Record the run configuration; the tag describes which vaccine column is among the features.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")

# (label, train value, test value) for every metric; logged train-then-test per metric.
_metric_rows = (
    ("ROC", seas_rand_forst_unilabel_no_vacc_train_roc, seas_rand_forst_unilabel_no_vacc_test_roc),
    ("accuracy", seas_rand_forst_unilabel_no_vacc_train_acc, seas_rand_forst_unilabel_no_vacc_test_acc),
    ("recall", seas_rand_forst_unilabel_no_vacc_train_recall, seas_rand_forst_unilabel_no_vacc_test_recall),
    ("precision", seas_rand_forst_unilabel_no_vacc_train_precision, seas_rand_forst_unilabel_no_vacc_test_precision),
    ("f1", seas_rand_forst_unilabel_no_vacc_train_f1, seas_rand_forst_unilabel_no_vacc_test_f1),
)
for _metric_label, _train_value, _test_value in _metric_rows:
    mlflow.log_metric("train -" + _metric_label, _train_value)
    mlflow.log_metric("test -" + _metric_label, _test_value)

# Artifact / model logging is currently disabled.
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")

# Close the run opened in the setup cell.
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [None]:
# Connect to the tracking server and select the seasonal-vaccine experiment.
mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME_seasonal)

# Run name: model used + output type + target vaccine.
name = 'svm_unilabel_seasonal'

# Open the run and keep a handle to it for the cells below.
mlflow.start_run(run_name=name)
run = mlflow.active_run()

In [None]:
# Show the id of the MLflow run currently held in `run`.
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Run parameters: everything needed to re-run the experiment (data cleaning,
# balancing, feature engineering, model hyperparameters). Adjust as needed.
# Note: "None" is the literal string, not Python's None.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
# Record the run configuration; the tag describes which vaccine column is among the features.
mlflow.log_params(params)
mlflow.set_tag("Vaccines in features", "None")

# (label, train value, test value) for every metric; logged train-then-test per metric.
_metric_rows = (
    ("ROC", seas_svm_unilabel_no_vacc_train_roc, seas_svm_unilabel_no_vacc_test_roc),
    ("accuracy", seas_svm_unilabel_no_vacc_train_acc, seas_svm_unilabel_no_vacc_test_acc),
    ("recall", seas_svm_unilabel_no_vacc_train_recall, seas_svm_unilabel_no_vacc_test_recall),
    ("precision", seas_svm_unilabel_no_vacc_train_precision, seas_svm_unilabel_no_vacc_test_precision),
    ("f1", seas_svm_unilabel_no_vacc_train_f1, seas_svm_unilabel_no_vacc_test_f1),
)
for _metric_label, _train_value, _test_value in _metric_rows:
    mlflow.log_metric("train -" + _metric_label, _train_value)
    mlflow.log_metric("test -" + _metric_label, _test_value)

# Artifact / model logging is currently disabled.
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")

# Close the run opened in the setup cell.
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

## Single Label Modelling, output seasonal vaccine -> H1N1 Flu Vaccine is in features

The y_seas_vacc remains the same from the previous model; the X feature and cat_features (for the preprocessor) need to be adjusted:

In [None]:
# Start from a copy of cat_features so the removal in the following cell
# does not modify the original cat_features object.
cat_features_h1n1_vacc = cat_features.copy()

In [None]:
# Remove the target column name ('seasonal_vaccine') from the copied feature names, in place.
# NOTE(review): this cell is not re-runnable on its own. Presumably
# cat_features_h1n1_vacc is a list, in which case a second .remove() raises
# ValueError; re-run the copy cell first.
cat_features_h1n1_vacc.remove('seasonal_vaccine')

In [None]:
# NB: only the target column 'seasonal_vaccine' is dropped here. Every other
# column of df is kept, which per this section's heading includes 'h1n1_vaccine'.
X_h1n1_vacc = df.drop(columns=['seasonal_vaccine'])

Performing the train-test split (the same split is reused for every model below):

In [None]:
# Stratified 80/20 split on the seasonal vaccine target, seeded with RSEED.
# Note: this rebinds y_seas_vacc_train / y_seas_vacc_test.
(
    X_h1n1_vacc_train,
    X_h1n1_vacc_test,
    y_seas_vacc_train,
    y_seas_vacc_test,
) = train_test_split(
    X_h1n1_vacc,
    y_seas_vacc,
    test_size=0.2,
    random_state=RSEED,
    stratify=y_seas_vacc,
)

In [None]:
# Sanity check: report the shape of each split.
for _label, _split in (
    ('X_h1n1_vacc_train shape:', X_h1n1_vacc_train),
    ('X_h1n1_vacc_test shape:', X_h1n1_vacc_test),
    ('y_seas_vacc_train:', y_seas_vacc_train),
    ('y_seas_vacc_test:', y_seas_vacc_test),
):
    print(_label, _split.shape)

The pipeline stays the same as the model above

Training the models:

In [None]:
from sklearn.base import clone

# Fit an independent, unfitted copy of each pipeline on the feature set that
# keeps the H1N1 vaccine column.
# Why clone: Pipeline.fit() returns the pipeline itself. Fitting the shared
# *_unilabel_pipeline objects directly would bind the *_h1n1_vacc names to the
# same objects already referenced by the *_no_vacc models and silently
# overwrite those fitted models.
# NOTE(review): the cloned pipelines still use the existing preprocessor
# configuration; cat_features_h1n1_vacc is not passed to it here. Confirm the
# preprocessor actually lets 'h1n1_vaccine' through to the estimators.

# for logreg
logreg_unilabel_h1n1_vacc = clone(logreg_unilabel_pipeline).fit(X_h1n1_vacc_train, y_seas_vacc_train)

# for KNN
knn_unilabel_h1n1_vacc = clone(knn_unilabel_pipeline).fit(X_h1n1_vacc_train, y_seas_vacc_train)

# for Random Forest
rand_forst_unilabel_h1n1_vacc = clone(rand_forst_unilabel_pipeline).fit(X_h1n1_vacc_train, y_seas_vacc_train)

# for SVM
svm_unilabel_h1n1_vacc = clone(svm_unilabel_pipeline).fit(X_h1n1_vacc_train, y_seas_vacc_train)

In [None]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

Making predictions based on train and test data:

In [None]:
# Class predictions on the train split, then the test split, for every model.

# Logistic regression
logreg_unilabel_h1n1_vacc_trainpreds, logreg_unilabel_h1n1_vacc_testpreds = (
    logreg_unilabel_h1n1_vacc.predict(features)
    for features in (X_h1n1_vacc_train, X_h1n1_vacc_test)
)

# KNN
knn_unilabel_h1n1_vacc_trainpreds, knn_unilabel_h1n1_vacc_testpreds = (
    knn_unilabel_h1n1_vacc.predict(features)
    for features in (X_h1n1_vacc_train, X_h1n1_vacc_test)
)

# Random forest
rand_forst_unilabel_h1n1_vacc_trainpreds, rand_forst_unilabel_h1n1_vacc_testpreds = (
    rand_forst_unilabel_h1n1_vacc.predict(features)
    for features in (X_h1n1_vacc_train, X_h1n1_vacc_test)
)

# SVM
svm_unilabel_h1n1_vacc_trainpreds, svm_unilabel_h1n1_vacc_testpreds = (
    svm_unilabel_h1n1_vacc.predict(features)
    for features in (X_h1n1_vacc_train, X_h1n1_vacc_test)
)

### Model evaluation

#### Train data

In [None]:
# Logistic regression: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_logreg_unilabel_h1n1_vacc_train_acc,
    seas_logreg_unilabel_h1n1_vacc_train_recall,
    seas_logreg_unilabel_h1n1_vacc_train_precision,
    seas_logreg_unilabel_h1n1_vacc_train_f1,
    seas_logreg_unilabel_h1n1_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, logreg_unilabel_h1n1_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# KNN: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_knn_unilabel_h1n1_vacc_train_acc,
    seas_knn_unilabel_h1n1_vacc_train_recall,
    seas_knn_unilabel_h1n1_vacc_train_precision,
    seas_knn_unilabel_h1n1_vacc_train_f1,
    seas_knn_unilabel_h1n1_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, knn_unilabel_h1n1_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# Random forest: evaluation metrics on the training split.
# TODO: check whether these metrics change compared to previous results.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_rand_forst_unilabel_h1n1_vacc_train_acc,
    seas_rand_forst_unilabel_h1n1_vacc_train_recall,
    seas_rand_forst_unilabel_h1n1_vacc_train_precision,
    seas_rand_forst_unilabel_h1n1_vacc_train_f1,
    seas_rand_forst_unilabel_h1n1_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, rand_forst_unilabel_h1n1_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# SVM: evaluation metrics on the training split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_svm_unilabel_h1n1_vacc_train_acc,
    seas_svm_unilabel_h1n1_vacc_train_recall,
    seas_svm_unilabel_h1n1_vacc_train_precision,
    seas_svm_unilabel_h1n1_vacc_train_f1,
    seas_svm_unilabel_h1n1_vacc_train_roc,
) = (
    scorer(y_seas_vacc_train, svm_unilabel_h1n1_vacc_trainpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

#### Test data

In [None]:
# Logistic regression: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_logreg_unilabel_h1n1_vacc_test_acc,
    seas_logreg_unilabel_h1n1_vacc_test_recall,
    seas_logreg_unilabel_h1n1_vacc_test_precision,
    seas_logreg_unilabel_h1n1_vacc_test_f1,
    seas_logreg_unilabel_h1n1_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, logreg_unilabel_h1n1_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# KNN: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_knn_unilabel_h1n1_vacc_test_acc,
    seas_knn_unilabel_h1n1_vacc_test_recall,
    seas_knn_unilabel_h1n1_vacc_test_precision,
    seas_knn_unilabel_h1n1_vacc_test_f1,
    seas_knn_unilabel_h1n1_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, knn_unilabel_h1n1_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# Random forest: evaluation metrics on the test split.
# TODO: check whether these metrics change compared to previous results.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_rand_forst_unilabel_h1n1_vacc_test_acc,
    seas_rand_forst_unilabel_h1n1_vacc_test_recall,
    seas_rand_forst_unilabel_h1n1_vacc_test_precision,
    seas_rand_forst_unilabel_h1n1_vacc_test_f1,
    seas_rand_forst_unilabel_h1n1_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, rand_forst_unilabel_h1n1_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

In [None]:
# SVM: evaluation metrics on the test split.
# Scorers run in the order accuracy, recall, precision, F1, ROC AUC.
# NOTE(review): roc_auc_score receives the hard labels from .predict(), not
# probabilities or decision scores; confirm this is the intended ROC AUC.
(
    seas_svm_unilabel_h1n1_vacc_test_acc,
    seas_svm_unilabel_h1n1_vacc_test_recall,
    seas_svm_unilabel_h1n1_vacc_test_precision,
    seas_svm_unilabel_h1n1_vacc_test_f1,
    seas_svm_unilabel_h1n1_vacc_test_roc,
) = (
    scorer(y_seas_vacc_test, svm_unilabel_h1n1_vacc_testpreds)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score, roc_auc_score)
)

## Tracking the model with MLFlow

### Seasonal vaccine output (H1N1 vaccine in features)

#### Logistic regression

In [None]:
# Point MLflow at the tracking server and pick the experiment for this run
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_seasonal)  # adjust for each experiment
# run name: name it after the model used
name = 'logreg_unilabel_seasonal'
# start_run hands back the run it has just made active, so keep that handle directly
run = mlflow.start_run(run_name=name)

In [None]:
# Show the id of the run opened in the previous cell
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Parameters record everything needed to re-run the experiment
# (engineered features, model hyperparameters, ...); adjust as needed.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
mlflow.log_params(params)
# tag = data used: which vaccine column was part of the features
mlflow.set_tag("Vaccines in features", "H1N1")
# (metric suffix, train score, test score); logged as train/test pairs in this order
logreg_seasonal_scores = (
    ("ROC", seas_logreg_unilabel_h1n1_vacc_train_roc, seas_logreg_unilabel_h1n1_vacc_test_roc),
    ("accuracy", seas_logreg_unilabel_h1n1_vacc_train_acc, seas_logreg_unilabel_h1n1_vacc_test_acc),
    ("recall", seas_logreg_unilabel_h1n1_vacc_train_recall, seas_logreg_unilabel_h1n1_vacc_test_recall),
    ("precision", seas_logreg_unilabel_h1n1_vacc_train_precision, seas_logreg_unilabel_h1n1_vacc_test_precision),
    ("f1", seas_logreg_unilabel_h1n1_vacc_train_f1, seas_logreg_unilabel_h1n1_vacc_test_f1),
)
for metric_suffix, train_score, test_score in logreg_seasonal_scores:
    mlflow.log_metric("train -" + metric_suffix, train_score)
    mlflow.log_metric("test -" + metric_suffix, test_score)
# artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### KNN

In [None]:
# Point MLflow at the tracking server and pick the experiment for this run
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_seasonal)
# run name: name it after the model used
name = 'knn_unilabel_seasonal'
# start_run hands back the run it has just made active, so keep that handle directly
run = mlflow.start_run(run_name=name)

In [None]:
# Show the id of the run opened in the previous cell
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Parameters record everything needed to re-run the experiment
# (engineered features, model hyperparameters, ...); adjust as needed.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
mlflow.log_params(params)
# tag = data used: which vaccine column was part of the features
mlflow.set_tag("Vaccines in features", "H1N1")
# (metric suffix, train score, test score); logged as train/test pairs in this order
knn_seasonal_scores = (
    ("ROC", seas_knn_unilabel_h1n1_vacc_train_roc, seas_knn_unilabel_h1n1_vacc_test_roc),
    ("accuracy", seas_knn_unilabel_h1n1_vacc_train_acc, seas_knn_unilabel_h1n1_vacc_test_acc),
    ("recall", seas_knn_unilabel_h1n1_vacc_train_recall, seas_knn_unilabel_h1n1_vacc_test_recall),
    ("precision", seas_knn_unilabel_h1n1_vacc_train_precision, seas_knn_unilabel_h1n1_vacc_test_precision),
    ("f1", seas_knn_unilabel_h1n1_vacc_train_f1, seas_knn_unilabel_h1n1_vacc_test_f1),
)
for metric_suffix, train_score, test_score in knn_seasonal_scores:
    mlflow.log_metric("train -" + metric_suffix, train_score)
    mlflow.log_metric("test -" + metric_suffix, test_score)
# artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### Random forest

In [None]:
# Point MLflow at the tracking server and pick the experiment for this run
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_seasonal)
# run name: name it after the model used
name = 'rand_forst_unilabel_seasonal'
# start_run hands back the run it has just made active, so keep that handle directly
run = mlflow.start_run(run_name=name)

In [None]:
# Show the id of the run opened in the previous cell
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Parameters record everything needed to re-run the experiment
# (engineered features, model hyperparameters, ...); adjust as needed.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
mlflow.log_params(params)
# tag = data used: which vaccine column was part of the features
mlflow.set_tag("Vaccines in features", "H1N1")
# (metric suffix, train score, test score); logged as train/test pairs in this order
rand_forst_seasonal_scores = (
    ("ROC", seas_rand_forst_unilabel_h1n1_vacc_train_roc, seas_rand_forst_unilabel_h1n1_vacc_test_roc),
    ("accuracy", seas_rand_forst_unilabel_h1n1_vacc_train_acc, seas_rand_forst_unilabel_h1n1_vacc_test_acc),
    ("recall", seas_rand_forst_unilabel_h1n1_vacc_train_recall, seas_rand_forst_unilabel_h1n1_vacc_test_recall),
    ("precision", seas_rand_forst_unilabel_h1n1_vacc_train_precision, seas_rand_forst_unilabel_h1n1_vacc_test_precision),
    ("f1", seas_rand_forst_unilabel_h1n1_vacc_train_f1, seas_rand_forst_unilabel_h1n1_vacc_test_f1),
)
for metric_suffix, train_score, test_score in rand_forst_seasonal_scores:
    mlflow.log_metric("train -" + metric_suffix, train_score)
    mlflow.log_metric("test -" + metric_suffix, test_score)
# artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

#### SVM

In [None]:
# Point MLflow at the tracking server and pick the experiment for this run
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_seasonal)
# run name: name it after the model used
name = 'svm_unilabel_seasonal'
# start_run hands back the run it has just made active, so keep that handle directly
run = mlflow.start_run(run_name=name)

In [None]:
# Show the id of the run opened in the previous cell
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
# Parameters record everything needed to re-run the experiment
# (engineered features, model hyperparameters, ...); adjust as needed.
params = dict([
    ("Data cleaning", "Drop all nulls"),
    ("Data balancing", "None"),
    ("Hyperparameters", "None"),
])

In [None]:
mlflow.log_params(params)
# tag = data used: which vaccine column was part of the features
mlflow.set_tag("Vaccines in features", "H1N1")
# (metric suffix, train score, test score); logged as train/test pairs in this order
svm_seasonal_scores = (
    ("ROC", seas_svm_unilabel_h1n1_vacc_train_roc, seas_svm_unilabel_h1n1_vacc_test_roc),
    ("accuracy", seas_svm_unilabel_h1n1_vacc_train_acc, seas_svm_unilabel_h1n1_vacc_test_acc),
    ("recall", seas_svm_unilabel_h1n1_vacc_train_recall, seas_svm_unilabel_h1n1_vacc_test_recall),
    ("precision", seas_svm_unilabel_h1n1_vacc_train_precision, seas_svm_unilabel_h1n1_vacc_test_precision),
    ("f1", seas_svm_unilabel_h1n1_vacc_train_f1, seas_svm_unilabel_h1n1_vacc_test_f1),
)
for metric_suffix, train_score, test_score in svm_seasonal_scores:
    mlflow.log_metric("train -" + metric_suffix, train_score)
    mlflow.log_metric("test -" + metric_suffix, test_score)
# artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
#mlflow.get_run(run_id=run.info.run_id)

# THE CODE BELOW HAS NOT BEEN REVIEWED!
***


# Single Label Modelling

In [None]:
# Single-label pipeline: shared preprocessor followed by a default logistic regression
single_label_steps = [
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
]
full_pipeline_1 = Pipeline(steps=single_label_steps)

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [None]:
# Target for this section: a copy of the h1n1_vaccine column
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only

In [None]:
# Convert the target Series to a NumPy array and display it
y = y.to_numpy()
y

In [None]:
#NB: the H1N1 vaccine and seasonal vaccine are left in, otherwise the pipeline doesn't run properly
#X = df

#NB: this alternative drops only the 'seasonal_vaccine' column
#X = df.drop(columns=['seasonal_vaccine'])
# NOTE(review): both assignments to X are commented out, so the split in the next
# cell reuses whatever X an earlier cell left in the kernel -- confirm which
# feature set is intended for this section.


In [None]:
# Stratified 80/20 train/test split for the h1n1_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Fit the single-label pipeline on the training split
full_pipeline_1.fit(X_train, y_train)

In [None]:
# Class predictions for the test split
preds = full_pipeline_1.predict(X_test)

In [None]:
# Test-set evaluation metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, preds)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [None]:
# Target for this section: a copy of the seasonal_vaccine column
y = df['seasonal_vaccine'].copy() # for seasonal_vaccine only

In [None]:
# Convert the target Series to a NumPy array and display it
y = y.to_numpy()
y

In [None]:
# Stratified 80/20 train/test split for the seasonal_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Re-fit the same pipeline object, now on the seasonal_vaccine split
full_pipeline_1.fit(X_train, y_train)

In [None]:
# Class predictions for the test split
preds = full_pipeline_1.predict(X_test)

In [None]:
# Test-set evaluation metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, preds)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [None]:
# Start from every column name in df; the target is removed in the next cell
cat_features_new = df.columns.tolist()

In [None]:
# Rebuild the feature list without the target instead of mutating it in place:
# list.remove() raises ValueError when this cell is re-run, a filter does not.
cat_features_new = [col for col in df.columns if col != 'seasonal_vaccine']

In [None]:
#NB: dropping only the 'seasonal_vaccine' target; 'h1n1_vaccine' stays in X as a feature
X = df.drop(columns=['seasonal_vaccine'])

In [None]:
# Target: a copy of the seasonal_vaccine column (displayed below)
y = df['seasonal_vaccine'].copy()
y

In [None]:
# Convert the target Series to a NumPy array and display it
y = y.to_numpy()
y

In [None]:
# Apply cat_pipeline to the columns listed in cat_features_new
cat_step = ('cat', cat_pipeline, cat_features_new)
preprocessor = ColumnTransformer(transformers=[cat_step])

In [None]:
# Rebuild the single-label pipeline around the new preprocessor
single_label_steps = [
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
]
full_pipeline_1 = Pipeline(steps=single_label_steps)

In [None]:
# Stratified 80/20 train/test split for the seasonal_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Fit the rebuilt pipeline on the training split
full_pipeline_1.fit(X_train, y_train)

In [None]:
# Class predictions for the test split
preds = full_pipeline_1.predict(X_test)

In [None]:
# Test-set evaluation metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, preds)))

## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [None]:
# Start from every column name in df; the target is removed in the next cell
cat_features_new = df.columns.tolist()


In [None]:
# Rebuild the feature list without the target instead of mutating it in place:
# list.remove() raises ValueError when this cell is re-run, a filter does not.
cat_features_new = [col for col in df.columns if col != 'h1n1_vaccine']

In [None]:
#NB: dropping only the 'h1n1_vaccine' target; 'seasonal_vaccine' stays in X as a feature
X = df.drop(columns=['h1n1_vaccine'])

In [None]:
# Target: a copy of the h1n1_vaccine column (kept as a Series here; no to_numpy() call in this section)
y = df['h1n1_vaccine'].copy()
y

In [None]:
# Apply cat_pipeline to the columns listed in cat_features_new
cat_step = ('cat', cat_pipeline, cat_features_new)
preprocessor = ColumnTransformer(transformers=[cat_step])

In [None]:
# Rebuild the single-label pipeline around the new preprocessor
single_label_steps = [
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
]
full_pipeline_1 = Pipeline(steps=single_label_steps)

In [None]:
# Stratified 80/20 train/test split for the h1n1_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Fit the rebuilt pipeline on the training split
full_pipeline_1.fit(X_train, y_train)

In [None]:
# Class predictions for the test split
preds = full_pipeline_1.predict(X_test)

In [None]:
# Test-set evaluation metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, preds)))

# Modelling Algorithms

In addition to Logistic regression, we are trying four different models to compare performance in terms of predicting the Vaccine Intake:

- K nearest neighbours
- Random Forest
- Support Vector Machine
- Naive Bayes (listed here, but not instantiated in the cells below)

Instantiate the models:

In [None]:
# Base estimators reused by the multilabel and single-label pipelines below
knn_model = KNeighborsClassifier()
# Seed the forest with the notebook-wide RSEED (already used for every split) so that
# its bootstrap samples / feature draws, and therefore the metrics, are reproducible
rand_forst_model = RandomForestClassifier(random_state=RSEED)
svm_model = svm.SVC(kernel='rbf')
# alternative kept for reference: linear kernel with a very large C
#svm_model = svm.SVC(kernel='linear', C=1E10)



Create Pipeline for each:

In [None]:
# Wrap each base model in a MultiOutputClassifier (order: KNN, Random Forest, SVM)
estimators_knn, estimators_rand_forst, estimators_SVC = (
    MultiOutputClassifier(estimator=base_model)
    for base_model in (knn_model, rand_forst_model, svm_model)
)



In [None]:
# Feature columns = every df column except the two vaccine targets
cat_features = df.columns.tolist()
for target_name in ('h1n1_vaccine', 'seasonal_vaccine'):
    cat_features.remove(target_name)
#cat_features

In [None]:
# Apply cat_pipeline to the columns listed in cat_features
cat_step = ('cat', cat_pipeline, cat_features)
preprocessor = ColumnTransformer(transformers=[cat_step])

In [None]:
# One pipeline per multi-output estimator (KNN, Random Forest, SVM);
# all three share the same preprocessor object
full_pipeline_knn, full_pipeline_rand_forst, full_pipeline_SVM = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("estimators", wrapped_estimator),
    ])
    for wrapped_estimator in (estimators_knn, estimators_rand_forst, estimators_SVC)
)

In [None]:
# Two-column multilabel target: column 0 = h1n1_vaccine, column 1 = seasonal_vaccine
y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()
y

In [None]:
# Convert the target frame to a NumPy array (later cells index it with [:, 0] / [:, 1]) and display it
y = y.to_numpy()
y


In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
# (both targets are removed, so neither vaccine is available as a feature)
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [None]:
# 80/20 train/test split, stratified on the two-column target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

Fit the data:

In [None]:
# Fit SVM and KNN first; the random-forest fit stays last so that its
# fitted pipeline is the expression echoed by the cell
for multilabel_pipeline in (full_pipeline_SVM, full_pipeline_knn):
    multilabel_pipeline.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Get predictions:

In [None]:
# KNN: predictions for the train and test splits
knn_train_pred, knn_test_pred = (
    full_pipeline_knn.predict(split_features) for split_features in (X_train, X_test)
)


In [None]:
# Random forest: predictions for the train and test splits
rand_forst_train_pred, rand_forst_test_pred = (
    full_pipeline_rand_forst.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# SVM: predictions for the train and test splits
svm_train_pred, svm_test_pred = (
    full_pipeline_SVM.predict(split_features) for split_features in (X_train, X_test)
)


### Evaluating model performance for Multilabel

KNN:

In [None]:
# KNN, test-set metrics for the H1N1 vaccine (target column 0)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 0], knn_test_pred[:, 0])))

In [None]:
# KNN, test-set metrics for the seasonal flu vaccine (target column 1)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 1], knn_test_pred[:, 1])))

Random Forest:

In [None]:
# Random forest, test-set metrics for the H1N1 vaccine (target column 0)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 0], rand_forst_test_pred[:, 0])))

In [None]:
# Random forest, test-set metrics for the seasonal flu vaccine (target column 1)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 1], rand_forst_test_pred[:, 1])))

Support Vector Machine:

In [None]:
# SVM, test-set metrics for the H1N1 vaccine (target column 0)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 0], svm_test_pred[:, 0])))

In [None]:
# SVM, test-set metrics for the seasonal flu vaccine (target column 1)
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test[:, 1], svm_test_pred[:, 1])))

# Single Label Modelling of the other algorithms (KNN, Random Forest, SVM)

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [None]:
# h1n1_vaccine target as a NumPy array (displayed below)
y = df['h1n1_vaccine'].copy().to_numpy()
y

In [None]:
# Stratified 80/20 train/test split for the h1n1_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

### Pipeline for the single label

In [None]:
# One single-label pipeline per base model (KNN, Random Forest, SVM);
# all three share the same preprocessor object
full_pipeline_knn, full_pipeline_rand_forst, full_pipeline_svm = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("estimators", base_model),
    ])
    for base_model in (knn_model, rand_forst_model, svm_model)
)
    

In [None]:
# Fit SVM and KNN first; the random-forest fit stays last so that its
# fitted pipeline is the expression echoed by the cell
for single_pipeline in (full_pipeline_svm, full_pipeline_knn):
    single_pipeline.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
# KNN: predictions for the train and test splits
knn_train_pred, knn_test_pred = (
    full_pipeline_knn.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# Random forest: predictions for the train and test splits
rand_forst_train_pred, rand_forst_test_pred = (
    full_pipeline_rand_forst.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# SVM: predictions for the train and test splits
# (the test predictions keep the existing name SVM_forst_test_pred, which the metric cell below reads)
SVM_train_pred, SVM_forst_test_pred = (
    full_pipeline_svm.predict(split_features) for split_features in (X_train, X_test)
)

### Evaluating model performance for the single label (H1N1 vaccine)

KNN:

In [None]:
# KNN, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, knn_test_pred)))

Random Forest:

In [None]:
# Random forest, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, rand_forst_test_pred)))

SVM:

In [None]:
# SVM, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, SVM_forst_test_pred)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [None]:
# seasonal_vaccine target as a NumPy array (displayed below)
y = df['seasonal_vaccine'].copy().to_numpy()
y

In [None]:
# Stratified 80/20 train/test split for the seasonal_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

### Fitting Pipeline for the single label 

In [None]:
# Re-fit the same pipeline objects on the seasonal_vaccine split; the random-forest
# fit stays last so that its fitted pipeline is the expression echoed by the cell
for single_pipeline in (full_pipeline_svm, full_pipeline_knn):
    single_pipeline.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
# KNN: predictions for the train and test splits
knn_train_pred, knn_test_pred = (
    full_pipeline_knn.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# Random forest: predictions for the train and test splits
rand_forst_train_pred, rand_forst_test_pred = (
    full_pipeline_rand_forst.predict(split_features) for split_features in (X_train, X_test)
)

SVM:

In [None]:
# SVM: predictions for the train and test splits
# (the test predictions keep the existing name SVM_forst_test_pred, which the metric cell below reads)
SVM_train_pred, SVM_forst_test_pred = (
    full_pipeline_svm.predict(split_features) for split_features in (X_train, X_test)
)

KNN:

In [None]:
# KNN, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, knn_test_pred)))

Random Forest:

In [None]:
# Random forest, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, rand_forst_test_pred)))

SVM:

In [None]:
# SVM, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, SVM_forst_test_pred)))

## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [None]:
# h1n1_vaccine target as a NumPy array (displayed below)
y = df['h1n1_vaccine'].copy().to_numpy()
y

In [None]:
# Feature columns = every df column except the h1n1_vaccine target
# (seasonal_vaccine therefore stays in the feature list)
cat_features = df.columns.tolist()
cat_features.remove('h1n1_vaccine')
#cat_features

In [None]:
#NB: dropping the 'h1n1_vaccine' column
# ('seasonal_vaccine' stays in X as a feature)
X = df.drop(columns=['h1n1_vaccine'])

In [None]:
# Stratified 80/20 train/test split for the h1n1_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Apply cat_pipeline to the columns listed in cat_features
cat_step = ('cat', cat_pipeline, cat_features)
preprocessor = ColumnTransformer(transformers=[cat_step])

In [None]:
# Rebuild the three single-label pipelines (KNN, Random Forest, SVM)
# around the new preprocessor, which they all share
full_pipeline_knn, full_pipeline_rand_forst, full_pipeline_svm = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("estimators", base_model),
    ])
    for base_model in (knn_model, rand_forst_model, svm_model)
)

In [None]:
# Fit SVM and KNN first; the random-forest fit stays last so that its
# fitted pipeline is the expression echoed by the cell
for single_pipeline in (full_pipeline_svm, full_pipeline_knn):
    single_pipeline.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
# KNN: predictions for the train and test splits
knn_train_pred, knn_test_pred = (
    full_pipeline_knn.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# Random forest: predictions for the train and test splits
rand_forst_train_pred, rand_forst_test_pred = (
    full_pipeline_rand_forst.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# SVM: predictions for the train and test splits
SVM_train_pred, SVM_test_pred = (
    full_pipeline_svm.predict(split_features) for split_features in (X_train, X_test)
)

KNN:

In [None]:
# KNN, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, knn_test_pred)))

Random Forest:

In [None]:
# Random forest, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, rand_forst_test_pred)))

SVM:

In [None]:
# SVM, test-set metrics for the H1N1 vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, SVM_test_pred)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [None]:
# seasonal_vaccine target as a NumPy array (displayed below)
y = df['seasonal_vaccine'].copy().to_numpy()
y

In [None]:
# Feature columns = every df column except the seasonal_vaccine target
# (h1n1_vaccine therefore stays in the feature list)
cat_features = df.columns.tolist()
cat_features.remove('seasonal_vaccine')
#cat_features

In [None]:
#NB: dropping the 'seasonal_vaccine' column
# ('h1n1_vaccine' stays in X as a feature)
X = df.drop(columns=['seasonal_vaccine'])

In [None]:
# Stratified 80/20 train/test split for the seasonal_vaccine target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Apply cat_pipeline to the columns listed in cat_features
cat_step = ('cat', cat_pipeline, cat_features)
preprocessor = ColumnTransformer(transformers=[cat_step])

In [None]:
# Rebuild the three single-label pipelines (KNN, Random Forest, SVM)
# around the new preprocessor, which they all share
full_pipeline_knn, full_pipeline_rand_forst, full_pipeline_svm = (
    Pipeline([
        ("preprocessor", preprocessor),
        ("estimators", base_model),
    ])
    for base_model in (knn_model, rand_forst_model, svm_model)
)

In [None]:
# Fit SVM and KNN first; the random-forest fit stays last so that its
# fitted pipeline is the expression echoed by the cell
for single_pipeline in (full_pipeline_svm, full_pipeline_knn):
    single_pipeline.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
# KNN: predictions for the train and test splits
knn_train_pred, knn_test_pred = (
    full_pipeline_knn.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# Random forest: predictions for the train and test splits
rand_forst_train_pred, rand_forst_test_pred = (
    full_pipeline_rand_forst.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
# SVM: predictions for the train and test splits
SVM_train_pred, SVM_test_pred = (
    full_pipeline_svm.predict(split_features) for split_features in (X_train, X_test)
)

In [None]:
#full_pipeline_knn.fit(X_train, y_train)
#full_pipeline_rand_forst.fit(X_train, y_train)
#svm_model.fit(X_train, y_train)
#model_pipeline.fit(train.data, train.target)

KNN:

In [None]:
# KNN, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, knn_test_pred)))

Random Forest:

In [None]:
# Random forest, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, rand_forst_test_pred)))

SVM:

In [None]:
# SVM, test-set metrics for the seasonal flu vaccine target
for report_line, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Recall: {:.2f}", recall_score),
    ("Precision: {:.2f}", precision_score),
    ("F1: {:.2f}", f1_score),
    ("ROC: {:.2f}", roc_auc_score),
):
    print(report_line.format(score_fn(y_test, SVM_test_pred)))

The dummy classifier predicts everything to belong to the same class and thus has no discriminatory ability (between negative and positive class). Therefore, the AUC of 0.5 is expected.

## Training the model and tracking with MLFlow

In [None]:
# Point MLflow at the tracking server and pick the experiment for this run
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME_multiclass)
# start_run hands back the run it has just made active, so keep that handle directly
run = mlflow.start_run()

In [None]:
# Show the id of the run opened in the previous cell
active_run_id = run.info.run_id
print("Active run_id: {}".format(active_run_id))

In [None]:
#training the model
# The cells above leave X_train / y_train holding the last single-label split
# (y_train is 1-D and X still contains 'h1n1_vaccine'), while the cells below index
# y_train[:, 0] and y_test[:, 0], i.e. they need the two-column multilabel target.
# Rebuild the multilabel split here, with the same arguments as the multilabel
# section, so this cell no longer depends on leftover kernel state.
# NOTE(review): assumes full_pipeline (defined earlier in the notebook) is the
# multilabel pipeline -- TODO confirm.
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df[['h1n1_vaccine', 'seasonal_vaccine']].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)
reg = full_pipeline.fit(X_train, y_train)

In [None]:
# Predict on the training split with the fitted pipeline
y_train_pred = reg.predict(X_train)
#mse_train = mean_squared_error(y_train, y_train_pred)
# ROC AUC is computed for column 0 only, from the predict() output rather than from probabilities;
# presumably column 0 is h1n1_vaccine -- TODO confirm against how y was built
roc_train = roc_auc_score(y_train[:, 0], y_train_pred[:, 0])
print(roc_train) #why does the ROC not match previous results that we had? Should be 0.71

In [None]:
#data cleaning and engineering--not done
# No extra cleaning or feature engineering is applied to the test split here.
# (This cell used to hold an unexecuted triple-quoted string of template code that
# referenced columns such as 'altitude_mean_meters' and 'Quakers', which are not
# used anywhere else in the visible notebook; its only effect was to echo itself
# as the cell output.)

In [None]:
# Predict on the test split and score column 0 only (same column as roc_train)
y_test_pred = reg.predict(X_test)
roc_test = roc_auc_score(y_test[:, 0], y_test_pred[:, 0])
print(roc_test) #looks correct

In [None]:
# Keep track of everything needed to re-run the experiment
# (engineered features, model hyperparameters, ...); nothing is recorded yet.
params = dict()

In [None]:
mlflow.log_params(params)
# tag marks that the run was launched from the notebook
mlflow.set_tag("running_from_jupyter", "True")
# log the train score first, then the test score
for split_prefix, roc_value in (("train -", roc_train), ("test -", roc_test)):
    mlflow.log_metric(split_prefix + "ROC", roc_value)
# artifact / model logging is currently disabled:
#mlflow.log_artifact("../models")
#mlflow.sklearn.log_model(reg, "model")
mlflow.end_run()

In [None]:
# Look up the run by the id captured when it was started and display the result
mlflow.get_run(run_id=run.info.run_id)

## Checking the experiments

While the next cell is running, you will not be able to run other cells in the notebook.

In [None]:
#use if running locally
#!mlflow ui