In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression



warnings.filterwarnings('ignore')


In [None]:
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [None]:
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [None]:
# Get info for the target
df_target.info()

In [None]:
df_features.isnull().sum(axis = 0)

- express missing values as %

Options:
- modelling for imputation (without using our target variable!)
- or use prediction models that don't care about missing values
- or impute so that the overall distribution stays the same (based on statistics of this data set)
- or impute using a hypothesis (e.g. people who have missing values don't have health insurance?)
- for imputation we can try several approaches and see what gives best results :)

- remember--when we impute, we want to base our assumptions on the training set of a train-test split (so nothing leaks from the test set):
    - if we do multiple models, test-train split for each
    - create functions for imputation

- remember modelling for understanding (EDA style) =/= modelling for prediction

library for visualising missing values:  

https://github.com/ResidentMario/missingno

In [None]:
# Join labels and features into a single frame, matching rows on the respondent id.
# The label columns come first because df_target is the left-hand frame.
df = df_target.merge(df_features, on='respondent_id')

# Data imputation

These count as columns that were dropped last time, with "too many missing values":
['health_insurance', 'employment_industry', 'employment_occupation', 'income_poverty', 'marital_status', 'employment_status']

To try:  
- imputation on whole DF, most common values
- imputation on whole DF (feature by feature), fill values but keep data distribution (e.g. 90% of values are 1, 10% of values are 0)
- model to predict missing values:
    - this is most important for health_insurance: 45% values are missing, but it's likely to be important to us
    - income_poverty could also be important
    - employment_occupation and employment_industry have many missing values, but their contents are not very useful (anonymised, we can't really make sense of them)--these features could be dropped

To note:
- prior to doing imputation models, I need to make sure the rest of the data is clean (or use a model which isn't impacted by missing data)
- I may need a combination strategy to tackle this (e.g. using a model to impute some features and filling other features in with their most common value)

How to assess imputation strategy:
- What were the final model evaluation metrics before and after imputation?
- Did overall statistics of the imputed features stay the same?
- What about their correlations to other features?

### Info:  
- https://machinelearningmastery.com/statistical-imputation-for-missing-values-in-machine-learning/

Statistics of features, before imputation:

In [None]:
df.describe()

How many values are missing? (expressed as %)

In [None]:
# Share of missing entries per column, in percent, largest first
(
    df.isnull()
    .sum(axis=0)
    .div(df.shape[0])
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)

## Looking at the missing values with missingno library

In [None]:
import missingno as msno
%matplotlib inline
msno.matrix(df.sample(26707))

- The employment_industry and employment_occupation columns seem to be highly correlated in terms of missing info (i.e. info that's missing in one column is missing in the other)
- The missing info in health_insurance doesn't seem to be correlated with anything

In [None]:
msno.heatmap(df)

In [None]:
msno.dendrogram(df)

Visualising nullity by column:

In [None]:
# Per-column counts of non-missing values do not depend on row order, so the frame
# is passed as it is instead of being resampled with a hard-coded row count.
msno.bar(df)

## Initial strategy--drop some columns and then drop all rows with missing values

Dropping of features with too many missing values:

In [None]:
# Features removed in this first approach because of their many missing values
col_drop = [
    'health_insurance',
    'employment_industry',
    'employment_occupation',
    'income_poverty',
    'marital_status',
    'employment_status',
]

df_initial = df.drop(columns=col_drop)

Also dropped the column 'respondent_id', because it will cause problems during 1-hot encoding:

In [None]:
# Remove the respondent_id column from the reduced frame.
# errors='ignore' keeps the cell re-runnable: df_initial is overwritten with its own
# reduced copy, so a second run would otherwise fail with a KeyError.
df_initial = df_initial.drop(columns='respondent_id', errors='ignore')

Dropping of all rows with null values:

In [None]:
# Keep only the rows that have a value in every remaining column
df_initial = df_initial.dropna()

Check that all null values have been dropped:

In [None]:
df_initial.isnull().sum(axis = 0)

In [None]:
#the index is causing problems during 1-hot encoding
#df_initial.reset_index(inplace=True)

## Strategy 1: no imputation, leaving all missing values in

In [None]:
# Remove the respondent_id column from the full frame.
# errors='ignore' makes the cell safe to run more than once, because df is
# replaced by its own reduced copy and the column is gone after the first run.
df = df.drop(columns='respondent_id', errors='ignore')

In [None]:
df.isnull().sum(axis = 0)

## Strategy 2: imputation with SimpleImputer, most frequent values

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html  

We fill everything with the most frequent value that appeared in that column. 
Run on the whole dataset.

In [None]:
from sklearn.impute import SimpleImputer

# Replace each missing entry with the most frequent value of its column
imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform in one pass. The imputer hands back a bare array, so the
# column labels and the row index of df are attached again right away.
# NOTE(review): the imputer is fitted on the full data set (targets included),
# i.e. before any train/test split.
df_mostfreq = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

SimpleImputer gives a dataframe without column headings--here the headings are added back on

In [None]:
col_names = df.columns.values.tolist()

In [None]:
df_mostfreq.columns = col_names

Checking that all missing values have been filled

In [None]:
df_mostfreq.isnull().sum(axis = 0)

In [None]:
df_mostfreq.info()

The SimpleImputer has converted all the columns to strings...so they should be changed to numeric (or at least the target variables should be converted)

In [None]:
# Cast both target columns back to a numeric dtype
for target_col in ('h1n1_vaccine', 'seasonal_vaccine'):
    df_mostfreq[target_col] = pd.to_numeric(df_mostfreq[target_col])

In [None]:
df_mostfreq['health_insurance'].value_counts()

## Strategy 3: imputation with IterativeImputer, default settings (apart from initial_strategy = “most_frequent”)

https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer

initial_strategy = “most_frequent”; other parameters at default (Bayesian Ridge estimation used)

Other estimators: https://scikit-learn.org/stable/auto_examples/impute/plot_iterative_imputer_variants_comparison.html

Dataframe split into X and Y before imputation, so the target variable isn't used in the imputation

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
y_strat3 = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [None]:
y_strat3 = y_strat3.to_numpy()

In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X_strat3 = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

The IterativeImputer seems to have an issue with string values, so data converted to numerical:

In [None]:
# Nested mapping {column name: {string level: integer code}} handed to DataFrame.replace
# below, so that the string-valued features become numeric before imputation.
# NOTE(review): the codes are assigned in listing order; for columns such as race,
# hhs_geo_region, employment_industry and employment_occupation this order looks
# arbitrary -- confirm that a numeric ordering is acceptable for the imputer.
# The string keys must match the raw data exactly (including the double space in
# "MSA, Not Principle  City"); levels without an entry here are left unchanged by replace.
cleanup = {"age_group": {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4,
                                  "65+ Years": 5},
            "education": {"< 12 Years": 1, "12 Years": 2, "Some College": 3, "College Graduate": 4},
            "race": {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4},
            "sex" : {"Female": 1, "Male": 2},
            "rent_or_own" : {"Own": 1, "Rent": 2},
            "hhs_geo_region" : {"lzgpxyit": 1, "fpwskwrf": 2, "qufhixun": 3, "bhuqouqj": 4, "oxchjgsf": 5, "kbazzjca": 6, "mlyzmhmf": 7, "atmpeygn": 8, "lrircsnp": 9, "dqpwygqj": 10},
            "census_msa" : {"MSA, Not Principle  City": 1, "MSA, Principle City": 2, "Non-MSA": 3},
            "income_poverty" : {"Below Poverty": 1, "<= $75,000, Above Poverty": 2, "> $75,000": 3},
            "employment_industry" : {"fcxhlnwr": 1, "wxleyezf": 2, "ldnlellj": 3, "pxcmvdjn": 4, "atmlpfrs": 5, "arjwrbjb": 6, "xicduogh": 7, "mfikgejo": 8, "vjjrobsf": 9,
                                    "rucpziij": 10, "xqicxuve": 11, "saaquncn": 12, "cfqqtusy": 13, "nduyfdeo": 14, "mcubkhph": 15, "wlfvacwt": 16, "dotnnunm": 17, "haxffmxo": 18, "msuufmds": 19, "phxvnwax": 20,
                                    "qnlwzans": 21},
           "employment_occupation" : {"xtkaffoo": 1, "mxkfnird": 2, "emcorrxb": 3, "cmhcxjea": 4, "xgwztkwe": 5, "hfxkjkmi": 6, "qxajmpny": 7, "xqwwgdyp": 8, "kldqjyjy": 9,
                                    "uqqtjvyb": 10, "tfqavkke": 11, "ukymxvdu": 12, "vlluhbov": 13, "oijqvulv": 14, "ccgxvspp": 15, "bxpfxfdn": 16, "haliazsg": 17, "rcertsgn": 18, "xzmlyyjv": 19, "dlvbwzss": 20,
                                    "hodpvpew": 21, "dcjcmpih": 22, "pvmttkik": 23},
           "marital_status" : {"Married": 1, "Not Married": 2},
           "employment_status" : {"Employed": 1, "Not in Labor Force": 2, "Unemployed": 3}
                                  }
# Apply the per-column mapping; the original X_strat3 is left untouched
X_strat3_num = X_strat3.replace(cleanup)

In [None]:
# Iterative imputer; every feature's gaps start from its most frequent value
it_imputer = IterativeImputer(initial_strategy='most_frequent')

# Fit once and transform in the same call (the imputer used to be fitted twice)
X_iter = it_imputer.fit_transform(X_strat3_num)

# The regression-based imputer returns fractional values, while every feature is
# one-hot encoded further down (cat_features covers all columns). Left as floats,
# each imputed value would become its own category and unseen values in the test
# split would make OneHotEncoder(handle_unknown='error') raise.
# Snap imputations to the nearest whole code and keep them inside the range
# observed for their column. Observed entries are not changed by this.
# NOTE(review): assumes every feature is an integer-coded level -- confirm.
observed_values = X_strat3_num.to_numpy(dtype=float)
X_iter = np.clip(
    np.rint(X_iter),
    np.nanmin(observed_values, axis=0),
    np.nanmax(observed_values, axis=0),
)

IterativeImputer gives an array without column headings--here the headings are added back on and the array is converted back to a data frame:

In [None]:
# Put the feature names back and wrap the imputed values in a DataFrame
col_names = list(X_strat3.columns)
X_iter = pd.DataFrame(data=X_iter, columns=col_names)

Checking that all missing values have been filled:

In [None]:
X_iter.isnull().sum(axis = 0)

All data is currently numerical:

In [None]:
X_iter.info()

## Strategy 4: imputation with IterativeImputer, DecisionTreeRegressor estimator

This is the same as Strategy 3, except the imputation estimator is changed from default (BayesianRidge) to DecisionTreeRegressor

The same dataframes are used as for Strategy 3

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Iterative imputer with a decision tree as per-feature estimator.
# The tree is seeded so that repeated runs give the same imputations.
it_imputer_DT = IterativeImputer(
    estimator=DecisionTreeRegressor(random_state=RSEED),
    initial_strategy='most_frequent',
)

# Fit and transform in one call
X_iter_DT = it_imputer_DT.fit_transform(X_strat3_num)

# Tree predictions are leaf averages and can be fractional, while every feature is
# one-hot encoded further down. Snap imputations to the nearest whole code and keep
# them inside the range observed for their column, so the encoder only sees real levels.
# NOTE(review): assumes every feature is an integer-coded level -- confirm.
observed_values_DT = X_strat3_num.to_numpy(dtype=float)
X_iter_DT = np.clip(
    np.rint(X_iter_DT),
    np.nanmin(observed_values_DT, axis=0),
    np.nanmax(observed_values_DT, axis=0),
)

In [None]:
# Put the feature names back and wrap the imputed values in a DataFrame
col_names = list(X_strat3.columns)
X_iter_DT = pd.DataFrame(data=X_iter_DT, columns=col_names)

In [None]:
X_iter_DT.isnull().sum(axis = 0)

In [None]:
X_iter_DT.info()

---

These parts need to be adjusted for each imputation strategy

## Initial strategy

### Setting up variables

In [None]:
cat_features = list(df_initial.columns)

Removal of target variables from cat_features list:

In [None]:
cat_features.remove('h1n1_vaccine')

In [None]:
cat_features.remove('seasonal_vaccine')

Rename the features and target to 'X' and 'y', to make the test-train split easier (adjust for each dataframe, because the number of rows in the target will be different, depending on imputation strategy):

In [None]:
y_initial = df_initial[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [None]:
y_initial = y_initial.to_numpy()

In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X_initial = df_initial.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

### Setting up pipeline  

Doesn't need modifying, but needs to be run for each imputation strategy

In [None]:
# Categorical branch: one-hot encode each feature.
# drop='first' leaves out the first level of every feature (reference category);
# handle_unknown='error' makes the encoder raise on levels it did not see while fitting.
cat_pipeline = Pipeline(steps=[
    ('1hot', OneHotEncoder(drop='first', handle_unknown='error')),
])

In [None]:
# Send every column listed in cat_features through the categorical branch
preprocessor = ColumnTransformer(transformers=[('cat', cat_pipeline, cat_features)])

In [None]:
# One logistic regression per target column, wrapped for multi-output prediction
estimators = MultiOutputClassifier(estimator=LogisticRegression())

In [None]:
# Preprocessing followed by the multi-output classifier
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("estimators", estimators),
]
full_pipeline = Pipeline(steps=pipeline_steps)

### Train-test split

Names need to be modified

In [None]:
# 80/20 split, stratified on the two-column target and seeded with RSEED
X_train_init, X_test_init, y_train_init, y_test_init = train_test_split(
    X_initial,
    y_initial,
    test_size=0.2,
    stratify=y_initial,
    random_state=RSEED,
)

In [None]:
# Report the dimensions of each part of the split
for split_name, split_data in (
    ('X_train_init', X_train_init),
    ('X_test_init', X_test_init),
    ('y_train_init', y_train_init),
    ('y_test_init', y_test_init),
):
    print(split_name + ' shape:', split_data.shape)

In [None]:
full_pipeline.fit(X_train_init, y_train_init)

In [None]:
#getting predictions from train data
preds_train_init = full_pipeline.predict(X_train_init)

In [None]:
#getting predictions from test data
preds_test_init = full_pipeline.predict(X_test_init)


### Model evaluation

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_init[:, 0], preds_train_init[:, 0])))

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_init[:, 0], preds_test_init[:, 0])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_init[:, 1], preds_train_init[:, 1])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_init[:, 1], preds_test_init[:, 1])))

---

## Strategy 1--no imputation

### Setting up variables

In [None]:
cat_features = list(df.columns)

Removal of target variables from cat_features list:

In [None]:
cat_features.remove('h1n1_vaccine')

In [None]:
cat_features.remove('seasonal_vaccine')

Rename the features and target to 'X' and 'y', to make the test-train split easier (adjust for each dataframe, because the number of rows in the target will be different, depending on imputation strategy):

In [None]:
y_strat1 = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [None]:
y_strat1 = y_strat1.to_numpy()

In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X_strat1 = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

### Setting up pipeline  

Doesn't need modifying, but needs to be run for each imputation strategy

In [None]:
# Pipeline for categorical features
#CHECK HERE--ARE WE DROPPING THE FIRST COLUMN TO PREVENT MULTICOLINEARITY?
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()
)

In [None]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

### Train-test split

Names need to be modified

In [None]:
# 80/20 split, stratified on the two-column target and seeded with RSEED
X_train_strat1, X_test_strat1, y_train_strat1, y_test_strat1 = train_test_split(
    X_strat1,
    y_strat1,
    test_size=0.2,
    stratify=y_strat1,
    random_state=RSEED,
)

In [None]:
# Report the dimensions of each part of the split
for split_name, split_data in (
    ('X_train_strat1', X_train_strat1),
    ('X_test_strat1', X_test_strat1),
    ('y_train_strat1', y_train_strat1),
    ('y_test_strat1', y_test_strat1),
):
    print(split_name + ' shape:', split_data.shape)

In [None]:
full_pipeline.fit(X_train_strat1, y_train_strat1)

In [None]:
#getting predictions from train data
preds_train_strat1 = full_pipeline.predict(X_train_strat1)

In [None]:
#getting predictions from test data
preds_test_strat1 = full_pipeline.predict(X_test_strat1)


### Model evaluation

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat1[:, 0], preds_train_strat1[:, 0])))

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat1[:, 0], preds_test_strat1[:, 0])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat1[:, 1], preds_train_strat1[:, 1])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat1[:, 1], preds_test_strat1[:, 1])))

---

## Strategy 2: imputation with SimpleImputer, most frequent values

### Setting up variables

In [None]:
cat_features = list(df_mostfreq.columns)

Removal of target variables from cat_features list:

In [None]:
cat_features.remove('h1n1_vaccine')

In [None]:
cat_features.remove('seasonal_vaccine')

Rename the features and target to 'X' and 'y', to make the test-train split easier (adjust for each dataframe, because the number of rows in the target will be different, depending on imputation strategy):

In [None]:
y_strat2 = df_mostfreq[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [None]:
y_strat2 = y_strat2.to_numpy()

In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X_strat2 = df_mostfreq.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

### Setting up pipeline  

Doesn't need modifying, but needs to be run for each imputation strategy

In [None]:
# Pipeline for categorical features
#CHECK HERE--ARE WE DROPPING THE FIRST COLUMN TO PREVENT MULTICOLINEARITY?
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()
)

In [None]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

### Train-test split

Names need to be modified

In [None]:
# 80/20 split, stratified on the two-column target and seeded with RSEED
X_train_strat2, X_test_strat2, y_train_strat2, y_test_strat2 = train_test_split(
    X_strat2,
    y_strat2,
    test_size=0.2,
    stratify=y_strat2,
    random_state=RSEED,
)

In [None]:
# Report the dimensions of each part of the split
for split_name, split_data in (
    ('X_train_strat2', X_train_strat2),
    ('X_test_strat2', X_test_strat2),
    ('y_train_strat2', y_train_strat2),
    ('y_test_strat2', y_test_strat2),
):
    print(split_name + ' shape:', split_data.shape)

In [None]:
full_pipeline.fit(X_train_strat2, y_train_strat2)

In [None]:
#getting predictions from train data
preds_train_strat2 = full_pipeline.predict(X_train_strat2)

In [None]:
#getting predictions from test data
preds_test_strat2 = full_pipeline.predict(X_test_strat2)

### Model evaluation

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat2[:, 0], preds_train_strat2[:, 0])))

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat2[:, 0], preds_test_strat2[:, 0])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat2[:, 1], preds_train_strat2[:, 1])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat2[:, 1], preds_test_strat2[:, 1])))

## Strategy 3: imputation with IterativeImputer, default settings (apart from initial_strategy = “most_frequent”)

### Setting up pipeline  

Doesn't need modifying, but needs to be run for each imputation strategy

In [None]:
# Pipeline for categorical features
#CHECK HERE--ARE WE DROPPING THE FIRST COLUMN TO PREVENT MULTICOLINEARITY?
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [None]:
# Encode exactly the columns of the Strategy 3 feature frame.
# The names are taken from X_iter itself, so this cell no longer relies on the
# cat_features list that happens to be left over from the Strategy 2 section.
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, list(X_iter.columns))
])

In [None]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()
)

In [None]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

### Train-test split

Names need to be modified

In [None]:
# 80/20 split of the iteratively imputed features, stratified on the target and seeded with RSEED
X_train_strat3, X_test_strat3, y_train_strat3, y_test_strat3 = train_test_split(
    X_iter,
    y_strat3,
    test_size=0.2,
    stratify=y_strat3,
    random_state=RSEED,
)

In [None]:
# Report the dimensions of each part of the split
for split_name, split_data in (
    ('X_train_strat3', X_train_strat3),
    ('X_test_strat3', X_test_strat3),
    ('y_train_strat3', y_train_strat3),
    ('y_test_strat3', y_test_strat3),
):
    print(split_name + ' shape:', split_data.shape)

In [None]:
full_pipeline.fit(X_train_strat3, y_train_strat3)

In [None]:
#getting predictions from train data
preds_train_strat3 = full_pipeline.predict(X_train_strat3)

In [None]:
X_test_strat3

In [None]:
#getting predictions from test data
preds_test_strat3 = full_pipeline.predict(X_test_strat3)

### Model evaluation

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat3[:, 0], preds_train_strat3[:, 0])))

In [None]:
# H1N1 vaccine (target column 0) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat3[:, 0], preds_test_strat3[:, 0])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the training split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_train_strat3[:, 1], preds_train_strat3[:, 1])))

In [None]:
# Seasonal flu vaccine (target column 1) -- metrics on the test split
# NOTE(review): ROC is computed from hard class predictions, not from probabilities.
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1", f1_score),
    ("ROC", roc_auc_score),
):
    print("{}: {:.2f}".format(metric_label, metric_fn(y_test_strat3[:, 1], preds_test_strat3[:, 1])))

## Strategy 4: imputation with IterativeImputer, DecisionTreeRegressor estimator

### Setting up pipeline  

Doesn't need modifying, but needs to be run for each imputation strategy

In [None]:
# Pipeline for categorical features
#CHECK HERE--ARE WE DROPPING THE FIRST COLUMN TO PREVENT MULTICOLINEARITY?
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [None]:
# Encode exactly the columns of the Strategy 4 feature frame.
# The names are taken from X_iter_DT itself, so this cell no longer relies on the
# cat_features list that happens to be left over from the Strategy 2 section.
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, list(X_iter_DT.columns))
])

In [None]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()
)

In [None]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

### Train-test split

Names need to be modified

In [None]:
X_train_strat1, X_test_strat1, y_train_strat1, y_test_strat1 = train_test_split(X_strat1, y_strat1, stratify = y_strat1, test_size=0.2, random_state=RSEED)

In [None]:
print('X_train_strat1 shape:', X_train_strat1.shape)
print('X_test_strat1 shape:', X_test_strat1.shape)
print('y_train_strat1 shape:', y_train_strat1.shape)
print('y_test_strat1 shape:', y_test_strat1.shape)

In [None]:
full_pipeline.fit(X_train_strat1, y_train_strat1)

In [None]:
#getting predictions from train data
preds_train_strat1 = full_pipeline.predict(X_train_strat1)

In [None]:
#getting predictions from test data
preds_test_strat1 = full_pipeline.predict(X_test_strat1)


### Model evaluation

In [None]:
# Evaluation Metrices for H1N1 Vaccines--train data
print("Accuracy: {:.2f}".format(accuracy_score(y_train_strat1[:, 0], preds_train_strat1[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_train_strat1[:, 0], preds_train_strat1[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_train_strat1[:, 0], preds_train_strat1[:, 0])))
print("F1: {:.2f}".format(f1_score(y_train_strat1[:, 0], preds_train_strat1[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_train_strat1[:, 0], preds_train_strat1[:, 0])))

In [None]:
# Evaluation Metrices for H1N1 Vaccines--test data
print("Accuracy: {:.2f}".format(accuracy_score(y_test_strat1[:, 0], preds_test_strat1[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test_strat1[:, 0], preds_test_strat1[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test_strat1[:, 0], preds_test_strat1[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test_strat1[:, 0], preds_test_strat1[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test_strat1[:, 0], preds_test_strat1[:, 0])))

In [None]:
# Evaluation Metrices for Seasonal Flu Vaccines--train data
print("Accuracy: {:.2f}".format(accuracy_score(y_train_strat1[:, 1], preds_train_strat1[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_train_strat1[:, 1], preds_train_strat1[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_train_strat1[:, 1], preds_train_strat1[:, 1])))
print("F1: {:.2f}".format(f1_score(y_train_strat1[:, 1], preds_train_strat1[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_train_strat1[:, 1], preds_train_strat1[:, 1])))

In [None]:
# Evaluation Metrices for Seasonal Flu Vaccines--test data
print("Accuracy: {:.2f}".format(accuracy_score(y_test_strat1[:, 1], preds_test_strat1[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test_strat1[:, 1], preds_test_strat1[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test_strat1[:, 1], preds_test_strat1[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test_strat1[:, 1], preds_test_strat1[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test_strat1[:, 1], preds_test_strat1[:, 1])))