In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42

# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm


# NOTE(review): this silences every warning for the rest of the notebook, which also
# hides library warnings (e.g. convergence or deprecation notices) -- consider narrowing it.
warnings.filterwarnings('ignore')


In [None]:
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [None]:
df_features.head()

In [None]:
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [None]:
# Get info for the target
df_target.info()

In [None]:
df_features.info()

In [None]:
df_features.isnull().sum(axis = 0)

- express missing values as %

Options:
- modelling for imputation (without using our target variable!)
- or use prediction models that don't care about missing values
- or impute so that the overall distribution stays the same (based on statistics of this data set)
- or impute using a hypothesis (e.g. people who have missing values don't have health insurance?)
- for imputation we can try several approaches and see what gives best results :)

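- remember--when we impute, the imputation values must be learned from the training set of a train-test split only (and then applied unchanged to the test set), otherwise information from the test set leaks into the model: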
    - if we do multiple models, test-train split for each
    - create functions for imputation

- remember modelling for understanding (EDA style) =/= modelling for prediction

library for visualising missing values:
https://github.com/ResidentMario/missingno

In [None]:
# Join the targets and the features into one dataframe on the respondent id
# (an inner merge, not a concatenation).
# validate='one_to_one' makes the merge raise if an id is duplicated on either side,
# instead of silently multiplying rows.

df = pd.merge(df_target, df_features, on='respondent_id', validate='one_to_one')

We will drop the following columns for our first iteration:    
- health_insurance, employment_industry, employment_occupation, income_poverty, marital_status, employment_status

What are the values in the features that have a lot of missing data?

In [None]:
df_features.health_insurance.value_counts()

Binary variable; 12% do not have health insurance, the rest do

In [None]:
df_features.employment_industry.value_counts()

Anonymised variable with 21 values

In [None]:
df_features.employment_occupation.value_counts()

Anonymised variable with 23 values

# Data cleaning

Dropping of features with too many missing values:

In [None]:
col_drop = [
    'health_insurance',
    'employment_industry',
    'employment_occupation',
    'income_poverty',
    'marital_status',
    'employment_status',
]

# Rebind df to a frame without the listed columns (same result as the in-place drop).
df = df.drop(columns=col_drop)

Dropping of all rows with null values:

In [None]:
# Listwise deletion: remove, in place, every row that still has at least one missing value.
df.dropna(inplace=True)

Check that all null values have been dropped:

In [None]:
df.isnull().sum(axis = 0)

In [None]:
df.info()

In [None]:
# Renumber the rows after the dropna. Because drop=True is not passed, the old index is
# kept as a new 'index' column; it is removed again together with 'respondent_id' further down.
df.reset_index(inplace=True)

We dropped all rows with missing values
- Maybe later on, we will want to refine this approach.

In [None]:
 # We are looking for unique values in order to identify whether we have duplicates

df['respondent_id'].nunique()

All values are unique, no duplicates. 

## EDA

In [None]:
# Class balance of the two target variables (counts per class)

for target_col in ('h1n1_vaccine', 'seasonal_vaccine'):
    print(df[target_col].value_counts())

23% vaccination rate for H1N1 and 52% vaccination rate for seasonal flu
The H1N1 vaccine outcome appears to be unbalanced (almost 17k vs 5k) (read lit--what is considered unbalanced?)
The seasonal vaccine outcome appears to be fairly balanced

We may want to deal with the lack of balance later

In [None]:
# Understanding observations and rows

df.info()

In [None]:
# 'index' and 'respondent_id' only identify rows, so they are removed before modelling
col_drop = ['index', 'respondent_id']

df = df.drop(columns=col_drop)

- our target variables are h1n1_vaccine and seasonal_vaccine
- at the moment we are working with 28 feature variables--all categorical (refer to challenge documentation for description; we will need to transfer this info to the README)
- seven of the variables are strings--we will convert these to numeric encoding so we can look at correlations in Profiler
- household_adults and household_children are 'top-coded' at 3--that means that households with 3+ adults (or children) fall into the '3' group
- 'hhs_geo_region' is an anonymised string
- we should remember that the current column names (which would be used as labels in the graphs) are not really human-readable--we need to keep this in mind when we're making plots (either rename the columns beforehand, or include a plotting command to change the labels)

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

24  age_group                    21853 non-null  object 
 25  education                    21853 non-null  object 
 26  race                         21853 non-null  object 
 27  sex                          21853 non-null  object 
 28  rent_or_own                  21853 non-null  object 
 29  hhs_geo_region               21853 non-null  object 
 30  census_msa                   21853 non-null  object 

Conversion of string variables to numeric (so these variables get displayed in Profiler properly) via manual numeric encoding:

In [None]:
#a separate dataframe is made for the Profiler; the original dataframe will be retained for one-hot encoding (so the column headings we get during one-hot encoding remain meaningful)
df["age_group"].value_counts()

In [None]:
df["education"].value_counts()

In [None]:
df["race"].value_counts()

In [None]:
df["sex"].value_counts()

In [None]:
df["rent_or_own"].value_counts()

In [None]:
df["hhs_geo_region"].value_counts()

In [None]:
df["census_msa"].value_counts()

In [None]:
def _number_levels(levels):
    """Map each category label to its 1-based position in `levels`."""
    return {label: position for position, label in enumerate(levels, start=1)}


# Integer codes for every string-valued column, keyed by column name.
# The nested {column: {label: code}} layout is the form DataFrame.replace accepts.
cleanup = {
    "age_group": _number_levels(
        ["18 - 34 Years", "35 - 44 Years", "45 - 54 Years", "55 - 64 Years", "65+ Years"]
    ),
    "education": _number_levels(["< 12 Years", "12 Years", "Some College", "College Graduate"]),
    "race": _number_levels(["White", "Black", "Hispanic", "Other or Multiple"]),
    "sex": _number_levels(["Female", "Male"]),
    "rent_or_own": _number_levels(["Own", "Rent"]),
    "hhs_geo_region": _number_levels(
        ["lzgpxyit", "fpwskwrf", "qufhixun", "bhuqouqj", "oxchjgsf",
         "kbazzjca", "mlyzmhmf", "atmpeygn", "lrircsnp", "dqpwygqj"]
    ),
    "census_msa": _number_levels(["MSA, Not Principle  City", "MSA, Principle City", "Non-MSA"]),
}

In [None]:
df_for_profiler = df.replace(cleanup)
df_for_profiler.head()

Run Profiler to explore the data:

In [None]:
#import Profiler
#before opening VS Code, run this command in the terminal: pip install pandas-profiling==2.11.0
from pandas_profiling import ProfileReport

In [None]:
profile = ProfileReport(df_for_profiler, title="Pandas Profiling Report", explorative=True)

In [None]:
#profile

**Possible multicollinearity (between features) based on heatmap:**
- behavioral_large_gatherings vs behavioral_outside_home
- doctor_recc_h1n1 vs doctor_recc_seasonal
- opinion_h1n1_risk vs opinion_seas_risk
- household_children vs age_group

**Possible outliers and features to be aware of:**
- behavioral_antiviral_meds : very unbalanced between categories; the people taking antiviral meds could have something else going on (e.g. already sick, or worried about getting flu and taking meds prophylactically)--be careful about this variable
- behavioral_face_mask
- behavioral_wash_hands
- child_under_6_months
- health_worker
- opinion_h1n1_vacc_effective and opinion_seasonal_vacc_effective (1.0 group)
- race (not many non-white respondents)
- household_adults and household_children (3.0 groups pretty small)

## Creating Pipelines

In [None]:
# Pipeline for categorical features.
# drop='first' leaves out the first category of every feature, so the dummy columns of one
# feature are not perfectly collinear; handle_unknown='error' makes transform raise when it
# meets a category that was not present during fit.
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [None]:
cat_features = list(df.columns)


Removal of target variables from cat_features list:

In [None]:
cat_features.remove('h1n1_vaccine')

In [None]:
cat_features.remove('seasonal_vaccine')

Rename the features and target to 'X' and 'y', to make the test-train split easier:

In [None]:
y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()

In [None]:
y = y.to_numpy()
y

In [None]:

# Feature matrix: every column of df except the two target columns.
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
# Multi-output wrapper around a LogisticRegression with default hyperparameters,
# so both target columns can be predicted by one estimator object.
estimators = MultiOutputClassifier(
    estimator=LogisticRegression()  # candidate settings still to try: penalty="l2", C=1
)


In [None]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [None]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

In [None]:
full_pipeline.fit(X_train, y_train)



In [None]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

In [None]:
preds = full_pipeline.predict(X_test)


Model evaluation

In [None]:
# Evaluation metrics for the H1N1 vaccine (target column 0), logistic regression
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], preds[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], preds[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], preds[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], preds[:, 0])))
# ROC AUC ranks continuous scores, so it gets the predicted probability of class 1 rather
# than the thresholded 0/1 predictions. For a multi-output classifier, predict_proba
# returns one (n_samples, n_classes) array per target.
lr_h1n1_test_proba = full_pipeline.predict_proba(X_test)[0][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], lr_h1n1_test_proba)))

In [None]:
# Evaluation metrics for the seasonal flu vaccine (target column 1), logistic regression
# (the first line reports test-set accuracy; it was previously mislabelled "train data")
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], preds[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], preds[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], preds[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], preds[:, 1])))
# ROC AUC ranks continuous scores, so it gets the predicted probability of class 1 rather
# than the thresholded 0/1 predictions. For a multi-output classifier, predict_proba
# returns one (n_samples, n_classes) array per target.
lr_seasonal_test_proba = full_pipeline.predict_proba(X_test)[1][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], lr_seasonal_test_proba)))

In [None]:
y_test[:, 0]

In [None]:
y_test

ROC is chosen for the following reasons:
1. The ROC curve considers both negatives and positives
2. The AUC score tells how well the model distinguishes between negatives and positives
3. Both outcomes are valuable because there is no preference for either
4. For further reading https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc


The goal for the AUC score is set at 0.8, based on the challenge data and the
benchmarks reached in the competition https://www.researchgate.net/post/What-is-the-value-of-the-area-under-the-roc-curve-AUC-to-conclude-that-a-classifier-is-excellent

## BASELINE MODEL RESULTS

In [None]:
# Baseline that always predicts the most frequent class of each target.
# The strategy is set explicitly because DummyClassifier's default differs between
# scikit-learn versions ("stratified" before 0.24, "prior" from 0.24 on), and the
# unseeded "stratified" strategy gives random, non-reproducible predictions.
dummy_classifier = DummyClassifier(strategy="most_frequent")
dummy_classifier.fit(X_train, y_train)


In [None]:
dummy_train_pred = dummy_classifier.predict(X_train)
dummy_test_pred = dummy_classifier.predict(X_test)

In [None]:
# ROC AUC of the baseline on the test set:
# first line is target column 0 (h1n1_vaccine), second line is target column 1 (seasonal_vaccine).
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], dummy_test_pred[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], dummy_test_pred[:, 1])))

# Single Label Modelling

In [None]:
# Pipeline for the single-label models.
# It gets its own ColumnTransformer: Pipeline.fit fits the step objects it was given
# (they are not cloned when caching is off), so sharing `preprocessor` with `full_pipeline`
# would re-fit the multi-label pipeline's preprocessing step each time this one is fitted.

preprocessor_single = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor_single),
    ("estimators", LogisticRegression()),
])

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [None]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only

In [None]:
y = y.to_numpy()
y

In [None]:
# X is not rebuilt here: the feature matrix from the multi-label section
# (df without 'h1n1_vaccine' and 'seasonal_vaccine') is reused unchanged.


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for h1n1_vaccine

In [None]:
full_pipeline_1.fit(X_train, y_train)

In [None]:
preds = full_pipeline_1.predict(X_test)

In [None]:
# Evaluation metrics for the H1N1 vaccine, single-label logistic regression
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
lr_test_proba = full_pipeline_1.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, lr_test_proba)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [None]:
y = df['seasonal_vaccine'].copy() # for seasonal_vaccine only

In [None]:
y = y.to_numpy()
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for seasonal_vaccine

In [None]:
full_pipeline_1.fit(X_train, y_train)

In [None]:
preds = full_pipeline_1.predict(X_test)

In [None]:
# Evaluation metrics for the seasonal flu vaccine, single-label logistic regression
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
lr_test_proba = full_pipeline_1.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, lr_test_proba)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [None]:
cat_features_new = list(df.columns)

In [None]:
cat_features_new.remove('seasonal_vaccine')

In [None]:
# Only the target ('seasonal_vaccine') is dropped; 'h1n1_vaccine' stays in X as a feature.
X = df.drop(columns=['seasonal_vaccine'])

In [None]:
y = df['seasonal_vaccine'].copy()
y

In [None]:
y = y.to_numpy()
y

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features_new)
])

In [None]:
full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
    
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for Seasonal_vaccine

In [None]:
full_pipeline_1.fit(X_train, y_train)

In [None]:
preds = full_pipeline_1.predict(X_test)

In [None]:
# Evaluation metrics for the seasonal flu vaccine (h1n1_vaccine used as a feature)
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
lr_test_proba = full_pipeline_1.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, lr_test_proba)))

## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [None]:
cat_features_new = list(df.columns)


In [None]:
cat_features_new.remove('h1n1_vaccine')

In [None]:
# Only the target ('h1n1_vaccine') is dropped; 'seasonal_vaccine' stays in X as a feature.
X = df.drop(columns=['h1n1_vaccine'])

In [None]:
y = df['h1n1_vaccine'].copy()
y

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features_new)
])

In [None]:
full_pipeline_1 = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", LogisticRegression()),
    
])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for H1N1_vaccine

In [None]:
full_pipeline_1.fit(X_train, y_train)

In [None]:
preds = full_pipeline_1.predict(X_test)

In [None]:
# Evaluation metrics for the H1N1 vaccine (seasonal_vaccine used as a feature)
print("Accuracy: {:.2f}".format(accuracy_score(y_test, preds)))
print("Recall: {:.2f}".format(recall_score(y_test, preds)))
print("Precision: {:.2f}".format(precision_score(y_test, preds)))
print("F1: {:.2f}".format(f1_score(y_test, preds)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
lr_test_proba = full_pipeline_1.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, lr_test_proba)))

# Modelling Algorithms

In addition to logistic regression, we plan to try four different models to compare performance in terms of predicting the vaccine intake (Naive Bayes is listed but not yet instantiated in the cells below):

- K nearest neighbours
- Random Forest
- Support Vector Machine
- Naive Bayes

Instantiate the models:

In [None]:
knn_model = KNeighborsClassifier()
# Seeded with the notebook-wide RSEED so the forest's random sampling, and therefore the
# reported metrics, are reproducible between runs.
rand_forst_model = RandomForestClassifier(random_state=RSEED)
svm_model = svm.SVC(kernel='rbf')
#svm_model = svm.SVC(kernel='linear', C=1E10)



Create Pipeline for each:

In [None]:
# One multi-output wrapper per base model, so each can predict both target columns.
estimators_knn = MultiOutputClassifier(knn_model)                # KNN
estimators_rand_forst = MultiOutputClassifier(rand_forst_model)  # Random Forest
estimators_SVC = MultiOutputClassifier(svm_model)                # SVM



In [None]:
cat_features = list(df.columns)
cat_features.remove('h1n1_vaccine')
cat_features.remove('seasonal_vaccine')
#cat_features

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
# Multi-label pipelines: the same preprocessing step followed by one wrapped classifier each.

# KNN
full_pipeline_knn = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", estimators_knn),
])

# Random Forest
full_pipeline_rand_forst = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", estimators_rand_forst),
])

# SVM
full_pipeline_SVM = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", estimators_SVC),
])

In [None]:
y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()
y

In [None]:
y = y.to_numpy()
y


In [None]:
#NB: dropping the 'h1n1_vaccine' and 'seasonal_vaccine' columns
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

Fit the data:

In [None]:
full_pipeline_SVM.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

Get predictions:

In [None]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)


In [None]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

In [None]:
#SVM
svm_train_pred = full_pipeline_SVM.predict(X_train)
svm_test_pred = full_pipeline_SVM.predict(X_test)


### Evaluating model performance for Multilabel

KNN:

In [None]:
# Evaluation metrics for the H1N1 vaccine (target column 0), KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], knn_test_pred[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], knn_test_pred[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], knn_test_pred[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], knn_test_pred[:, 0])))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
# A multi-output classifier's predict_proba returns one (n_samples, n_classes) array per target.
knn_h1n1_test_proba = full_pipeline_knn.predict_proba(X_test)[0][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], knn_h1n1_test_proba)))

In [None]:
# Evaluation metrics for the seasonal flu vaccine (target column 1), KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], knn_test_pred[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], knn_test_pred[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], knn_test_pred[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], knn_test_pred[:, 1])))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
# A multi-output classifier's predict_proba returns one (n_samples, n_classes) array per target.
knn_seasonal_test_proba = full_pipeline_knn.predict_proba(X_test)[1][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], knn_seasonal_test_proba)))

Random Forest:

In [None]:
# Evaluation metrics for the H1N1 vaccine (target column 0), random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], rand_forst_test_pred[:, 0])))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
# A multi-output classifier's predict_proba returns one (n_samples, n_classes) array per target.
rf_h1n1_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[0][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], rf_h1n1_test_proba)))

In [None]:
# Evaluation metrics for the seasonal flu vaccine (target column 1), random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], rand_forst_test_pred[:, 1])))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
# A multi-output classifier's predict_proba returns one (n_samples, n_classes) array per target.
rf_seasonal_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[1][:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], rf_seasonal_test_proba)))

Support Vector Machine:

In [None]:
# Evaluation metrics for the H1N1 vaccine (target column 0), SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], svm_test_pred[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], svm_test_pred[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], svm_test_pred[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], svm_test_pred[:, 0])))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so it offers no predict_proba; the decision_function value of the
# fitted per-target SVC (index 0 = h1n1_vaccine) is used as the score instead.
svm_test_features = full_pipeline_SVM.named_steps["preprocessor"].transform(X_test)
svm_h1n1_test_scores = (
    full_pipeline_SVM.named_steps["estimators"].estimators_[0].decision_function(svm_test_features)
)
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], svm_h1n1_test_scores)))

In [None]:
# Evaluation metrics for the seasonal flu vaccine (target column 1), SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], svm_test_pred[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], svm_test_pred[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], svm_test_pred[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], svm_test_pred[:, 1])))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so it offers no predict_proba; the decision_function value of the
# fitted per-target SVC (index 1 = seasonal_vaccine) is used as the score instead.
svm_test_features = full_pipeline_SVM.named_steps["preprocessor"].transform(X_test)
svm_seasonal_test_scores = (
    full_pipeline_SVM.named_steps["estimators"].estimators_[1].decision_function(svm_test_features)
)
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], svm_seasonal_test_scores)))

# Single Label Modelling of the four other algorithms

## Predicting h1n1_vaccine  with Seasonal Flu Vaccine not in features

In [None]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only
y = y.to_numpy()
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for h1n1_vaccine

### Pipeline for the single label

In [None]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

# for SVM

full_pipeline_svm= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", svm_model),
    ]) 
    

In [None]:
full_pipeline_svm.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [None]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

In [None]:
#SVM

SVM_train_pred = full_pipeline_svm.predict(X_train)
SVM_forst_test_pred = full_pipeline_svm.predict(X_test)

### Evaluating model performance for single label (h1n1_vaccine)

KNN:

In [None]:
# Evaluation metrics for the H1N1 vaccine, single-label KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
knn_test_proba = full_pipeline_knn.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_proba)))

Random Forest:

In [None]:
# Evaluation metrics for the H1N1 vaccine, single-label random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
rf_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, rf_test_proba)))

SVM:

In [None]:
# Evaluation metrics for the H1N1 vaccine, single-label SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test, SVM_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, SVM_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, SVM_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, SVM_forst_test_pred)))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so its decision_function value is used as the score.
svm_test_scores = full_pipeline_svm.decision_function(X_test)
print("ROC: {:.2f}".format(roc_auc_score(y_test, svm_test_scores)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine not in features

In [None]:
y = df['seasonal_vaccine'].copy() # seasonal_vaccine only
y = y.to_numpy()
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED) # split for seasonal_vaccine

### Fitting Pipeline for the single label 

In [None]:
full_pipeline_svm.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [None]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

SVM:

In [None]:
#SVM
SVM_train_pred = full_pipeline_svm.predict(X_train)
SVM_forst_test_pred = full_pipeline_svm.predict(X_test)

KNN:

In [None]:
# Evaluation metrics for the seasonal flu vaccine, single-label KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
knn_test_proba = full_pipeline_knn.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_proba)))

Random Forest:

In [None]:
# Evaluation metrics for the seasonal flu vaccine, single-label random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
rf_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, rf_test_proba)))

SVM:

In [None]:
# Evaluation metrics for the seasonal flu vaccine, single-label SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test, SVM_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, SVM_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, SVM_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, SVM_forst_test_pred)))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so its decision_function value is used as the score.
svm_test_scores = full_pipeline_svm.decision_function(X_test)
print("ROC: {:.2f}".format(roc_auc_score(y_test, svm_test_scores)))

## Predicting h1n1_vaccine with Seasonal Flu Vaccine  in features

In [None]:
y = df['h1n1_vaccine'].copy() # for h1n1_vaccine only
y = y.to_numpy()
y

In [None]:
cat_features = list(df.columns)
cat_features.remove('h1n1_vaccine')

#cat_features

In [None]:
#NB: dropping the 'h1n1_vaccine' column
X = df.drop(columns=['h1n1_vaccine'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

    # for SVM

full_pipeline_svm= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", svm_model),
    ]) 

In [None]:
full_pipeline_svm.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [None]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

In [None]:
#SVM

SVM_train_pred = full_pipeline_svm.predict(X_train)
SVM_test_pred = full_pipeline_svm.predict(X_test)

KNN:

In [None]:
# Evaluation metrics for the H1N1 vaccine (seasonal_vaccine used as a feature), KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
knn_test_proba = full_pipeline_knn.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_proba)))

Random Forest:

In [None]:
# Evaluation metrics for the H1N1 vaccine (seasonal_vaccine used as a feature), random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
rf_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, rf_test_proba)))

SVM:

In [None]:
# Evaluation metrics for the H1N1 vaccine (seasonal_vaccine used as a feature), SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test, SVM_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, SVM_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, SVM_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, SVM_test_pred)))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so its decision_function value is used as the score.
svm_test_scores = full_pipeline_svm.decision_function(X_test)
print("ROC: {:.2f}".format(roc_auc_score(y_test, svm_test_scores)))

## Predicting Seasonal Flu Vaccine with h1n1_vaccine in features

In [None]:
y = df['seasonal_vaccine'].copy() # seasonal_vaccine only
y = y.to_numpy()
y

In [None]:
cat_features = list(df.columns)

cat_features.remove('seasonal_vaccine')
#cat_features

In [None]:
#NB: dropping the 'seasonal_vaccine' column
X = df.drop(columns=['seasonal_vaccine'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=RSEED)

In [None]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [None]:
# for KNN
full_pipeline_knn = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", knn_model),
])

# for Random Forest

full_pipeline_rand_forst= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", rand_forst_model),
    ])

#SVM

full_pipeline_svm= Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", svm_model),
    ]) 

In [None]:
full_pipeline_svm.fit(X_train, y_train)
full_pipeline_knn.fit(X_train, y_train)
full_pipeline_rand_forst.fit(X_train, y_train)

In [None]:
#KNN
knn_train_pred = full_pipeline_knn.predict(X_train)
knn_test_pred = full_pipeline_knn.predict(X_test)

In [None]:
#Random forest

rand_forst_train_pred = full_pipeline_rand_forst.predict(X_train)
rand_forst_test_pred = full_pipeline_rand_forst.predict(X_test)

In [None]:
#SVM

SVM_train_pred = full_pipeline_svm.predict(X_train)
SVM_test_pred = full_pipeline_svm.predict(X_test)

In [None]:
#full_pipeline_knn.fit(X_train, y_train)
#full_pipeline_rand_forst.fit(X_train, y_train)
#svm_model.fit(X_train, y_train)
#model_pipeline.fit(train.data, train.target)

KNN:

In [None]:
# Evaluation metrics for the seasonal flu vaccine (h1n1_vaccine used as a feature), KNN
print("Accuracy: {:.2f}".format(accuracy_score(y_test, knn_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, knn_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, knn_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, knn_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
knn_test_proba = full_pipeline_knn.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, knn_test_proba)))

Random Forest:

In [None]:
# Evaluation metrics for the seasonal flu vaccine (h1n1_vaccine used as a feature), random forest
print("Accuracy: {:.2f}".format(accuracy_score(y_test, rand_forst_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, rand_forst_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, rand_forst_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, rand_forst_test_pred)))
# ROC AUC ranks continuous scores: use P(class 1), not the thresholded 0/1 predictions.
rf_test_proba = full_pipeline_rand_forst.predict_proba(X_test)[:, 1]
print("ROC: {:.2f}".format(roc_auc_score(y_test, rf_test_proba)))

SVM:

In [None]:
# Evaluation metrics for the seasonal flu vaccine (h1n1_vaccine used as a feature), SVM
print("Accuracy: {:.2f}".format(accuracy_score(y_test, SVM_test_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, SVM_test_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, SVM_test_pred)))
print("F1: {:.2f}".format(f1_score(y_test, SVM_test_pred)))
# ROC AUC ranks continuous scores, not 0/1 labels. The SVC is created without
# probability=True, so its decision_function value is used as the score.
svm_test_scores = full_pipeline_svm.decision_function(X_test)
print("ROC: {:.2f}".format(roc_auc_score(y_test, svm_test_scores)))

The dummy classifier predicts everything to belong to the same class and thus has no discriminatory ability (between negative and positive class). Therefore, the AUC of 0.5 is expected.

## FEATURE IMPORTANCE (STILL A WORK IN PROGRESS)

In [None]:
from sklearn.feature_selection import RFE

### Trial with coef_ attribute from logistic regression

In [None]:
# full_pipeline.steps[1][1] is the multi-output step of the pipeline; its estimators_
# attribute holds the fitted logistic regressions, whose raw coefficient arrays are printed.
for clf in full_pipeline.steps[1][1].estimators_:
    print(clf.coef_)

The output gives us coefficients for features, but without knowing what features these coefficients belong to, this output is quite meaningless

### Feature importance extracted using ELI5  
https://towardsdatascience.com/extracting-feature-importances-from-scikit-learn-pipelines-18c79b4ae09a

In [None]:
#pip install eli5 in external terminal
import eli5

In [None]:
full_pipeline.steps

In [None]:
# Names of the one-hot encoded columns produced by full_pipeline's preprocessing step.
# The input column list is read from the fitted ColumnTransformer itself: by this point
# the global `cat_features` has been rebound to a different column list (it includes one
# of the targets), so passing it would not match the columns the encoder was fitted on.
fitted_preprocessor = full_pipeline.named_steps['preprocessor']
# transformers_ holds (name, fitted transformer, columns) triples; entry 0 is the 'cat' pipeline
_, fitted_cat_pipeline, fitted_cat_columns = fitted_preprocessor.transformers_[0]
onehot_encoder = fitted_cat_pipeline.named_steps['1hot']
if hasattr(onehot_encoder, 'get_feature_names_out'):
    # newer scikit-learn releases: get_feature_names was replaced by get_feature_names_out
    onehot_columns = list(onehot_encoder.get_feature_names_out(fitted_cat_columns))
else:
    onehot_columns = list(onehot_encoder.get_feature_names(input_features=fitted_cat_columns))

In [None]:
# explain_weights returns an explanation object; format_as_text renders it as a readable
# ranked table of feature weights instead of printing the object's raw repr.
for clf in full_pipeline.steps[1][1].estimators_:
    print(eli5.format_as_text(eli5.explain_weights(clf, feature_names=onehot_columns)))

Here we get the weights of the features and the feature name--but it looks fairly unreadable. We should be able to get the visual table from ELI5 with the ranking of the features

### Trying using permutation_importance (not really working for now)
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Permutation importance is computed on the complete fitted pipeline with the raw feature
# columns: the per-target estimators only accept one-hot encoded input, and the global
# X_test / y_test now belong to the last single-label experiment.
# The multi-label test split is therefore rebuilt with the same arguments used to train
# full_pipeline (same data, stratification, test size and seed).
X_multilabel = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y_multilabel = df[['h1n1_vaccine', 'seasonal_vaccine']].to_numpy()
_, X_test_multilabel, _, y_test_multilabel = train_test_split(
    X_multilabel, y_multilabel, stratify=y_multilabel, test_size=0.2, random_state=RSEED
)

# With no `scoring` given, the pipeline's own score method is used as the metric.
perm_result = permutation_importance(
    full_pipeline, X_test_multilabel, y_test_multilabel, random_state=RSEED
)

# Mean drop in score per original feature, most important first
perm_importances = pd.Series(
    perm_result.importances_mean, index=X_test_multilabel.columns
).sort_values(ascending=False)
print(perm_importances)