In [1]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME

RSEED = 42
# Modeling Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier



warnings.filterwarnings('ignore')


In [2]:
# The initial notebook end_of_first_iteration has been stripped down to only the necessary code 

In [3]:
df_features = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv')

In [4]:
df_target = pd.read_csv('../data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')

In [5]:
# Merge labels and features into one dataframe, joined on the shared
# 'respondent_id' key (pd.merge performs an inner join by default).

df = pd.merge(df_target, df_features, on=['respondent_id'])

# Data cleaning

Dropping of features with too many missing values:

In [6]:
# Features removed up front because too many of their values are missing.
col_drop = [
    'health_insurance',
    'employment_industry',
    'employment_occupation',
    'income_poverty',
    'marital_status',
    'employment_status',
]

df.drop(columns=col_drop, inplace=True)

Dropping of all rows with null values:

In [7]:
df.dropna(inplace=True)

In [8]:
# Renumber the rows after dropna. Because drop=True is not passed, the old
# index is kept as a new 'index' column; that column is dropped again further
# down together with 'respondent_id'.
df.reset_index(inplace=True)

We dropped all rows with missing values
- Maybe later on, we will want to refine this approach.

## EDA

In [9]:
# checking for balance in data

print(df.h1n1_vaccine.value_counts())
print(df.seasonal_vaccine.value_counts())

0    16906
1     4947
Name: h1n1_vaccine, dtype: int64
0    11371
1    10482
Name: seasonal_vaccine, dtype: int64


In [10]:
# Columns to drop because they are of little use as features:
# 'index' is the old row index added by reset_index, 'respondent_id' is the merge key.
col_drop = ['index', 'respondent_id']

df.drop(col_drop, axis=1, inplace=True)

We save the 'final' dataframe of our first iteration into a new .csv-file. 

In [11]:
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; pass index=False here (or index_col=0 when reading the file
# back) if that column is not wanted downstream.
df.to_csv('../data/Flu_Shot_Data_cleaned_1.csv')

## Creating Pipelines

In [12]:
# Pipeline for categorical features.
# drop='first' removes the first level of every feature, so the dummy columns
# of one feature are not perfectly collinear with the intercept.
# handle_unknown='error' makes transform raise on a category not seen during fit.
cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='error', drop='first'))
])

In [13]:
# Start from every remaining column of df; all of them (including the
# numerically coded survey answers) are one-hot encoded. The target column is
# removed from this list in the next cell.
cat_features = list(df.columns)

Removal of target variables from cat_features list:

In [14]:
cat_features.remove('h1n1_vaccine')

In [15]:
# Intentionally disabled (kept as a bare string, never executed):
# 'seasonal_vaccine' stays in cat_features and is therefore used as a
# predictor for h1n1_vaccine.
'''cat_features.remove('seasonal_vaccine')'''

"cat_features.remove('seasonal_vaccine')"

Rename the features and target to 'X' and 'y', to make the test-train split easier:

In [16]:
# y = df[['h1n1_vaccine', 'seasonal_vaccine']].copy()
y = df[['h1n1_vaccine']].copy()

In [17]:
y = y.to_numpy()

In [18]:
# NB: X keeps every column of df, including both vaccine columns. The
# ColumnTransformer defined below is only given cat_features, from which
# 'h1n1_vaccine' was removed above, so the target itself should not reach the
# estimator (assuming the default remainder='drop' -- TODO confirm).
# 'seasonal_vaccine' is still in cat_features and is used as a predictor.
X = df

In [19]:
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, cat_features)
])

In [20]:
# The estimator is a plain LinearRegression fitted on the single 0/1 target
# h1n1_vaccine. Its predictions are continuous values rather than class
# labels, so label-based metrics (accuracy, recall, ...) would need a
# threshold first.

estimators = LinearRegression()



In [21]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [22]:

# 80/20 split, stratified on the target and seeded with RSEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [23]:
# Report the dimensions of each split.
for split_name, split in (('X_train', X_train), ('X_test', X_test),
                          ('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name} shape:', split.shape)

X_train shape: (17482, 31)
X_test shape: (4371, 31)
y_train shape: (17482, 1)
y_test shape: (4371, 1)


In [24]:
full_pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['seasonal_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_

In [25]:
# Figure out later what this does and if we want to use it


#y_train_predicted = cross_val_predict(full_pipeline, X_train, y_train, cv=5)

In [26]:
preds = full_pipeline.predict(X_test)


Model evaluation

In [27]:
'''# Evaluation Metrices for H1N1 Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], preds[:, 0])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 0], preds[:, 0])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 0], preds[:, 0])))
print("F1: {:.2f}".format(f1_score(y_test[:, 0], preds[:, 0])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], preds[:, 0])))'''

'# Evaluation Metrices for H1N1 Vaccines\nprint("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 0], preds[:, 0])))\nprint("Recall: {:.2f}".format(recall_score(y_test[:, 0], preds[:, 0])))\nprint("Precision: {:.2f}".format(precision_score(y_test[:, 0], preds[:, 0])))\nprint("F1: {:.2f}".format(f1_score(y_test[:, 0], preds[:, 0])))\nprint("ROC: {:.2f}".format(roc_auc_score(y_test[:, 0], preds[:, 0])))'

In [28]:
'''# Evaluation Metrices for Seasonal Flu Vaccines
print("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], preds[:, 1])))
print("Recall: {:.2f}".format(recall_score(y_test[:, 1], preds[:, 1])))
print("Precision: {:.2f}".format(precision_score(y_test[:, 1], preds[:, 1])))
print("F1: {:.2f}".format(f1_score(y_test[:, 1], preds[:, 1])))
print("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], preds[:, 1])))'''

'# Evaluation Metrices for Seasonal Flu Vaccines\nprint("Accuracy: {:.2f}".format(accuracy_score(y_test[:, 1], preds[:, 1])))\nprint("Recall: {:.2f}".format(recall_score(y_test[:, 1], preds[:, 1])))\nprint("Precision: {:.2f}".format(precision_score(y_test[:, 1], preds[:, 1])))\nprint("F1: {:.2f}".format(f1_score(y_test[:, 1], preds[:, 1])))\nprint("ROC: {:.2f}".format(roc_auc_score(y_test[:, 1], preds[:, 1])))'

In [29]:
y_test[:, 0]

array([0, 0, 1, ..., 0, 0, 0])

In [30]:
y_test

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

## FEATURE IMPORTANCE (STILL A WORK IN PROGRESS)

### Applying statsmodels.formula for linear regression

In [31]:
# Importing the statsmodels formula
import statsmodels.formula.api as smf

Starting with a first try: 
- No split of train and test data 
- No dummy variables (because statsmodels will dummy code the categorical variables)

In [32]:

# Baseline OLS on the full cleaned dataframe (no train/test split).
# Columns are passed as-is: string columns are dummy-coded by the formula
# interface, numerically coded columns enter as plain numeric terms.
smf.ols(
    formula=(
        'h1n1_vaccine ~ h1n1_concern + h1n1_knowledge'
        ' + behavioral_antiviral_meds + behavioral_avoidance'
        ' + behavioral_face_mask + behavioral_wash_hands'
        ' + behavioral_large_gatherings + behavioral_outside_home'
        ' + behavioral_touch_face + doctor_recc_h1n1 + doctor_recc_seasonal'
        ' + chronic_med_condition + child_under_6_months + health_worker'
        ' + opinion_h1n1_vacc_effective + opinion_h1n1_risk'
        ' + opinion_h1n1_sick_from_vacc + opinion_seas_vacc_effective'
        ' + opinion_seas_risk + opinion_seas_sick_from_vacc'
        ' + age_group + education + race + sex + rent_or_own'
        ' + hhs_geo_region + census_msa + household_adults'
        ' + household_children'
    ),
    data=df,
).fit().summary()

0,1,2,3
Dep. Variable:,h1n1_vaccine,R-squared:,0.281
Model:,OLS,Adj. R-squared:,0.28
Method:,Least Squares,F-statistic:,189.4
Date:,"Mon, 19 Jul 2021",Prob (F-statistic):,0.0
Time:,09:47:09,Log-Likelihood:,-8366.6
No. Observations:,21853,AIC:,16830.0
Df Residuals:,21807,BIC:,17190.0
Df Model:,45,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.3751,0.020,-18.432,0.000,-0.415,-0.335
age_group[T.35 - 44 Years],0.0059,0.009,0.676,0.499,-0.011,0.023
age_group[T.45 - 54 Years],0.0128,0.008,1.594,0.111,-0.003,0.029
age_group[T.55 - 64 Years],0.0578,0.008,6.846,0.000,0.041,0.074
age_group[T.65+ Years],0.0746,0.009,8.535,0.000,0.057,0.092
education[T.< 12 Years],-0.0255,0.010,-2.611,0.009,-0.045,-0.006
education[T.College Graduate],0.0235,0.007,3.476,0.001,0.010,0.037
education[T.Some College],0.0050,0.007,0.730,0.465,-0.008,0.019
race[T.Hispanic],0.0139,0.013,1.042,0.298,-0.012,0.040

0,1,2,3
Omnibus:,2128.594,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2798.28
Skew:,0.855,Prob(JB):,0.0
Kurtosis:,3.383,Cond. No.,102.0


- The linear model performs poorly on the dataset (as expected).
- Not all variables have been converted to dummies yet; we will do this in the next step.
- Open point: finding a way to recombine the dummy variables into their original form.

In [42]:
# Linear regression with H1N1 as target. Every predictor is wrapped in C() so
# that it is treated as categorical and dummy-coded.
h1n1_predictors = [
    'seasonal_vaccine',
    'h1n1_concern',
    'h1n1_knowledge',
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'age_group',
    'education',
    'race',
    'sex',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'household_adults',
    'household_children',
]
h1n1_formula = 'h1n1_vaccine ~ ' + ' + '.join(f'C({name})' for name in h1n1_predictors)
results_h1n1 = smf.ols(formula=h1n1_formula, data=df).fit()

In [43]:
results_summary_h1n1 = results_h1n1.summary()
print(results_summary_h1n1)

                            OLS Regression Results                            
Dep. Variable:           h1n1_vaccine   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                     165.8
Date:                Mon, 19 Jul 2021   Prob (F-statistic):               0.00
Time:                        11:32:57   Log-Likelihood:                -7249.9
No. Observations:               21853   AIC:                         1.464e+04
Df Residuals:                   21781   BIC:                         1.522e+04
Df Model:                          71                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

- slightly better adjusted R2: 0.349 -> still very bad. 
- Trying with only the most correlated features (according to correlation matrix)

In [44]:
def results_summary_to_dataframe(results_h1n1):
    '''Collect the key figures of a fitted statsmodels result in one dataframe.

    Parameters
    ----------
    results_h1n1 : fitted statsmodels results object
        Must expose ``params``, ``pvalues`` and ``conf_int()``.

    Returns
    -------
    pandas.DataFrame
        One row per model term with the columns ``coeff``, ``pvals``,
        ``conf_lower`` and ``conf_higher`` (in that order).
    '''
    # Evaluate the confidence intervals once; columns 0 and 1 of the returned
    # frame are used as lower and upper bound.
    conf_bounds = results_h1n1.conf_int()

    # The dict is written in the desired column order, so no separate
    # reordering step is needed.
    return pd.DataFrame({
        "coeff": results_h1n1.params,
        "pvals": results_h1n1.pvalues,
        "conf_lower": conf_bounds[0],
        "conf_higher": conf_bounds[1],
    })


In [45]:
results_df_h1n1 = results_summary_to_dataframe(results_h1n1)

In [47]:
results_df_h1n1.sort_values(by=['coeff'], ascending=False)

Unnamed: 0,coeff,pvals,conf_lower,conf_higher
C(doctor_recc_h1n1)[T.1.0],0.345598,0.000000e+00,0.331622,0.359574
C(seasonal_vaccine)[T.1],0.254506,0.000000e+00,0.243131,0.265881
C(opinion_h1n1_risk)[T.5.0],0.250005,9.666086e-98,0.226768,0.273242
C(opinion_h1n1_vacc_effective)[T.5.0],0.192920,9.683696e-35,0.162217,0.223622
C(opinion_h1n1_risk)[T.4.0],0.168915,1.892024e-93,0.152847,0.184982
...,...,...,...,...
C(opinion_h1n1_sick_from_vacc)[T.2.0],-0.039333,4.320678e-10,-0.051678,-0.026987
C(h1n1_concern)[T.3.0],-0.039335,1.362052e-04,-0.059541,-0.019129
C(opinion_h1n1_sick_from_vacc)[T.3.0],-0.065372,9.956966e-02,-0.143170,0.012426
Intercept,-0.077785,5.196546e-04,-0.121712,-0.033859


In [48]:
results_df_h1n1.to_csv('feature_importance_h1n1.csv')

### Linear regression with limited number of features
- Choice of features: most influential ones according to coefficient weights

In [None]:
# Reduced model with seven predictors: seasonal_vaccine, doctor_recc_h1n1,
# doctor_recc_seasonal, opinion_h1n1_vacc_effective, opinion_h1n1_risk,
# opinion_seas_vacc_effective and opinion_seas_risk.
smf.ols(
    formula=(
        'h1n1_vaccine ~ C(seasonal_vaccine) + C(doctor_recc_h1n1)'
        ' + C(doctor_recc_seasonal) + C(opinion_h1n1_vacc_effective)'
        ' + C(opinion_h1n1_risk) + C(opinion_seas_vacc_effective)'
        ' + C(opinion_seas_risk)'
    ),
    data=df,
).fit().summary()

0,1,2,3
Dep. Variable:,h1n1_vaccine,R-squared:,0.335
Model:,OLS,Adj. R-squared:,0.335
Method:,Least Squares,F-statistic:,579.6
Date:,"Sat, 17 Jul 2021",Prob (F-statistic):,0.0
Time:,16:18:05,Log-Likelihood:,-7509.3
No. Observations:,21853,AIC:,15060.0
Df Residuals:,21833,BIC:,15220.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0425,0.015,-2.906,0.004,-0.071,-0.014
C(seasonal_vaccine)[T.1],0.2695,0.006,48.508,0.000,0.259,0.280
C(doctor_recc_h1n1)[T.1.0],0.3584,0.007,50.295,0.000,0.344,0.372
C(doctor_recc_seasonal)[T.1.0],-0.1420,0.006,-21.946,0.000,-0.155,-0.129
C(opinion_h1n1_vacc_effective)[T.2.0],-0.0072,0.017,-0.430,0.667,-0.040,0.026
C(opinion_h1n1_vacc_effective)[T.3.0],-0.0047,0.016,-0.293,0.770,-0.036,0.027
C(opinion_h1n1_vacc_effective)[T.4.0],0.0454,0.015,2.988,0.003,0.016,0.075
C(opinion_h1n1_vacc_effective)[T.5.0],0.1871,0.016,11.936,0.000,0.156,0.218
C(opinion_h1n1_risk)[T.2.0],0.0291,0.006,4.662,0.000,0.017,0.041

0,1,2,3
Omnibus:,1439.162,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1745.634
Skew:,0.661,Prob(JB):,0.0
Kurtosis:,3.41,Cond. No.,23.5


- With h1n1_vaccine as the target and seasonal_vaccine included among the features, the following features show the highest importance:
    - doctor_recc_h1n1
    - seasonal_vaccine
    - opinion_h1n1_vacc_effective
    - opinion_h1n1_risk

## Linear regression with seasonal vaccine as target variable 
- the H1N1 vaccine is included as a feature

In [None]:
# Linear regression with the seasonal vaccine as target. h1n1_vaccine is used
# as a predictor, and every predictor is wrapped in C() to be dummy-coded.
seasonal_predictors = [
    'h1n1_vaccine',
    'h1n1_concern',
    'h1n1_knowledge',
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'age_group',
    'education',
    'race',
    'sex',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'household_adults',
    'household_children',
]
seasonal_formula = 'seasonal_vaccine ~ ' + ' + '.join(
    f'C({name})' for name in seasonal_predictors
)
results = smf.ols(formula=seasonal_formula, data=df).fit()

In [None]:
results_summary = results.summary()
print(results_summary)

                            OLS Regression Results                            
Dep. Variable:       seasonal_vaccine   R-squared:                       0.429
Model:                            OLS   Adj. R-squared:                  0.428
Method:                 Least Squares   F-statistic:                     230.9
Date:                Sat, 17 Jul 2021   Prob (F-statistic):               0.00
Time:                        16:18:10   Log-Likelihood:                -9710.7
No. Observations:               21853   AIC:                         1.957e+04
Df Residuals:                   21781   BIC:                         2.014e+04
Df Model:                          71                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

- Adj. R-squared:	0.428 (seasonal_vaccine as target, all other variables as features)

In [None]:
# Creating a function that transforms the results into a dataframe
# hoping to have a better grip on the feature importance

def results_summary_to_dataframe(results):
    '''Turn a fitted statsmodels result into a tidy coefficient table.

    Parameters
    ----------
    results : fitted statsmodels results object
        Must expose ``params``, ``pvalues`` and ``conf_int()``.

    Returns
    -------
    pandas.DataFrame
        One row per model term with the columns ``coeff``, ``pvals``,
        ``conf_lower`` and ``conf_higher`` (in that order).
    '''
    # Fetch the confidence intervals a single time instead of once per bound.
    intervals = results.conf_int()
    lower, upper = intervals[0], intervals[1]

    # Columns are listed in their final order, which makes a later
    # reordering step unnecessary.
    table = {
        "coeff": results.params,
        "pvals": results.pvalues,
        "conf_lower": lower,
        "conf_higher": upper,
    }
    return pd.DataFrame(table)


In [None]:
results_df = results_summary_to_dataframe(results)

In [None]:
results_df.sort_values(by=['coeff'], ascending=False)

Unnamed: 0,coeff,pvals,conf_lower,conf_higher
C(opinion_seas_risk)[T.5.0],0.355755,1.349827e-204,0.333153,0.378357
C(h1n1_vaccine)[T.1],0.318791,0.000000e+00,0.304543,0.333039
C(opinion_seas_risk)[T.4.0],0.278352,9.697290e-211,0.260936,0.295769
C(doctor_recc_seasonal)[T.1.0],0.267276,4.406808e-299,0.253325,0.281227
C(age_group)[T.65+ Years],0.247381,7.457680e-147,0.228735,0.266026
...,...,...,...,...
Intercept,-0.086147,5.944759e-04,-0.135309,-0.036984
C(opinion_h1n1_risk)[T.5.0],-0.087208,7.531097e-11,-0.113453,-0.060962
C(opinion_seas_sick_from_vacc)[T.5.0],-0.112763,2.773389e-19,-0.137361,-0.088166
C(doctor_recc_h1n1)[T.1.0],-0.155225,7.684738e-77,-0.171559,-0.138891


In [None]:
results_df.to_csv('feature_importance.csv')

## Applying the sklearn linear regression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('1hot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['seasonal_vaccine',
                                                   'h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_

In [None]:
y_pred = full_pipeline.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Both scores compare y_test with the predictions made for X_test, i.e. they
# are measured on the held-out test split.
print('Mean absolute error      :', metrics.mean_absolute_error(y_test, y_pred))
print('R squared                :', metrics.r2_score(y_test, y_pred))

Mean absolute error      : 0.25095079075737264
R squared                : 0.3408536515020424


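- The model performs poorly on our dataset (note: two target variables were given in that run).
- Next step: do linear regression for single targets.
- The pipeline and data selection will be adjusted.
- Results for SGDClassifier on H1N1: very poor.
- Results for LinearRegression on H1N1: R squared = 0.3408536515020424 (still very poor; the statsmodels approach scores slightly higher with 0.351, but that value is an in-sample R squared on the full dataframe, whereas this one is measured on the held-out test split, so the two are not directly comparable).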

The scikit-learn example linked below is an extensive source for interpreting coefficients in linear regression models.
However, there does not seem to be a way of recombining the dummy variables.

https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html

In [None]:
# Disabled snippet (bare string, never executed).
# NOTE(review): it refers to names such as `model`, `categorical_columns` and
# `numerical_columns` that are not defined in the cells visible here, and to
# pipeline step names that differ from this notebook's 'preprocessor' /
# 'estimators' -- adapt before enabling.
'''feature_names = (model.named_steps['columntransformer']
                      .named_transformers_['onehotencoder']
                      .get_feature_names(input_features=categorical_columns))
feature_names = np.concatenate(
    [feature_names, numerical_columns])

coefs = pd.DataFrame(
    model.named_steps['transformedtargetregressor'].regressor_.coef_,
    columns=['Coefficients'], index=feature_names
)

coefs
'''