### **DATA UNDERSTANDING**


In [61]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, log_loss

In [2]:
# Read only the respondent-background and health-status predictors used in this analysis
columns = [
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    'household_adults',
    'household_children',
    'age_group',
    'education',
    'sex',
    'income_poverty',
    'employment_status',
]

df_x = pd.read_csv('data/training_set_features.csv', usecols=columns)
df_x.head()

Unnamed: 0,chronic_med_condition,child_under_6_months,health_worker,health_insurance,age_group,education,sex,income_poverty,employment_status,household_adults,household_children
0,0.0,0.0,0.0,1.0,55 - 64 Years,< 12 Years,Female,Below Poverty,Not in Labor Force,0.0,0.0
1,0.0,0.0,0.0,1.0,35 - 44 Years,12 Years,Male,Below Poverty,Employed,0.0,0.0
2,1.0,0.0,0.0,,18 - 34 Years,College Graduate,Male,"<= $75,000, Above Poverty",Employed,2.0,0.0
3,1.0,0.0,0.0,,65+ Years,12 Years,Female,Below Poverty,Not in Labor Force,0.0,0.0
4,0.0,0.0,0.0,,45 - 54 Years,Some College,Female,"<= $75,000, Above Poverty",Employed,1.0,0.0


In [3]:
# loading our target data
# The first CSV column is used as the row index, leaving the two vaccine
# label columns (h1n1_vaccine, seasonal_vaccine) as the frame's data.
df_y = pd.read_csv('data/training_set_labels.csv', index_col=0)
df_y.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0


In [4]:
# Column dtypes and non-null counts of the feature frame
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   chronic_med_condition  25736 non-null  float64
 1   child_under_6_months   25887 non-null  float64
 2   health_worker          25903 non-null  float64
 3   health_insurance       14433 non-null  float64
 4   age_group              26707 non-null  object 
 5   education              25300 non-null  object 
 6   sex                    26707 non-null  object 
 7   income_poverty         22284 non-null  object 
 8   employment_status      25244 non-null  object 
 9   household_adults       26458 non-null  float64
 10  household_children     26458 non-null  float64
dtypes: float64(6), object(5)
memory usage: 2.2+ MB


### **DATA PREPARATION**

In [5]:
# Does either target column contain any missing value?
df_y.isna().any()

h1n1_vaccine        False
seasonal_vaccine    False
dtype: bool

In [6]:
# Number of missing entries per feature, largest first
missing_values = df_x.isna().sum()

missing_values.sort_values(ascending=False)

health_insurance         12274
income_poverty            4423
employment_status         1463
education                 1407
chronic_med_condition      971
child_under_6_months       820
health_worker              804
household_adults           249
household_children         249
sex                          0
age_group                    0
dtype: int64

In [7]:
# Distribution of household_adults before any recoding (NaN rows are not counted)
df_x['household_adults'].value_counts()

household_adults
1.0    14474
0.0     8056
2.0     2803
3.0     1125
Name: count, dtype: int64

In [8]:

# Recode 0.0 -> 1.0 in household_adults; every other value is left untouched.
# NOTE(review): the rationale for this recode is not recorded in the notebook —
# confirm that 0 is really an invalid value for this survey field before
# keeping the step, since it merges two of the four observed categories.
df_x['household_adults'] = df_x['household_adults'].replace({0.0: 1.0})

# Verify the change
print(df_x['household_adults'].value_counts())

household_adults
1.0    22530
2.0     2803
3.0     1125
Name: count, dtype: int64


In [9]:
# Fill gaps in the numeric columns with each column's most frequent value.
# health_insurance is left out here; it is imputed separately further down.
numerical_columns = df_x.select_dtypes(include='number').columns.drop('health_insurance')

impute_number = SimpleImputer(strategy='most_frequent')
impute_number.fit(df_x[numerical_columns])
df_x[numerical_columns] = impute_number.transform(df_x[numerical_columns])

In [10]:
# Fill gaps in the non-numeric columns with each column's most frequent category
categorical_columns = df_x.select_dtypes(exclude='number').columns

# NOTE: despite its name, `impute_number` is rebound here to the imputer used
# for the categorical columns.
impute_number = SimpleImputer(strategy='most_frequent')
impute_number.fit(df_x[categorical_columns])
df_x[categorical_columns] = impute_number.transform(df_x[categorical_columns])

In [11]:
# Re-count missing values after the two imputation steps
df_x.isnull().sum()

chronic_med_condition        0
child_under_6_months         0
health_worker                0
health_insurance         12274
age_group                    0
education                    0
sex                          0
income_poverty               0
employment_status            0
household_adults             0
household_children           0
dtype: int64

In [12]:
# Build the design matrices for imputing health_insurance with a multiple
# linear regression: rows with an observed value are used for fitting, rows
# with a missing value are the ones to be predicted.
insurance_predictors = ['chronic_med_condition', 'age_group', 'employment_status', 'health_worker', 'income_poverty', 'household_children', 'child_under_6_months']

has_insurance_value = df_x['health_insurance'].notnull()
df_known = df_x[has_insurance_value].copy()
df_missing = df_x[~has_insurance_value].copy()

X_known = df_known[insurance_predictors].copy()
y_known = df_known['health_insurance'].copy()
X_missing = df_missing[insurance_predictors].copy()

# Integer-code age_group and income_poverty. The encoder is fitted on the
# known rows and then reused, unchanged, on the missing rows.
label_encoder = LabelEncoder()
for source_col in ('age_group', 'income_poverty'):
    encoded_col = f'{source_col}_encoded'
    X_known[encoded_col] = label_encoder.fit_transform(X_known[source_col])
    X_missing[encoded_col] = label_encoder.transform(X_missing[source_col])

# One-hot encode employment_status (categories learned from the known rows)
ohe = OneHotEncoder(sparse_output=False)
X_known_employment = ohe.fit_transform(X_known[['employment_status']])
X_missing_employment = ohe.transform(X_missing[['employment_status']])

# Wrap the encoded arrays in frames that keep the original row index so they
# line up with the other feature groups when concatenated.
ohe_feature_names = ohe.get_feature_names_out(['employment_status'])
X_known_employment_df = pd.DataFrame(X_known_employment, index=X_known.index, columns=ohe_feature_names)
X_missing_employment_df = pd.DataFrame(X_missing_employment, index=X_missing.index, columns=ohe_feature_names)

# Columns that are already numeric are passed through as they are
numerical_features = ['chronic_med_condition', 'health_worker', 'household_children', 'child_under_6_months']
X_known_numerical = X_known[numerical_features].copy()
X_missing_numerical = X_missing[numerical_features]

# Final matrices: numeric columns + integer-coded columns + one-hot columns
encoded_features = ['age_group_encoded', 'income_poverty_encoded']
X_known_processed = pd.concat([X_known_numerical, X_known[encoded_features], X_known_employment_df], axis=1)
X_missing_processed = pd.concat([X_missing_numerical, X_missing[encoded_features], X_missing_employment_df], axis=1)


In [13]:
# Fit a linear regression on the rows whose health_insurance value is known,
# then score the rows where it is missing. The predictions are continuous
# values, not 0/1 labels.
model = LinearRegression().fit(X_known_processed, y_known)
predicted_health_insurance = model.predict(X_missing_processed)

predicted_health_insurance

array([0.80587987, 0.9401926 , 0.88182975, ..., 0.89893579, 0.97346349,
       0.92976335], shape=(12274,))

In [14]:
# Turn the continuous regression output into 0/1 labels:
# predictions at or above the cut-off become 1, the rest 0.
threshold = 0.5
at_or_above_cutoff = predicted_health_insurance >= threshold
binary_outcomes = at_or_above_cutoff.astype(int)
binary_outcomes

array([1, 1, 1, ..., 1, 1, 1], shape=(12274,))

In [15]:
# How many of the imputed rows were assigned 0 (below the threshold)?
(binary_outcomes == 0).sum()

np.int64(52)

In [16]:
# Write the imputed labels back into the rows they were predicted for.
# The rows are addressed through X_missing_processed.index (the frame the
# predictions were computed from) rather than by re-testing for NaN. This keeps
# the values aligned with their rows and makes the cell safe to re-run: after
# the first run an isnull() mask would select no rows and no longer match the
# length of binary_outcomes.
df_x.loc[X_missing_processed.index, 'health_insurance'] = binary_outcomes
df_x.isnull().sum()

chronic_med_condition    0
child_under_6_months     0
health_worker            0
health_insurance         0
age_group                0
education                0
sex                      0
income_poverty           0
employment_status        0
household_adults         0
household_children       0
dtype: int64

In [17]:
# Class balance of health_insurance after imputation
df_x['health_insurance'].value_counts()

health_insurance
1.0    24919
0.0     1788
Name: count, dtype: int64

In [18]:
# Features and the two binary targets.
# NOTE(review): X and y are paired by row position, so this presumably relies
# on both CSV files listing respondents in the same order — confirm.
X = df_x
y = df_y[['h1n1_vaccine', 'seasonal_vaccine']]

# Hold out 25% of the rows for testing, stratified on the targets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [19]:
# Non-numeric columns of the training split, i.e. the ones that still need encoding
categorical_columns = X_train.select_dtypes(exclude='number').columns
categorical_columns

Index(['age_group', 'education', 'sex', 'income_poverty', 'employment_status'], dtype='object')

In [20]:
# encoding categorical features
# The frames returned by train_test_split are derived from df_x; take explicit
# copies first so the column assignments below act on independent frames and
# do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

label_encoder = LabelEncoder()
columns_a = ['age_group', 'education', 'income_poverty']
columns_b = ['sex', 'employment_status']

# Applying Label Encoding: fitted on the training split only, reused on test.
# NOTE(review): LabelEncoder numbers categories in sorted order, which need not
# match the natural ordering of education / income_poverty levels — confirm
# this is acceptable, since these codes are standardised and fed to a linear model.
for column in columns_a:
    X_train[column] = label_encoder.fit_transform(X_train[column])
    X_test[column] = label_encoder.transform(X_test[column])

# Applying One-Hot Encoding
X_train = pd.get_dummies(X_train, columns=columns_b, drop_first=False)
X_test = pd.get_dummies(X_test, columns=columns_b, drop_first=False)

# get_dummies only creates columns for categories present in the frame it is
# given, so train and test can end up with different columns. Align test to
# train: missing indicator columns are added as 0, test-only columns are dropped,
# and the column order matches what the model will be fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_train.head()

Unnamed: 0,chronic_med_condition,child_under_6_months,health_worker,health_insurance,age_group,education,income_poverty,household_adults,household_children,sex_Female,sex_Male,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed
962,0.0,0.0,1.0,1.0,0,0,0,1.0,1.0,True,False,True,False,False
6417,1.0,0.0,0.0,1.0,0,2,0,1.0,0.0,False,True,True,False,False
20805,0.0,0.0,0.0,1.0,3,2,1,1.0,0.0,False,True,True,False,False
13168,0.0,0.0,0.0,1.0,1,2,0,1.0,3.0,True,False,False,True,False
19226,1.0,0.0,0.0,1.0,3,3,0,2.0,1.0,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,1.0,1.0,0.0,1.0,0,2,0,1.0,0.0,True,False,True,False,False
22262,0.0,0.0,0.0,1.0,4,1,0,1.0,0.0,True,False,False,True,False
18765,0.0,0.0,0.0,1.0,4,2,1,1.0,0.0,False,True,False,True,False
3250,0.0,0.0,0.0,1.0,0,0,2,1.0,0.0,True,False,True,False,False


In [21]:
# Cast the one-hot indicator columns from boolean to 0/1 integers,
# first in the training split and then in the test split
convert_int = [
    'sex_Female',
    'sex_Male',
    'employment_status_Employed',
    'employment_status_Not in Labor Force',
    'employment_status_Unemployed',
]

for frame in (X_train, X_test):
    for col in convert_int:
        frame[col] = frame[col].astype(int)

X_train.head()

Unnamed: 0,chronic_med_condition,child_under_6_months,health_worker,health_insurance,age_group,education,income_poverty,household_adults,household_children,sex_Female,sex_Male,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed
962,0.0,0.0,1.0,1.0,0,0,0,1.0,1.0,1,0,1,0,0
6417,1.0,0.0,0.0,1.0,0,2,0,1.0,0.0,0,1,1,0,0
20805,0.0,0.0,0.0,1.0,3,2,1,1.0,0.0,0,1,1,0,0
13168,0.0,0.0,0.0,1.0,1,2,0,1.0,3.0,1,0,0,1,0
19226,1.0,0.0,0.0,1.0,3,3,0,2.0,1.0,1,0,1,0,0


### Normalization

In [22]:
# Standardise the household counts and the integer-coded categoricals.
# The scaler learns its mean/variance from the training split only and is
# then applied unchanged to the test split.
scaling_columns = ['household_adults', 'household_children', 'age_group', 'education', 'income_poverty']

scaler = StandardScaler()
scaler.fit(X_train[scaling_columns])

X_train[scaling_columns] = scaler.transform(X_train[scaling_columns])
X_test[scaling_columns] = scaler.transform(X_test[scaling_columns])

X_train.head()

Unnamed: 0,chronic_med_condition,child_under_6_months,health_worker,health_insurance,age_group,education,income_poverty,household_adults,household_children,sex_Female,sex_Male,employment_status_Employed,employment_status_Not in Labor Force,employment_status_Unemployed
962,0.0,0.0,1.0,1.0,-1.496956,-1.624917,-0.681908,-0.385359,0.508168,1,0,1,0,0
6417,1.0,0.0,0.0,1.0,-1.496956,0.240246,-0.681908,-0.385359,-0.573511,0,1,1,0,0
20805,0.0,0.0,0.0,1.0,0.55773,0.240246,0.806932,-0.385359,-0.573511,0,1,1,0,0
13168,0.0,0.0,0.0,1.0,-0.812061,0.240246,-0.681908,-0.385359,2.671526,1,0,0,1,0
19226,1.0,0.0,0.0,1.0,0.55773,1.172827,-0.681908,1.674068,0.508168,1,0,1,0,0


### Build and Evaluate Baseline Model

In [24]:
# Base learner: logistic regression with the liblinear solver and a fixed seed
logistic_model = LogisticRegression(solver='liblinear', random_state=42)

# Wrap the base learner so that one classifier is trained per target column
multilabel_logistic = MultiOutputClassifier(estimator=logistic_model)

# Fit on the training split
multilabel_logistic.fit(X_train, y_train)

In [25]:
# Hard 0/1 predictions for both targets, on the training and the test split
y_pred_train, y_pred_test = (
    multilabel_logistic.predict(split) for split in (X_train, X_test)
)

In [56]:
# Test-set precision under both multilabel averaging schemes
precision_micro, precision_macro = (
    precision_score(y_test, y_pred_test, average=scheme)
    for scheme in ('micro', 'macro')
)
print(f'Precision micro: {precision_micro}')
print(f'Precision macro: {precision_macro}')


Precision micro: 0.6415218104296491
Precision macro: 0.5932353825280894


* **Micro** – pools the predictions for both vaccines: out of all the times the model predicted that someone received a vaccine (either H1N1 or seasonal), it was correct about 64.15% of the time.
* **Macro** – computes precision separately for each vaccine label and averages the two, giving each label equal weight regardless of its frequency in the dataset. A macro precision of 0.5932 means that, averaged over the two vaccine types, a positive prediction was correct about 59.32% of the time.
* The lower macro precision indicates that the model's performance might be worse on the less frequent label.
* **Model improvement** – starting from these baseline scores, the model can be improved by:
  * Feature engineering
  * Trying different classification algorithms
  * Tuning the hyperparameters of the current model
  * Addressing potential class imbalance issues (if uptake of one vaccine is much higher than the other)

In [36]:
# Test-set recall under both multilabel averaging schemes
recall_micro, recall_macro = (
    recall_score(y_test, y_pred_test, average=scheme)
    for scheme in ('micro', 'macro')
)

print(f'Recall micro: {recall_micro}')
print(f'Recall macro: {recall_macro}')

Recall micro: 0.4320742213386349
Recall macro: 0.32320096666441345


Micro vs. Macro: Similar to precision, the micro recall is higher than the macro recall. This again suggests that the model might be better at identifying the more frequent class (if there is one). The lower macro recall indicates that the model struggles more with at least one of the vaccine types (likely the less frequent one).

In [37]:
# Test-set F1 under both multilabel averaging schemes
f1_micro, f1_macro = (
    f1_score(y_test, y_pred_test, average=scheme)
    for scheme in ('micro', 'macro')
)

print(f'f1 micro: {f1_micro}')
print(f'f1 macro: {f1_macro}')

f1 micro: 0.5163674762407603
f1 macro: 0.3445479145590183


In [39]:
# Per-label precision / recall / F1 on the test set.
# Without target_names the rows are printed as "0" and "1", which are the
# positions of the target columns but are easily misread as the negative and
# positive class; label each row with its target column name instead.
print(classification_report(y_test, y_pred_test, target_names=list(y_test.columns), zero_division=0))

              precision    recall  f1-score   support

           0       0.54      0.03      0.06      1418
           1       0.64      0.61      0.63      3109

   micro avg       0.64      0.43      0.52      4527
   macro avg       0.59      0.32      0.34      4527
weighted avg       0.61      0.43      0.45      4527
 samples avg       0.28      0.23      0.25      4527



In [55]:
# One confusion matrix per target. Column 0 of the prediction array is paired
# with h1n1_vaccine and column 1 with seasonal_vaccine.
h1n1_pred, seasonal_pred = y_pred_test[:, 0], y_pred_test[:, 1]

cm_h1n1 = confusion_matrix(y_test['h1n1_vaccine'], h1n1_pred)
cm_seasonal = confusion_matrix(y_test['seasonal_vaccine'], seasonal_pred)

print(f'H1N1: {cm_h1n1}')
print('\n........\n')
print(f'Seasonal: {cm_seasonal}')

H1N1: [[5221   38]
 [1373   45]]

........

Seasonal: [[2513 1055]
 [1198 1911]]


In [63]:
# 5-fold cross-validation with shuffling and a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def multilabel_neg_log_loss(estimator, X, y):
    """Scorer: negative log loss averaged over the label columns of ``y``.

    Parameters
    ----------
    estimator : fitted multi-output classifier
        Its ``predict_proba(X)`` must return one probability array per label
        column of ``y``.
    X : array-like
        Features to score.
    y : 2-D array-like (DataFrame or ndarray)
        True 0/1 labels, one column per label.

    Returns
    -------
    float
        The mean of the per-label log losses, negated so that larger values
        are better, as ``cross_val_score`` expects from a scoring callable.
    """
    # Accept both DataFrames and plain arrays for the true labels
    y_true = np.asarray(y)
    y_pred_proba_list = estimator.predict_proba(X)

    label_losses = []
    for i in range(y_true.shape[1]):
        # Column 1 is taken as the probability of the positive class
        # (assumes both classes 0 and 1 were seen when the label's model was fitted)
        positive_proba = y_pred_proba_list[i][:, 1]
        label_losses.append(log_loss(y_true[:, i], positive_proba, labels=[0, 1]))
    return -sum(label_losses) / len(label_losses)

cv_scores_neg_log_loss = cross_val_score(multilabel_logistic, X_train, y_train, scoring=multilabel_neg_log_loss, cv=cv)

# Show the per-fold scores so the result of the cross-validation is visible
cv_scores_neg_log_loss
