In [42]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, hamming_loss, f1_score


In [50]:
y = pd.read_csv('../Data/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv')
X = pd.read_csv('../Data/X_train_clean.csv')
print(f"Shape of X_train: {X.shape}")
print(f"Shape of y_train: {y.shape}")
labels = ['h1n1_vaccine', 'seasonal_vaccine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Shape of X_train: (26707, 30)
Shape of y_train: (26707, 3)


In [51]:
y_train

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
5303,5303,0,1
2703,2703,0,0
6586,6586,0,0
22563,22563,1,1
2338,2338,1,1
...,...,...,...
21575,21575,0,1
5390,5390,0,0
860,860,0,0
15795,15795,0,0


## This problem is a Multi-Label Problem:

$$
\mathcal{D} = \{(\mathbf{x_i}, \mathbf{y_i}) \; \text{for } i=1 ,2, ..., N\}
$$

Where:  $\mathbf{y_i} \: \in \; \mathbb{R}^2$

# Label Powerset Approach:

Treats each combination of labels as it's own distinct label in a multi-class setup:
The set of possible labels are therefore:

$$
\{ 00, \; 01, \; 10, \; 11 \}
$$

Therefore: $y \in \{ 1, \; 2, \; 3, \; 4 \}$ in that order.

In [52]:
print(y_train.head())

       respondent_id  h1n1_vaccine  seasonal_vaccine
5303            5303             0                 1
2703            2703             0                 0
6586            6586             0                 0
22563          22563             1                 1
2338            2338             1                 1


In [53]:
#Label Powerset model using XGBoost

# Vectorized encoding using binary logic
y_train_enc =  2 * y_train['h1n1_vaccine'].values + y_train['seasonal_vaccine'].values
y_test_enc =  2 * y_test['h1n1_vaccine'].values + y_test['seasonal_vaccine'].values

#y_train_enc = y_train_enc.to_numpy() if hasattr(y_train_enc, 'to_numpy') else y_train_enc


In [54]:
y_train_enc

array([1, 0, 0, ..., 0, 0, 0])

In [55]:
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=4,
    eval_metric="mlogloss",
    use_label_encoder=False
)
xgb_model.fit(X_train, y_train_enc)

y_pred_enc = xgb_model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [56]:
#decode the preds

codes = np.array(y_pred_enc) - 1

# Recover the original bits
h1n1 = (codes // 2) % 2
seasonal = codes % 2

# Recombine into DataFrame
y_pred = pd.DataFrame({'h1n1_vaccine': h1n1, 'seasonal_vaccine': seasonal})

In [57]:
y_pred.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,1,1
1,1,1
2,0,0
3,1,1
4,1,1


In [58]:
# Exact match accuracy
XGB_acc = np.mean(y_test_enc == y_pred_enc)
print(f"Exact Match Accuracy LPS XGBoost: {XGB_acc}")

Exact Match Accuracy LPS XGBoost: 0.5935355048046924


In [60]:
from sklearn.metrics import multilabel_confusion_matrix

mcm = multilabel_confusion_matrix(y_test[labels], y_pred[labels])
print(mcm)

[[[1547 4772]
  [ 442 1252]]

 [[ 864 3478]
  [2179 1492]]]


# Ensemble of Classifier Chains Approach

1) Have Predictors $k_1, k_2$