In [1]:
import numpy as np
import pandas as pd

In [2]:
TRAINING_FEATURES_PATH="D:/ml_competitions/flu_shot_learning/data/raw/training_set_features.csv"
TRAINING_LABELS_PATH="D:/ml_competitions/flu_shot_learning/data/raw/training_set_labels.csv"
TEST_FEATURES_PATH="D:/ml_competitions/flu_shot_learning/data/raw/test_set_features.csv"
SUBMISSION_FORMAT_PATH="D:/ml_competitions/flu_shot_learning/data/raw/submission_format.csv"

SUBMISSION_DIR="D:/ml_competitions/flu_shot_learning/submissions"

In [3]:
features_df         = pd.read_csv(TRAINING_FEATURES_PATH,   index_col='respondent_id')
labels_df           = pd.read_csv(TRAINING_LABELS_PATH,     index_col='respondent_id')
test_features_df    = pd.read_csv(TEST_FEATURES_PATH,       index_col="respondent_id")

In [4]:
categorical_columns = [
    'race',
    'sex',
    'marital_status',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
    'rent_or_own'
]

ordinal_numeric_columns = [
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'household_adults',
    'household_children',
]

ordinal_object_columns = [
    'age_group',
    'education',
    'income_poverty',
]

boolean_columns = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance', 
    'behavioral_face_mask', 
    'behavioral_wash_hands',
    'behavioral_large_gatherings', 
    'behavioral_outside_home',
    'behavioral_touch_face', 
    'doctor_recc_h1n1', 
    'doctor_recc_seasonal',
    'chronic_med_condition', 
    'child_under_6_months', 
    'health_worker',
    'health_insurance',
]

# Ordinal Mapping
ordinal_mapping = [
    {
        'col': 'age_group',
        'mapping': {
            '18 - 34 Years': 0, 
            '35 - 44 Years': 1, 
            '45 - 54 Years': 2,
            '55 - 64 Years': 3, 
            '65+ Years': 4,
        }
    },
    {
        'col': 'education',
        'mapping': {
            '< 12 Years': 0, 
            '12 Years': 1, 
            'College Graduate': 2, 
            'Some College': 3
        }
    },
    {
        'col': 'income_poverty',
        'mapping': {
            'Below Poverty': 0, 
            '<= $75,000, Above Poverty': 1,
            '> $75,000': 2
        }
    }
]

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder

category_preprocessor = make_pipeline(
    OneHotEncoder(drop='first', sparse_output=False),
)

ordinal_object_preprocessor = make_pipeline(
    OrdinalEncoder(mapping=ordinal_mapping),
    SimpleImputer(strategy='constant', fill_value=-1),
)

default_preprocessor = ColumnTransformer(
    transformers=[
        ('boolean_imputer', SimpleImputer(strategy='mean'), boolean_columns),
        ('ordinal_numeric_imputer', SimpleImputer(strategy='mean'), ordinal_numeric_columns),
        ('ordinal_object_preprocessor', ordinal_object_preprocessor, ordinal_object_columns),
        ('category_preprocessor', category_preprocessor, categorical_columns)
    ],
    remainder='passthrough'
)
# default_preprocessor.set_output(transform='pandas')
default_preprocessor

# XGBoost Pipeline

In [10]:
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier


xgb_clf = XGBClassifier(
    **{
        "objective":"binary:logistic",
        "tree_method": 'gpu_hist',
        "verbosity": 0,
        "eval_metric": "auc",
        'booster': 'dart',
        'lambda': 0.7140230812009954,
        'alpha': 1.6175277748886985e-07,
        'n_estimators': 32,
        'max_depth': 5,
        'eta': 0.5225844217014856,
        'gamma': 2.8043035483101756e-07,
        'grow_policy': 'depthwise',
        'sample_type': 'uniform',
        'normalize_type': 'tree',
        'rate_drop': 8.812616601318049e-05,
        'skip_drop': 5.645780622823094e-06,
    }
)


xgb_pipeline = Pipeline([
    ('preprocessor', default_preprocessor),
    ('classifier', xgb_clf),
]).set_output(transform='default')

xgb_pipeline

In [78]:
kfold = KFold(n_splits=10, shuffle=True, random_state=68)
results = cross_val_score(
    xgb_pipeline, 
    features_df, 
    labels_df.h1n1_vaccine, 
    cv=kfold, scoring='roc_auc')
results.mean()

0.862841462045534

# Light GBM Pipeline

In [11]:
from sklearn.model_selection import KFold, cross_val_score
from lightgbm import LGBMClassifier


lgbm_clf = LGBMClassifier(verbose=0, force_row_wise=True)


lgbm_pipeline = Pipeline([
    ('preprocessor', default_preprocessor),
    ('classifier', lgbm_clf),
]).set_output(transform='default')

lgbm_pipeline

In [76]:
kfold = KFold(n_splits=10, shuffle=True, random_state=68)
results = cross_val_score(
    lgbm_pipeline, 
    features_df, 
    labels_df.h1n1_vaccine, 
    cv=kfold, scoring='roc_auc')
results.mean()

0.8697359187570968

# Catboost Pipeline

In [12]:
from sklearn.model_selection import KFold, cross_val_score
from catboost import CatBoostClassifier

ordinal_object_preprocessor = make_pipeline(
    OrdinalEncoder(mapping=ordinal_mapping),
    SimpleImputer(strategy='constant', fill_value=-1),
)

preprocessor_catboost = ColumnTransformer(
    transformers=[
        ('boolean_imputer', SimpleImputer(strategy='mean'), boolean_columns),
        ('ordinal_numeric_imputer', SimpleImputer(strategy='mean'), ordinal_numeric_columns),
        ('ordinal_object_preprocessor', ordinal_object_preprocessor, ordinal_object_columns),
        ('category_preprocessor', SimpleImputer(strategy='constant', fill_value='None'), categorical_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')


catboost_pipeline = Pipeline([
    ('preprocessor', preprocessor_catboost),
    ('classifier', CatBoostClassifier(cat_features=categorical_columns, 
                                      **{
                                        'logging_level': 'Silent',
                                        # 'iterations': 1500,
                                        # 'learning_rate': 0.015494337354824417,
                                        # 'random_strength': 7,
                                        # 'bagging_temperature': 6,
                                        # 'max_bin': 5,
                                        # 'grow_policy': 'Depthwise',
                                        # 'min_data_in_leaf': 10,
                                        # 'max_depth': 10,
                                        # 'l2_leaf_reg': 80.31439723807587,
                                        # 'one_hot_max_size': 5,
                                        # 'auto_class_weights': 'SqrtBalanced'
                                        }
                                      )
     ),
]).set_output(transform='pandas')

catboost_pipeline

In [83]:
kfold = KFold(n_splits=2, shuffle=True, random_state=68)
results = cross_val_score(
    catboost_pipeline, 
    features_df, 
    labels_df.h1n1_vaccine, 
    cv=kfold, scoring='roc_auc')
results.mean()

0.8694785071220704

# Stacking Classifier

In [13]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


clf = StackingClassifier(
    estimators=[
        ('xgboost', xgb_pipeline), 
        ('catboost', catboost_pipeline),
        ('lightgbm', lgbm_pipeline),
        
    ], 
    final_estimator=LogisticRegression(class_weight='balanced', random_state=68),
    stack_method='predict_proba',
    cv=5
)
clf

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    features_df, 
    labels_df.h1n1_vaccine, 
    test_size=0.3, random_state=68)

clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.873852743850767  
0.8751732886187005 - catboost no params  

In [15]:
# H1N1

h1n1_clf = StackingClassifier(
    estimators=[
        ('xgboost', xgb_pipeline), 
        ('catboost', catboost_pipeline),
        ('lightgbm', lgbm_pipeline),
        
    ], 
    final_estimator=LogisticRegression(class_weight='balanced', random_state=68),
    stack_method='predict_proba',
)

h1n1_clf.fit(features_df, labels_df.h1n1_vaccine)

In [16]:
# Seasonal

seasonal_clf = StackingClassifier(
    estimators=[
        ('xgboost', xgb_pipeline), 
        ('catboost', catboost_pipeline),
        ('lightgbm', lgbm_pipeline),
        
    ], 
    final_estimator=LogisticRegression(class_weight='balanced', random_state=68),
    stack_method='predict_proba',
)

seasonal_clf.fit(features_df, labels_df.seasonal_vaccine)

In [17]:
from pathlib import Path


test_features_df = pd.read_csv(TEST_FEATURES_PATH, index_col="respondent_id")

submission_df = pd.read_csv(
    SUBMISSION_FORMAT_PATH, 
    index_col="respondent_id"
)

np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = h1n1_clf.predict_proba(test_features_df)[:, 1]
submission_df["seasonal_vaccine"] = seasonal_clf.predict_proba(test_features_df)[:, 1]

submission_df.to_csv(Path(SUBMISSION_DIR) / 'stack-xgb-catboost-lgbm.csv', index=True)