In [25]:
import numpy as np
import pandas as pd
from constants import *

from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Notebook-wide seed; passed to train_test_split so the train/eval split is repeatable.
RANDOM_SEED = 6

# Show up to 100 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 100

In [2]:
# Load the training features and the training labels; both CSVs are indexed
# by the 'respondent_id' column so rows can be aligned by respondent.
features_df = pd.read_csv(TRAINING_FEATURES_PATH, index_col='respondent_id')
labels_df = pd.read_csv(TRAINING_LABELS_PATH, index_col='respondent_id')

# Features and labels side by side (DataFrame.join aligns on the index,
# keeping every row of features_df).
joined_df = features_df.join(labels_df)

In [3]:
# Every feature column whose dtype is not "object" is treated as numeric.
# The result is a NumPy array of column names, displayed as the cell output.
is_non_object_dtype = features_df.dtypes != "object"
numeric_cols = features_df.columns[is_non_object_dtype].values
numeric_cols

array(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'], dtype=object)

In [4]:
# Column groups used to route features to preprocessing branches.

# String-valued columns handled as unordered categories.
categorical_columns = [
    'race', 'sex', 'marital_status',
    'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation',
]

# String-valued columns kept in their own group (named as ordinal).
ord_cols = ['age_group', 'education', 'income_poverty', 'employment_status']

# Numeric survey-scale columns (concern/knowledge levels and opinion ratings).
# These names also appear in numeric_cols; this list is not passed to the
# ColumnTransformer in the cells shown in this notebook.
ordinal_numeric_columns = [
    'h1n1_concern', 'h1n1_knowledge',
    'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
]

In [17]:
# Coverage check for the preprocessing column groups.
# A ColumnTransformer with remainder="drop" silently discards every column that
# is not listed in one of its branches, so a bare count is not enough: name the
# feature columns that belong to none of the groups so an omission is visible.
assigned_cols = [*numeric_cols, *categorical_columns, *ord_cols]
unassigned_cols = features_df.columns.difference(assigned_cols).tolist()
print("feature columns not assigned to any group:", unassigned_cols)

# Total number of columns routed to a preprocessing branch (the original cell output).
len(assigned_cols)

34

In [39]:
# Build the end-to-end model: column-wise preprocessing followed by one
# CatBoost classifier per target column.

# Numeric branch: fill missing values with each column's most frequent value,
# then standardize.
numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    ('standard_scaler', StandardScaler()),
])

# Both string-valued groups are one-hot encoded with the same settings:
# the first level of each column is dropped, and categories unseen during
# fit are ignored at transform time instead of raising.
categorical_preprocessing_steps = OneHotEncoder(drop='first', handle_unknown="ignore")
ordinal_preprocessing_steps = OneHotEncoder(drop='first', handle_unknown="ignore")

# Route each column group to its branch as (branch name, transformer, columns).
# remainder="drop" discards any feature column not named in one of the groups.
# The numeric survey-scale columns get no dedicated branch; they are part of
# numeric_cols and go through the numeric branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_preprocessing_steps, numeric_cols),
        ('categorical', categorical_preprocessing_steps, categorical_columns),
        ('ordinal_columns', ordinal_preprocessing_steps, ord_cols),
    ],
    remainder="drop",
)

# MultiOutputClassifier fits a separate copy of the base estimator for each
# label column; verbose=0 silences CatBoost's per-iteration training log.
estimators = MultiOutputClassifier(estimator=CatBoostClassifier(verbose=0))

full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [40]:
# Hold out 33% of the labelled rows for evaluation. Rows are shuffled with a
# fixed seed and the split is stratified on the label frame so the train and
# eval parts keep similar label proportions.
split_options = dict(
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED,
)
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df, labels_df, **split_options
)

In [41]:
# Train on the training split only.
full_pipeline.fit(X=X_train, y=y_train)

# Class probabilities for the held-out rows. The cells below index the result
# as preds[k][:, 1], i.e. one probability array per target column.
preds = full_pipeline.predict_proba(X=X_eval)

In [34]:
# Keep only the probability in column 1 of each per-target array and label the
# result with the target names, indexed like y_eval so rows line up for scoring.
h1n1_proba = preds[0][:, 1]
seasonal_proba = preds[1][:, 1]
y_preds = pd.DataFrame(
    {"h1n1_vaccine": h1n1_proba, "seasonal_vaccine": seasonal_proba},
    index=y_eval.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.416994,0.543614
16516,0.142872,0.50071
3106,0.143042,0.706672
16981,0.898247,0.950919
19111,0.229258,0.831416


In [35]:
# ROC AUC on the held-out rows, macro-averaged over the target columns
# (macro is roc_auc_score's default averaging; spelled out here for clarity).
roc_auc_score(y_true=y_eval, y_score=y_preds, average="macro")

0.850154136281466

# Fit on Full Training Dataset

In [39]:
# Refit the same pipeline object on every labelled row before predicting the
# test set. This overwrites the split-trained model in place, so the evaluation
# cells above must not be re-run after this one without refitting on X_train.
full_pipeline.fit(X=features_df, y=labels_df)

Learning rate set to 0.041889
0:	learn: 0.6635008	total: 19.9ms	remaining: 19.9s
1:	learn: 0.6363401	total: 37.6ms	remaining: 18.8s
2:	learn: 0.6130426	total: 54.8ms	remaining: 18.2s
3:	learn: 0.5909012	total: 72.3ms	remaining: 18s
4:	learn: 0.5714509	total: 88.9ms	remaining: 17.7s
5:	learn: 0.5546941	total: 107ms	remaining: 17.7s
6:	learn: 0.5384350	total: 123ms	remaining: 17.4s
7:	learn: 0.5245718	total: 141ms	remaining: 17.5s
8:	learn: 0.5119335	total: 159ms	remaining: 17.5s
9:	learn: 0.5000462	total: 174ms	remaining: 17.2s
10:	learn: 0.4897264	total: 189ms	remaining: 17s
11:	learn: 0.4804173	total: 204ms	remaining: 16.8s
12:	learn: 0.4733017	total: 219ms	remaining: 16.6s
13:	learn: 0.4654231	total: 235ms	remaining: 16.6s
14:	learn: 0.4590457	total: 250ms	remaining: 16.4s
15:	learn: 0.4524228	total: 264ms	remaining: 16.2s
16:	learn: 0.4461560	total: 279ms	remaining: 16.1s
17:	learn: 0.4415083	total: 294ms	remaining: 16s
18:	learn: 0.4365798	total: 308ms	remaining: 15.9s
19:	learn: 0

# Generate Predictions for the Test Set

In [40]:
# Load the test features and the submission template, both indexed by
# 'respondent_id'.
test_features_df = pd.read_csv(
    TEST_FEATURES_PATH,
    index_col="respondent_id"
)

submission_df = pd.read_csv(
    SUBMISSION_FORMAT_PATH,
    index_col="respondent_id"
)

# Fail fast: confirm the rows are in the same order BEFORE running inference,
# because the probabilities are written into the template positionally.
np.testing.assert_array_equal(
    test_features_df.index.values,
    submission_df.index.values
)

test_probas = full_pipeline.predict_proba(test_features_df)

# Save predictions to submission data frame: probability in column 1 of each
# per-target array, in the same target order used for the evaluation frame.
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

# Create the output directory if it is missing so to_csv cannot fail on a
# non-existent folder after the model has already been trained.
# NOTE(review): assumes SUBMISSION_DIR is a pathlib.Path (it is combined with
# `/` here) — confirm in constants.
submission_path = SUBMISSION_DIR / 'catboost-1.csv'
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=True)