In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Let wide frames display up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

In [2]:
# Training features and labels are both indexed by the respondent_id column.
features_df = pd.read_csv("Data/training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv("Data/training_set_labels.csv", index_col="respondent_id")

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
RANDOM_SEED = 8    # Set a random seed for reproducibility!


from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.preprocessing import FunctionTransformer

from numpy import nan
import copy

In [4]:
# Split the feature columns by dtype: object-typed columns on one side,
# everything else on the other.
is_object_col = features_df.dtypes == "object"
numeric_cols = features_df.columns[~is_object_col].values
non_numeric_cols = features_df.columns[is_object_col].values

In [5]:
# Ordered categorical columns that are recoded to numbers through an explicit
# lookup table instead of being one-hot encoded.

# Numeric stand-in for each age bracket (roughly the bracket midpoint).
age_mean_map = {
    '18 - 34 Years': 26,
    '35 - 44 Years': 40,
    '45 - 54 Years': 50,
    '55 - 64 Years': 60,
    '65+ Years': 70,
}

# Rank of each income bracket, from lowest (1) to highest (3).
income_map = {
    'Below Poverty': 1,
    '<= $75,000, Above Poverty': 2,
    '> $75,000': 3,
}

# Column name -> lookup table used to recode that column.
full_map = {
    "age_group": age_mean_map,
    "income_poverty": income_map,
}

# The columns handled this way are exactly the keys of full_map, in order.
seperate_processing_cols = list(full_map)

def map_to_numeric(x, mapping):
    """Translate ``x`` through ``mapping``, falling back to NaN.

    Parameters
    ----------
    x : hashable
        Raw value to look up (for example a category label).
    mapping : dict
        Lookup table from raw values to their numeric replacement.

    Returns
    -------
    object
        ``mapping[x]`` when ``x`` is a key whose value is not ``None``;
        otherwise ``nan``.
    """
    value = mapping.get(x)
    # Identity test on purpose: `== None` dispatches to the value's own
    # __eq__, which is ambiguous (and raises) for array-like values.
    return nan if value is None else value

def convert(df):
    """Return a copy of ``df`` with the lookup-coded columns made numeric.

    Each column named in ``seperate_processing_cols`` is translated through
    its dictionary in ``full_map``. Values that have no entry in the
    dictionary, including missing values, come out as NaN. The input frame
    itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``seperate_processing_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, the listed ones recoded.
    """
    new_df = df.copy()
    for col in seperate_processing_cols:
        # Series.map with a dict is vectorised and produces NaN for any value
        # that is not a key, so no per-element Python callback is needed.
        new_df[col] = df[col].map(full_map[col])
    return new_df

# Wrap the recoding function so it can be used as a pipeline step.
numeric_to_cat_transofrmer = FunctionTransformer(func=convert)

# Sanity check: the distinct codes produced for the income column.
convert(features_df).loc[:, "income_poverty"].unique()


array([ 1.,  2.,  3., nan])

In [6]:
# Remove from both column lists anything flagged as "high missing" (nothing is
# flagged at the moment) and the columns that get their own lookup-based
# pipeline.
high_missing_cols = []
non_numeric_cols = [
    col for col in non_numeric_cols
    if not (col in high_missing_cols or col in seperate_processing_cols)
]
numeric_cols = [
    col for col in numeric_cols
    if not (col in high_missing_cols or col in seperate_processing_cols)
]
# One preprocessing Pipeline per kind of column.
# Every step is a (step name, transformer) tuple.

# Lookup-coded columns: recode to numbers, mean-impute the gaps, standardise.
numeric_to_cat_preprocessing_steps = Pipeline(
    steps=[
        ('convert_to_cat', numeric_to_cat_transofrmer),
        ('simple_imputer', SimpleImputer(strategy='mean')),
        ('standard_scaler', StandardScaler()),
    ]
)

# Columns that are already numeric.
# NOTE(review): the scaler runs before the imputer here, the opposite order of
# the pipeline above — confirm this is intentional.
numeric_preprocessing_steps = Pipeline(
    [
        ('standard_scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='mean')),
    ]
)

# Object-typed columns: missing values become their own 'missing' category,
# then everything is one-hot encoded; categories unseen at fit time are ignored.
non_numeric_preprocessing_steps = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
    
# Preprocessing stage of the final pipeline: route each group of columns to
# its own pipeline. Entries are (name, transformer, list of columns); any
# column not listed in one of the groups is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_preprocessing_steps, numeric_cols),
        ("non_numeric", non_numeric_preprocessing_steps, non_numeric_cols),
        ("numeric_to_cat", numeric_to_cat_preprocessing_steps, seperate_processing_cols),
    ],
    remainder="drop",
)

In [7]:
# Candidate base learners. The MLP and the tree ensembles are all given an
# explicit seed so that re-running the notebook reproduces the same fit.
est_mlp = MLPClassifier(
    hidden_layer_sizes=(400, 300, 200),
    learning_rate='invscaling',
    learning_rate_init=0.0001,
    power_t=0.5,
    max_iter=1500,
    shuffle=True,
    tol=0.0001,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    max_fun=15000,
    random_state=RANDOM_SEED,
)
# NOTE(review): XGBoost keeps its own fixed seed (42) rather than RANDOM_SEED.
est_xgb = XGBClassifier(
    learning_rate=0.02,
    n_estimators=750,
    max_depth=6,
    min_child_weight=2,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.4,
    reg_alpha=0.1,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    reg_lambda=10,
    random_state=42,
)
est_gb = GradientBoostingClassifier(random_state=RANDOM_SEED, n_estimators=750)
# The forest was previously unseeded, so its bootstrap samples and feature
# subsets changed on every run; seed it like the other learners.
est_rn = RandomForestClassifier(
    max_depth=15,
    bootstrap=True,
    n_estimators=1500,
    random_state=RANDOM_SEED,
)
# Defined with default settings; not part of the stacked ensemble built below.
est_svc = SVC()
est_nb = GaussianNB()
est_kn = KNeighborsClassifier()

# Base learners of the stack, as (name, estimator) pairs.
estimators_stacked = [
    ('xbg', est_xgb),
    ('mlp', est_mlp),
    ('gb', est_gb),
    ('rn', est_rn),
]

# A logistic regression combines the base learners' predicted probabilities.
stacked_estimator = StackingClassifier(
    estimators=estimators_stacked,
    final_estimator=LogisticRegression(),
    stack_method='predict_proba',
)

In [8]:
# Wrap the stack in a multi-output classifier so it can be fitted against the
# multi-column label frame.
estimators = MultiOutputClassifier(estimator=stacked_estimator)

In [9]:
# End-to-end model: column preprocessing followed by the multi-output stack.
full_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("estimators", estimators),
    ]
)

In [10]:
# Hold out 33% of the rows for evaluation. The split is shuffled, seeded, and
# stratified on the label frame.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=labels_df,
)

In [11]:
%%time

# Fit on the training split, then score the evaluation split.
# The competition is scored on probabilities, so use predict_proba rather
# than predict.
preds = full_pipeline.fit(X_train, y_train).predict_proba(X_eval)
preds

CPU times: total: 14min 28s
Wall time: 19min 1s


[array([[0.94363755, 0.05636245],
        [0.9426512 , 0.0573488 ],
        [0.94257069, 0.05742931],
        ...,
        [0.93232379, 0.06767621],
        [0.94885224, 0.05114776],
        [0.82086076, 0.17913924]]),
 array([[0.8760435 , 0.1239565 ],
        [0.91100972, 0.08899028],
        [0.09721584, 0.90278416],
        ...,
        [0.89273101, 0.10726899],
        [0.92733726, 0.07266274],
        [0.5351588 , 0.4648412 ]])]

In [12]:
# Shape of the probability array for each target. The labels name `preds`,
# the variable actually being inspected (the old labels said `test_probas`,
# which does not exist yet at this point in the notebook).
print("preds[0].shape", preds[0].shape)
print("preds[1].shape", preds[1].shape)

test_probas[0].shape (8814, 2)
test_probas[1].shape (8814, 2)


In [13]:
# Keep only column 1 of each target's probability array and line the values
# up with the evaluation rows.
y_preds = pd.DataFrame(
    data={
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index=y_eval.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3610,0.056362,0.123956
12424,0.057349,0.08899
11459,0.057429,0.902784
11896,0.048415,0.077581
26524,0.295623,0.652902


In [14]:
# ROC AUC of the predicted probabilities against the held-out labels.
roc_auc_score(y_true=y_eval, y_score=y_preds)

0.8718649711116049

In [15]:
%time 

full_pipeline.fit(features_df, labels_df)

None   # So we don't print out the whole pipeline representation

CPU times: total: 0 ns
Wall time: 0 ns


In [16]:
# Test-set features, indexed by respondent_id like the training frames.
test_features_df = pd.read_csv("Data/test_set_features.csv", index_col="respondent_id")

In [17]:
# Predict probabilities for the test features. The result is indexed per
# target ([0], [1]); column 1 of each array is what gets written to the
# submission further down.
test_probas = full_pipeline.predict_proba(test_features_df)

In [18]:
# Submission template, indexed by respondent_id.
submission_df = pd.read_csv("Data/submission_format.csv", index_col="respondent_id")

In [19]:
# The predictions are assigned by position, so the template rows must be in
# exactly the same order as the test features.
np.testing.assert_array_equal(
    test_features_df.index.to_numpy(),
    submission_df.index.to_numpy(),
)

# Write column 1 of each target's probability array into the template.
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.102491,0.20097
26708,0.051829,0.071105
26709,0.151248,0.766801
26710,0.744467,0.897939
26711,0.235384,0.411965


In [20]:
# Write the filled-in template to disk, keeping respondent_id as the first column.
submission_df.to_csv(path_or_buf='submissionAbz.csv', index=True)