In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from pathlib import Path
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
pd.set_option("display.max_columns", 100)


from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Set a random seed for reproducibility!


import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training features and labels, the test features and the submission
# template. Every file is indexed by its `respondent_id` column.
features_df = pd.read_csv("/kaggle/input/h1n1-dataset/training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv("/kaggle/input/h1n1-dataset/training_set_labels.csv", index_col="respondent_id")
test_features_df = pd.read_csv("/kaggle/input/h1n1-dataset/test_set_features.csv", index_col="respondent_id")
submission_df = pd.read_csv("/kaggle/input/h1n1-dataset/submission_format.csv", index_col="respondent_id")

In [None]:
# Report the size of the training feature table and preview its first rows.
print("features_df.shape", features_df.shape)
features_df.head()

In [None]:
# Show the dtype pandas assigned to each feature column.
features_df.dtypes

In [None]:
# Report the size of the label table and preview its first rows.
print("labels_df.shape", labels_df.shape)
labels_df.head()

In [None]:
# Sanity check: features and labels must carry the same respondent ids in the
# same order, otherwise rows and targets would be mismatched during training.
np.testing.assert_array_equal(features_df.index.values, labels_df.index.values)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# One horizontal bar chart per target, stacked vertically on a shared x axis.
fig, ax = plt.subplots(2, 1, sharex=True)

n_obs = labels_df.shape[0]

for panel, (label_name, panel_title) in zip(
    ax,
    [
        ('h1n1_vaccine', "Proportion of H1N1 Vaccine"),
        ('seasonal_vaccine', "Proportion of Seasonal Vaccine"),
    ],
):
    # Class counts divided by the number of respondents give proportions.
    label_share = labels_df[label_name].value_counts().div(n_obs)
    label_share.plot.barh(title=panel_title, ax=panel)
    panel.set_ylabel(label_name)

fig.tight_layout()

In [None]:
# Joint distribution of the two targets: rows are h1n1_vaccine, columns are
# seasonal_vaccine. `normalize=True` reports proportions instead of counts and
# `margins=True` appends the row/column totals.
pd.crosstab(
    labels_df["h1n1_vaccine"], 
    labels_df["seasonal_vaccine"], 
    margins=True,
    normalize=True
)

In [None]:
# For two binary variables the phi coefficient equals Pearson's r, so a plain
# Pearson correlation between the two label columns is all that is needed.
h1n1_labels = labels_df["h1n1_vaccine"]
seasonal_labels = labels_df["seasonal_vaccine"]
h1n1_labels.corr(seasonal_labels, method="pearson")

In [None]:
# features_df["rent_or_own"] = features_df["rent_or_own"].map({"Own": 1, "Rent":0})
# features_df["marital_status"] = features_df["marital_status"].map({"Married": 1, "Not Married":0})
# #features_df["sex"] = features_df["sex"].map({"Male": 0, "Female":1})
# features_df['age_group'] = features_df['age_group'].map({"18 - 34 Years": 1, "35 - 44 Years":2, "45 - 54 Years":3, "55 - 64 Years":4, "65+ Years":5})
# ############################

# features_df['education'] = features_df['education'].map({"< 12 Years": 1, "12 Years":2, "Some College":3, "College Graduate":4})
# features_df['income_poverty'] = features_df['income_poverty'].map({"Below Poverty": 1, "> $75,000":2, "<= $75,000, Above Poverty":3})
# #features_df["race"] = features_df["race"].astype("category")
# features_df = pd.get_dummies(features_df, columns = ["race"],prefix="R")
# features_df['census_msa'] = features_df['census_msa'].astype("category")
# features_df = pd.get_dummies(features_df, columns = ['census_msa'],prefix="C")

In [None]:
# Attach the two label columns to the features (joined on the shared index)
# so features and targets can be explored together.
joined_df = features_df.join(labels_df)
print(joined_df.shape)
joined_df.head()

In [None]:
# Respondent counts per (h1n1_concern, h1n1_vaccine) pair, pivoted so that
# each concern level is a row and each vaccine outcome is a column.
concern_and_vaccine = ['h1n1_concern', 'h1n1_vaccine']
pair_sizes = joined_df[concern_and_vaccine].groupby(concern_and_vaccine).size()
counts = pair_sizes.unstack('h1n1_vaccine')
counts

In [None]:
# Raw counts per concern level, one bar per vaccine outcome.
legend_options = dict(
    loc='center right',
    bbox_to_anchor=(1.3, 0.5),
    title='h1n1_vaccine',
)
ax = counts.plot.barh()
ax.invert_yaxis()  # lowest concern level at the top
ax.legend(**legend_options)

In [None]:
# Total number of respondents at each h1n1_concern level (row sums of `counts`).
h1n1_concern_counts = counts.sum(axis='columns')
h1n1_concern_counts

In [None]:
# Divide each row of `counts` by that row's total so every concern level
# shows the share of respondents per vaccine outcome rather than raw counts.
props = counts.div(h1n1_concern_counts, axis='index')
props

In [None]:
# Side-by-side bars of the within-level vaccination proportions.
ax = props.plot.barh()
ax.invert_yaxis()
# Park the legend just outside the right edge of the axes.
ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title='h1n1_vaccine')

In [None]:
# Same proportions drawn as stacked bars, so each concern level is one bar.
outside_right = (1.05, 0.5)
ax = props.plot.barh(stacked=True)
ax.invert_yaxis()
ax.legend(loc='center left', bbox_to_anchor=outside_right, title='h1n1_vaccine')

In [None]:
def vaccination_rate_plot(col, target, data, ax=None):
    """Stacked bar chart of vaccination rate for `target` against 
    `col`. 
    
    Args:
        col (string): column name of feature variable
        target (string): column name of target variable
        data (pandas DataFrame): dataframe that contains columns 
            `col` and `target`
        ax (matplotlib axes object, optional): matplotlib axes 
            object to attach plot to. When omitted, the axes that
            pandas creates for the plot is used instead.
    """
    # Count rows per (target, col) pair from the frame that was passed in,
    # not from a notebook-level global, then pivot the target values into
    # columns.
    counts = (data[[target, col]]
                  .groupby([target, col])
                  .size()
                  .unstack(target)
             )
    # Normalise each feature level by its own total to get proportions.
    group_counts = counts.sum(axis='columns')
    props = counts.div(group_counts, axis='index')

    # Use the axes returned by the plot call so that `ax=None` also works.
    ax = props.plot(kind="barh", stacked=True, ax=ax)
    ax.invert_yaxis()
    ax.legend().remove()

In [None]:
# Features to chart against both targets; comment entries in or out to
# change what is plotted.
cols_to_plot = [
    'h1n1_concern',
    'h1n1_knowledge',
#     'opinion_h1n1_vacc_effective',
#     'opinion_h1n1_risk',
#     'opinion_h1n1_sick_from_vacc',
#     'opinion_seas_vacc_effective',
#     'opinion_seas_risk',
#     'opinion_seas_sick_from_vacc',
#     'sex',
#     'age_group',
#     'race',
#     'rent_or_own',
#     'marital_status',
#     'education',
#     'income_poverty',
#     'census_msa',
#     'behavioral_antiviral_meds',
#     'behavioral_avoidance',
#     'behavioral_face_mask',
#     'behavioral_wash_hands',
#     'behavioral_large_gatherings',
#     'behavioral_outside_home',
#     'behavioral_touch_face',
#     'doctor_recc_h1n1',
#     'doctor_recc_seasonal',
#     'chronic_med_condition',
#     'child_under_6_months',
#     'health_worker',
#     'health_insurance',
#     'employment_status',
#     'hhs_geo_region',
#     'household_adults',
#     'household_children',
#     'employment_industry',
#     'employment_occupation'
]

# One row per feature, one column per target. squeeze=False keeps `ax`
# two-dimensional even when a single feature is selected, so the ax[idx, j]
# indexing below works for any non-empty list.
fig, ax = plt.subplots(
    len(cols_to_plot), 2, figsize=(9,len(cols_to_plot)*2.5), squeeze=False
)
for idx, col in enumerate(cols_to_plot):
    vaccination_rate_plot(
        col, 'h1n1_vaccine', joined_df, ax=ax[idx, 0]
    )
    vaccination_rate_plot(
        col, 'seasonal_vaccine', joined_df, ax=ax[idx, 1]
    )
    
# vaccination_rate_plot removes each panel's legend; add one shared legend
# above the top panel of each column instead.
ax[0, 0].legend(
    loc='lower center', bbox_to_anchor=(0.5, 1.05), title='h1n1_vaccine'
)
ax[0, 1].legend(
    loc='lower center', bbox_to_anchor=(0.5, 1.05), title='seasonal_vaccine'
)
fig.tight_layout()

In [None]:
# Boolean mask over the columns: True where the column's dtype is not `object`.
features_df.dtypes != "object"

In [None]:
# Names of the columns whose dtype is not `object`, taken before any encoding.
# NOTE: `numeric_cols` is recomputed further down once the features are encoded.
numeric_cols = features_df.columns[features_df.dtypes != "object"].values
print(numeric_cols)

In [None]:
# Integer codes for the string-valued answers. `.map` turns any answer that
# is absent from its mapping into NaN.
value_encodings = {
    "rent_or_own": {"Own": 1, "Rent": 0},
    "marital_status": {"Married": 1, "Not Married": 0},
    "sex": {"Male": 0, "Female": 1},
    "age_group": {
        "18 - 34 Years": 1,
        "35 - 44 Years": 2,
        "45 - 54 Years": 3,
        "55 - 64 Years": 4,
        "65+ Years": 5,
    },
    "education": {
        "< 12 Years": 1,
        "12 Years": 2,
        "Some College": 3,
        "College Graduate": 4,
    },
    # NOTE(review): "> $75,000" is coded 2, between the other two brackets.
    # Confirm that is intended, and keep it identical to the test-set cell.
    "income_poverty": {
        "Below Poverty": 1,
        "> $75,000": 2,
        "<= $75,000, Above Poverty": 3,
    },
}
for column, encoding in value_encodings.items():
    features_df[column] = features_df[column].map(encoding)

# Missing industries become their own 'other' category before one-hot encoding.
features_df['employment_industry'] = features_df['employment_industry'].fillna('other')

# One-hot encode the nominal columns, each with its own column-name prefix.
for column, prefix in (("race", "R"), ("census_msa", "C"), ("employment_industry", "EI")):
    features_df[column] = features_df[column].astype("category")
    features_df = pd.get_dummies(features_df, columns=[column], prefix=prefix)

# features_df['employment_occupation'] = features_df['employment_occupation'].fillna('other')
# features_df['employment_occupation'] = features_df['employment_occupation'].astype("category")
# features_df = pd.get_dummies(features_df, columns = ['employment_occupation'],prefix="EO")

# features_df['hhs_geo_region'] = features_df['hhs_geo_region'].astype("category")
# features_df = pd.get_dummies(features_df, columns = ['hhs_geo_region'],prefix="H")


In [None]:
# Recompute the non-object column list now that the features are encoded.
# This is the list handed to the ColumnTransformer; columns that are still
# `object` dtype are left out of it.
numeric_cols = features_df.columns[features_df.dtypes != "object"].values
print(numeric_cols)

In [None]:
# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
# NOTE(review): scaling is listed before imputation, so the scaler receives
# the still-missing values; confirm this ordering is intentional.
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])

# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
# remainder="drop" discards every column that is not in `numeric_cols`.
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [None]:
# estimators = MultiOutputClassifier(
#     estimator=LogisticRegression(penalty="l2", C=1)
# )

In [None]:
# Wrap a default-configured LightGBM classifier in MultiOutputClassifier so
# both label columns are handled by a single estimator object.
# NOTE(review): RANDOM_SEED is not passed to LGBMClassifier; confirm whether
# the model should be seeded for reproducibility.
estimators = MultiOutputClassifier(
    estimator=LGBMClassifier()
)

In [None]:
# End-to-end model: column preprocessing followed by the multi-output
# classifier, so fit/predict_proba can be called on the feature frame directly.
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [None]:
# Display the assembled pipeline to check its steps.
full_pipeline

# Training & Evaluation

In [None]:
# Hold out 33% of the labelled rows for evaluation. The split is shuffled,
# seeded with RANDOM_SEED and stratified on the label frame (both label
# columns), so the hold-out set is meant to mirror the label distribution.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [None]:
# Preview the encoded feature frame that the split above was taken from.
features_df.head()

In [None]:
%%time

# Train model
full_pipeline.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
# `preds` is indexed below as preds[0] and preds[1], one probability array
# per label column.
preds = full_pipeline.predict_proba(X_eval)
preds

In [None]:
# Show the shape of each per-label probability array. The printed labels now
# name `preds`, the variable actually inspected here (they used to say
# `test_probas`, which is not defined until the test-set prediction cell).
print("preds[0].shape", preds[0].shape)
print("preds[1].shape", preds[1].shape)

In [None]:
# Column 1 of each probability array is used as the positive-class probability.
# Collect one such column per label, indexed like the hold-out labels.
label_names = ["h1n1_vaccine", "seasonal_vaccine"]
positive_probas = {
    label: label_probas[:, 1]
    for label, label_probas in zip(label_names, preds)
}
y_preds = pd.DataFrame(positive_probas, index=y_eval.index)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

In [None]:
def plot_roc(y_true, y_score, label_name, ax):
    """Draw the ROC curve for one label on `ax`.

    Args:
        y_true: true labels for the target
        y_score: predicted scores/probabilities for the target
        label_name (string): name shown in the panel title
        ax (matplotlib axes object): axes to draw on

    The title reports the AUC computed from the same inputs, formatted to
    four decimals.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    ax.plot(false_pos_rate, true_pos_rate)
    # Dashed grey diagonal from (0, 0) to (1, 1) as a reference line.
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    ax.set_ylabel('TPR')
    ax.set_xlabel('FPR')
    auc_value = roc_auc_score(y_true, y_score)
    ax.set_title(f"{label_name}: AUC = {auc_value:.4f}")


In [None]:
# ROC curves for both targets side by side, computed on the hold-out split.
fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

for roc_axis, label_name in zip(ax, ('h1n1_vaccine', 'seasonal_vaccine')):
    plot_roc(
        y_eval[label_name],
        y_preds[label_name],
        label_name,
        ax=roc_axis,
    )
fig.tight_layout()


first four: 0.8484697480778163

all: 0.8534343989219157, without final: 0.854415004633476, without final and race: 0.8548721887341852

gbc all: 0.8634498111280882

In [None]:
# Single summary ROC AUC computed over both label columns at once
# (presumably sklearn's default averaging across the two labels; confirm).
roc_auc_score(y_eval, y_preds)

In [None]:
%%time 

# Refit the pipeline on all labelled rows (training + hold-out) before
# predicting on the test set.
full_pipeline.fit(features_df, labels_df)

None   # So we don't print out the whole pipeline representation

In [None]:
# Encode the test features exactly as the training features were encoded, so
# the fitted pipeline receives the same columns with the same meaning.
test_features_df["rent_or_own"] = test_features_df["rent_or_own"].map({"Own": 1, "Rent":0})
test_features_df["marital_status"] = test_features_df["marital_status"].map({"Married": 1, "Not Married":0})
test_features_df["sex"] = test_features_df["sex"].map({"Male": 0, "Female":1})
# Fixed: age_group must come from the *test* frame. It used to be read from
# features_df, the training frame, which is already integer-coded (so the
# string mapping finds no matches) and is indexed by other respondents.
test_features_df['age_group'] = test_features_df['age_group'].map({"18 - 34 Years": 1, "35 - 44 Years":2, "45 - 54 Years":3, "55 - 64 Years":4, "65+ Years":5})
#########

test_features_df['education'] = test_features_df['education'].map({"< 12 Years": 1, "12 Years":2, "Some College":3, "College Graduate":4})
test_features_df['income_poverty'] = test_features_df['income_poverty'].map({"Below Poverty": 1, "> $75,000":2, "<= $75,000, Above Poverty":3})

test_features_df["race"] = test_features_df["race"].astype("category")
test_features_df = pd.get_dummies(test_features_df, columns = ["race"],prefix="R")

test_features_df['census_msa'] = test_features_df['census_msa'].astype("category")
test_features_df = pd.get_dummies(test_features_df, columns = ['census_msa'],prefix="C")

test_features_df['employment_industry'] = test_features_df['employment_industry'].fillna('other')
test_features_df['employment_industry'] = test_features_df['employment_industry'].astype("category")
test_features_df = pd.get_dummies(test_features_df, columns = ['employment_industry'],prefix="EI")

# The one-hot columns are derived separately for train and test, so a category
# present in only one of them would give the two frames different columns.
# Realign to the training columns: a dummy column missing from the test set is
# filled with 0 and any column the model was never fitted on is dropped.
test_features_df = test_features_df.reindex(columns=features_df.columns, fill_value=0)

# test_features_df['employment_occupation'] = test_features_df['employment_occupation'].fillna('other')
# test_features_df['employment_occupation'] = test_features_df['employment_occupation'].astype("category")
# test_features_df = pd.get_dummies(test_features_df, columns = ['employment_occupation'],prefix="EO")

# test_features_df['hhs_geo_region'] = test_features_df['hhs_geo_region'].astype("category")
# test_features_df = pd.get_dummies(test_features_df, columns = ['hhs_geo_region'],prefix="H")

In [None]:
# Predict probabilities for the test set with the refitted pipeline. The
# result is indexed below as test_probas[0] and test_probas[1], one array per label.
test_probas = full_pipeline.predict_proba(test_features_df)
test_probas

In [None]:
# Preview the submission template before filling it in.
submission_df.head()

In [None]:
# The template and the test features must list the same respondents in the
# same order, because the probabilities are assigned positionally below.
np.testing.assert_array_equal(
    test_features_df.index.values,
    submission_df.index.values,
)

# Write column 1 of each per-label probability array into the template.
for label_position, label_name in enumerate(["h1n1_vaccine", "seasonal_vaccine"]):
    submission_df[label_name] = test_probas[label_position][:, 1]

submission_df.head()

In [None]:
# Write the filled-in submission to the working directory, keeping the
# respondent_id index as the first column.
submission_df.to_csv('cbc.csv', index=True)

In [None]:
#!head my_submission.csv