In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [2]:
cd "/content/drive/My Drive/DM"

/content/drive/My Drive/DM


In [4]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 42    # Set a random seed for reproducibility!

pd.set_option('display.max_columns', 100)

In [5]:
DATA_PATH = Path.cwd().parent / "data" / "final" / "public"

In [6]:
features_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/DM/training_set_features.csv", 
    index_col="respondent_id"
)
labels_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/DM/training_set_labels.csv", 
    index_col="respondent_id"
)
test_features_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/DM/test_set_features.csv", 
    index_col="respondent_id"
)

In [7]:
print('features_df' , features_df.shape)
print('labels_df', labels_df.shape)
print('test_features_df' , test_features_df.shape)

features_df (26707, 35)
labels_df (26707, 2)
test_features_df (26708, 35)


In [8]:
labels_df_h1n1 = labels_df[['h1n1_vaccine']]
labels_df_seasonal = labels_df[['seasonal_vaccine']]

# Preprocessing

In [9]:
numeric_cols = features_df.columns[features_df.dtypes != 'object'].values
non_numeric_cols = features_df.columns[features_df.dtypes == 'object'].values

In [10]:
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy = 'mean'))
])

non_numeric_preprocessing_steps = Pipeline([
     ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
    ('one_hot_encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers = [
        ('numeric', numeric_preprocessing_steps, numeric_cols),
        ('non_numeric', non_numeric_preprocessing_steps, non_numeric_cols)  
    ],
    remainder = "drop"
)

In [11]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df_seasonal,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df_h1n1,
    random_state=RANDOM_SEED
)

In [12]:
X_train_preprocess = pd.DataFrame(preprocessor.fit_transform(X_train))
X_eval_preprocess =  pd.DataFrame(preprocessor.transform(X_eval))
print ('X_train_preprocess.shape' , X_train_preprocess.shape)
print ('X_eval_preprocess.shape' , X_eval_preprocess.shape)

X_train_preprocess.shape (17893, 112)
X_eval_preprocess.shape (8814, 112)


In [13]:
features_df_preprocess = pd.DataFrame(preprocessor.transform(features_df))
test_features_df_preprocess = pd.DataFrame(preprocessor.transform(test_features_df))
print ('features_df_preprocess.shape' , features_df_preprocess.shape)
print ('test_features_df_preprocess.shape' , test_features_df_preprocess.shape)

features_df_preprocess.shape (26707, 112)
test_features_df_preprocess.shape (26708, 112)


# Remove Collinear Variables

In [14]:
#Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = X_train_preprocess.corr().abs()
corr_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111
0,1.0,0.061255,0.097908,0.235519,0.153747,0.296449,0.250226,0.245242,0.242955,0.143139,0.128204,0.083699,0.046147,0.023825,0.001166,0.239307,0.367953,0.353836,0.229209,0.322467,0.221108,0.012822,0.053947,0.08984,0.025439,0.028723,0.030126,0.006951,0.030633,0.053219,0.034692,0.022967,0.004044,0.101911,0.072349,0.030513,0.130322,0.131524,0.131524,0.019951,0.022204,0.071412,0.005709,0.025334,0.024106,0.003151,0.001551,0.001322,0.000605,0.05562,...,0.002191,0.004912,0.000813,0.004651,0.0062,0.011635,0.011033,0.027165,0.040901,0.054317,0.031498,0.010318,0.0234,0.054592,0.026762,0.024159,0.001698,0.01944,0.012658,0.013749,0.005596,0.017371,0.020168,0.001854,0.036841,0.003189,0.003768,0.031938,0.019541,0.054317,0.018505,0.006188,0.00882,0.01078,0.00755,0.026797,0.052368,0.002064,0.006276,0.01939,0.017358,0.029372,0.01841,0.018153,0.020319,0.020459,0.023357,0.0063,0.023514,0.012959
1,0.061255,1.0,0.019318,0.089261,0.027572,0.091289,0.047968,0.067313,0.086992,0.090355,0.070768,0.019057,0.026587,0.168941,0.098381,0.112882,0.078066,0.020658,0.077578,0.074995,0.058886,0.024087,0.053534,0.048839,0.048039,0.081268,0.063335,0.127749,0.141786,0.210952,0.261835,0.026563,0.088745,0.114727,0.069249,0.028289,0.135692,0.069205,0.069205,0.00376,0.195859,0.163628,0.100342,0.140062,0.103863,0.082978,0.153963,0.113285,0.088342,0.161453,...,0.013197,0.058255,0.000207,0.06527,0.043238,0.003587,0.022077,0.001704,0.170398,0.003705,0.04928,0.000932,0.026029,0.161001,0.001203,0.014658,0.011647,0.002205,0.009047,0.039806,0.003761,0.004687,0.01107,0.086872,0.039943,0.014223,0.02985,0.01818,0.165063,0.003705,0.000323,0.071638,0.024342,0.041625,0.032582,0.031485,0.159145,0.028067,0.001149,0.016649,0.043634,0.015993,0.031861,0.034243,0.039054,0.033432,0.011109,0.029899,0.069826,0.028005
2,0.097908,0.019318,1.0,0.056575,0.151058,0.069054,0.112054,0.137627,0.073443,0.047985,0.02846,0.007553,0.026999,0.004883,0.05704,0.033543,0.112311,0.081772,0.016917,0.09141,0.089189,0.049582,0.078378,0.067131,0.045152,0.001482,0.038813,0.059568,0.017053,0.062343,0.060574,0.008419,0.036677,0.04585,0.128876,0.019857,0.120744,0.006204,0.006204,0.034006,0.033295,0.091928,0.009372,0.022855,0.008902,0.031475,0.066889,0.051626,0.034587,0.007479,...,0.006573,0.025613,0.019998,0.008076,0.017458,0.005947,0.008125,0.003877,0.001638,8e-05,0.013682,0.013659,0.006706,0.005693,0.015737,0.001489,0.007662,0.005533,0.005553,0.022834,0.001896,0.011442,0.009029,0.009644,0.012161,0.00625,0.008007,0.003962,0.004007,8e-05,0.003668,0.014339,0.010403,0.009853,0.010688,0.011379,0.008183,0.008879,0.005266,0.001854,0.025537,0.00401,0.01501,0.019549,0.013936,0.023683,0.000796,0.024786,0.021549,0.004136
3,0.235519,0.089261,0.056575,1.0,0.06394,0.340522,0.226385,0.217368,0.333264,0.065109,0.074985,0.036296,0.00861,0.007133,0.025365,0.119277,0.119284,0.133088,0.120487,0.12483,0.087377,0.025328,0.040477,0.021383,0.012574,0.023982,0.020261,0.031535,0.008608,0.015639,0.015193,0.019528,0.03558,0.003525,0.004542,0.012512,0.002347,0.113015,0.113015,0.01289,0.039126,0.014525,0.051276,0.058036,0.046838,0.02591,0.043006,0.030133,0.027055,0.005711,...,0.003258,0.031249,0.00998,0.024654,0.007197,0.001474,0.003653,0.006061,0.004811,0.040314,0.008067,0.012471,0.007091,0.006854,0.007788,0.015948,0.003497,0.008982,0.009235,0.002548,0.002795,0.007296,0.004714,0.012535,0.024005,0.007406,0.007797,0.010517,0.011064,0.040314,0.003135,0.007776,0.002812,0.019354,0.006797,0.001859,0.004786,0.028971,0.00878,0.004857,0.020632,0.018439,0.000859,0.002637,0.022604,0.002541,0.000447,0.009995,0.011769,0.007664
4,0.153747,0.027572,0.151058,0.06394,1.0,0.081527,0.182625,0.162526,0.101801,0.079888,0.069511,0.065472,0.040111,0.072662,0.0288,0.040419,0.125425,0.106117,0.0418,0.106172,0.083387,0.012468,0.004838,0.006348,0.001255,0.002099,0.008724,0.0005,0.006807,0.063519,0.037647,0.006854,0.026324,0.046607,0.07554,0.042047,0.102118,0.051,0.051,0.020484,0.034493,0.059932,0.018824,0.021212,0.007077,0.031854,0.054471,0.039233,0.032588,0.042068,...,0.018491,0.011371,0.030562,0.018673,0.002873,0.021282,0.003303,0.003607,0.070434,0.011893,0.028142,0.009319,0.022141,0.038949,0.018291,0.007058,0.006774,0.02937,0.002861,0.000376,0.013235,0.01098,0.01252,0.027007,0.016464,0.013891,0.01146,0.011491,0.042573,0.011893,0.006483,0.026046,0.033765,0.023007,0.002011,0.022124,0.03941,0.018336,0.018446,0.004055,0.011604,0.021924,0.004945,0.010201,0.006679,0.01299,0.019164,0.004621,0.030686,0.009099


In [15]:
# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [16]:
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

In [17]:
X_train_preprocess = X_train_preprocess.drop(columns = to_drop)
X_eval_preprocess = X_eval_preprocess.drop(columns = to_drop)

In [18]:
features_df_preprocess = features_df_preprocess.drop(columns = to_drop)
test_features_df_preprocess = test_features_df_preprocess.drop(columns = to_drop)

# Validation

In [19]:
model_seasonal= LogisticRegression(penalty="l2", C=1)

In [20]:
# train model
%%time 

model_seasonal.fit(X_train_preprocess, y_train)

None   # So we don't print out the whole pipeline representation

CPU times: user 682 ms, sys: 251 ms, total: 933 ms
Wall time: 491 ms


In [21]:
preds_seasonal = model_seasonal.predict_proba(X_eval_preprocess)

In [22]:
y_preds_seasonal = pd.DataFrame(
    {
        "seasonal_vaccine": preds_seasonal[:, 1]
       
    },
    index = y_eval.index
)
print("y_preds_seasonal.shape:", y_preds_seasonal.shape)

y_preds_seasonal.shape: (8814, 1)


In [23]:
roc_auc_score(y_eval, y_preds_seasonal)

0.851743039793174

# Train whole dataset

In [24]:
# retrain on full dataset
%%time 

model_seasonal.fit(features_df_preprocess, labels_df_seasonal)

None   # So we don't print out the whole pipeline representation

CPU times: user 1.15 s, sys: 343 ms, total: 1.49 s
Wall time: 773 ms


In [25]:
preds1_seasonal = model_seasonal.predict_proba(test_features_df_preprocess)

In [26]:
y_preds1_seasonal = pd.DataFrame(
    {
        "seasonal_vaccine": preds1_seasonal[:, 1],
        
    },
    index = test_features_df.index
)
print("y_preds1_seasonal.shape:", y_preds1_seasonal.shape)

y_preds1_seasonal.shape: (26708, 1)


In [28]:
h1n1_df = pd.read_csv(DATA_PATH / "/content/drive/My Drive/DM/my_submission19.csv", 
                            index_col="respondent_id")

In [29]:
joined_df = h1n1_df.join(y_preds1_seasonal)
joined_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.061879,0.304426
26708,0.01944,0.036961
26709,0.409258,0.590114
26710,0.482891,0.871683
26711,0.179245,0.488876


In [30]:
joined_df.to_csv('my_submission19.csv', index=True)