In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb

from IPython.display import clear_output
from warnings import simplefilter
simplefilter(action='ignore', category=UserWarning)

In [2]:
# Import data
train = pd.read_csv('.././mod_data/train.csv')
test = pd.read_csv('.././mod_data/test.csv')

In [3]:
# Drop name column
train = train.drop('Name', axis=1)
test = test.drop('Name', axis=1)

# Convert Infect_Prob to binary with 50% threshold
train['Infect_Prob'] = [1 if i >= 50 else 0 for i in train['Infect_Prob'].values]

In [4]:
# Is the data imbalanced?
train['Infect_Prob'].value_counts()

# Nah, we're good

0    6569
1    4145
Name: Infect_Prob, dtype: int64

In [5]:
# Split data into train and hold-out and cross-validate on train
drop_cols = ['people_ID']
X, y = train.drop(drop_cols + ['Infect_Prob'], axis=1), train['Infect_Prob']
X_cv, X_hold, y_cv, y_hold = train_test_split(X, y, test_size=0.2, 
                                              shuffle=True, stratify=y)

cat_features = ['Region', 'Gender', 'Designation', 'Married', 'Occupation',
                'Mode_transport', 'comorbidity', 'Pulmonary score', 'cardiological pressure']

# Define parameters for lightgbm
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.001,
    'max_depth': 11,
    'num_leaves': 31,
    'bagging_fraction': 0.5,
    'feature_fraction': 1.0,
    'random_state': 42,
    'verbosity': 0
}

# Cross validate on train with lgb classifier
hold_auc_scores = []
hold_logloss_scores = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X_cv, y_cv):
    X_train, X_val = X_cv[X_cv.index.isin(train_idx)], X_cv[X_cv.index.isin(val_idx)]
    y_train, y_val = y_cv[y_cv.index.isin(train_idx)], y_cv[y_cv.index.isin(val_idx)]
    
    d_train = lgb.Dataset(data=X_train, label=y_train, categorical_feature=cat_features)
    d_val = lgb.Dataset(data=X_val, label=y_val, categorical_feature=cat_features)
    
    clf = lgb.train(
        lgb_params, d_train,
        valid_sets = [d_train, d_val],
        num_boost_round = 5000,
    )
    
    hold_probs = clf.predict(X_hold)
    hold_preds = np.array([1 if i>=0.5 else 0 for i in hold_probs])
    hold_auc_scores.append(roc_auc_score(y_hold, hold_preds))
    hold_logloss_scores.append(log_loss(y_hold, hold_probs))
    
    clear_output()

In [6]:
# Average scores

print("Average AUC score: {:.5f}".format(np.mean(hold_auc_scores)))
print("Average Logloss: {:.5f}".format(np.mean(hold_logloss_scores)))

Average AUC score: 0.92235
Average Logloss: 0.19562


## Predict for test data

In [7]:
test_ids = test['people_ID'].values
X_test = test.drop(drop_cols, axis=1)

test_probs = clf.predict(X_test)
sub = pd.DataFrame(np.vstack((test_ids, test_probs*100.0)).T, 
                   columns=['people_ID', 'Infect_Prob'])
sub.to_csv('.././submission/sub1_LGBM.csv', index=False)