In [115]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

import lightgbm as lgb

In [4]:
!ls input/health-insurance-cross-sell-prediction

sample_submission.csv test.csv              train.csv


In [5]:
train = pd.read_csv("input/health-insurance-cross-sell-prediction/train.csv")

In [7]:
train.columns = [col.lower() for col in train.columns]

In [32]:
numeric_cols = ["age", "driving_license", "previously_insured", "annual_premium", "vintage", ]
categoric_cols = ["gender", "region_code", "vehicle_age", "vehicle_damage", "policy_sales_channel", ]
input_cols = numeric_cols + categoric_cols
target = "response"

In [26]:
cat_encoder = preprocessing.OrdinalEncoder()
train.loc[:, categoric_cols] = cat_encoder.fit_transform(train.loc[:, categoric_cols])

In [29]:
train.response.sum() / train.shape[0]

0.12256336113815208

***
## Model validation through StratifiedKFold

In [46]:
def cast_to_lgb_dset(X, y, categorical_cols):
    lgb_dset = lgb.Dataset(
        data = X,
        label = y,
        categorical_feature = categorical_cols,
        free_raw_data = False,
    )
    return lgb_dset

model_params = {
    "objective": "binary",
    "metric":"auc",
    "boosting": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.9,
    "seed": 42,
    "deterministic": True,
    "verbose": -1,
}

In [31]:
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

In [78]:
cv_scores = list()
best_iterations = list()

for train_idx,valid_idx in skf.split(train.loc[:,input_cols], train.loc[:,target]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    
    _model = lgb.train(
    train_set = train_dset,
    valid_sets = [valid_dset, ],
    num_boost_round = 10000,
    early_stopping_rounds = 50,
    params = model_params, 
    verbose_eval=25,
    )
    
    cv_scores.append(_model.best_score["valid_0"]["auc"])
    best_iterations.append(_model.best_iteration)



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.857356
[50]	valid_0's auc: 0.858065
[75]	valid_0's auc: 0.858455
[100]	valid_0's auc: 0.85857
[125]	valid_0's auc: 0.858645
[150]	valid_0's auc: 0.858738
[175]	valid_0's auc: 0.858791
[200]	valid_0's auc: 0.858803
[225]	valid_0's auc: 0.85876
[250]	valid_0's auc: 0.858752
Early stopping, best iteration is:
[218]	valid_0's auc: 0.858813
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.858052
[50]	valid_0's auc: 0.859033
[75]	valid_0's auc: 0.859458
[100]	valid_0's auc: 0.859678
[125]	valid_0's auc: 0.859575
Early stopping, best iteration is:
[96]	valid_0's auc: 0.859699
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856594
[50]	valid_0's auc: 0.857519
[75]	valid_0's auc: 0.85793
[100]	valid_0's auc: 0.858038
[125]	valid_0's auc: 0.858047
Early stopping, best iteration is:
[93]	valid_0's auc: 0.858056
Training until validation scores don

In [83]:
print(f"CV AUC: {np.mean(cv_scores):0.5f}")
print(f"Best iterations by fold: {best_iterations}")

CV AUC: 0.85833
Best iterations by fold: [218, 96, 93, 86, 150]


***
### Nested cross validation

In [85]:
skf_outer = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

In [128]:
cv_scores = list()

for train_idx,valid_idx in skf_outer.split(train.loc[:,input_cols], train.loc[:,target]):
        
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]
        
    best_iterations = list()
    skf_inner = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=19)
    
    for train_idx_inner,valid_idx_inner in skf_inner.split(X_train, y_train):
        X_train_inner,y_train_inner = X_train.iloc[train_idx_inner,:], y_train.iloc[train_idx_inner]
        X_valid_inner,y_valid_inner = X_train.iloc[valid_idx_inner,:], y_train.iloc[valid_idx_inner]
        
        train_dset = cast_to_lgb_dset(X_train_inner, y_train_inner, categoric_cols)
        valid_dset = cast_to_lgb_dset(X_valid_inner, y_valid_inner, categoric_cols)

        _model = lgb.train(
            train_set = train_dset,
            valid_sets = [valid_dset, ],
            num_boost_round = 10000,
            early_stopping_rounds = 50,
            params = model_params, 
            verbose_eval=25,
        )
        best_iterations.append(_model.best_iteration)
        
    best_iteration = np.median(best_iterations)
    
    # trains a model over outer split using best iteration obtained from inner loop
    # (without using early stopping to avoid leakage)

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    
    model = lgb.train(
        train_set = train_dset,
        num_boost_round = int(best_iteration),
        params = model_params)
    preds = model.predict(X_valid)
    auc = metrics.roc_auc_score(y_valid, preds)
    
    cv_scores.append(auc)
    



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856549
[50]	valid_0's auc: 0.857292
[75]	valid_0's auc: 0.857671
[100]	valid_0's auc: 0.857699
[125]	valid_0's auc: 0.857731
Early stopping, best iteration is:
[92]	valid_0's auc: 0.857802
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.858196
[50]	valid_0's auc: 0.859119
[75]	valid_0's auc: 0.859632
[100]	valid_0's auc: 0.859757
[125]	valid_0's auc: 0.859769
[150]	valid_0's auc: 0.859804
[175]	valid_0's auc: 0.859828
[200]	valid_0's auc: 0.859827
[225]	valid_0's auc: 0.859948
[250]	valid_0's auc: 0.860055
[275]	valid_0's auc: 0.860086
[300]	valid_0's auc: 0.860043
[325]	valid_0's auc: 0.859902
Early stopping, best iteration is:
[275]	valid_0's auc: 0.860086
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.855141
[50]	valid_0's auc: 0.856013
[75]	valid_0's auc: 0.85629
[100]	valid_0's auc: 0.856322
[125]	valid_0's auc: 0.856392
[150]	va



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856257
[50]	valid_0's auc: 0.856753
[75]	valid_0's auc: 0.857136
[100]	valid_0's auc: 0.857109
[125]	valid_0's auc: 0.857064
Early stopping, best iteration is:
[79]	valid_0's auc: 0.857165
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856794
[50]	valid_0's auc: 0.857318
[75]	valid_0's auc: 0.857641
[100]	valid_0's auc: 0.85772
[125]	valid_0's auc: 0.857716
Early stopping, best iteration is:
[90]	valid_0's auc: 0.857754
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.855075
[50]	valid_0's auc: 0.856049
[75]	valid_0's auc: 0.856311
[100]	valid_0's auc: 0.856365
[125]	valid_0's auc: 0.856357
[150]	valid_0's auc: 0.856457
[175]	valid_0's auc: 0.856325
Early stopping, best iteration is:
[144]	valid_0's auc: 0.85647
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.855214
[50]	valid_0's auc: 0.856323
[75]	v



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.855418
[50]	valid_0's auc: 0.856435
[75]	valid_0's auc: 0.856838
[100]	valid_0's auc: 0.856811
[125]	valid_0's auc: 0.856812
Early stopping, best iteration is:
[76]	valid_0's auc: 0.85685
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.85962
[50]	valid_0's auc: 0.860599
[75]	valid_0's auc: 0.861035
[100]	valid_0's auc: 0.861184
[125]	valid_0's auc: 0.861188
[150]	valid_0's auc: 0.861177
Early stopping, best iteration is:
[109]	valid_0's auc: 0.861226
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.855449
[50]	valid_0's auc: 0.856139
[75]	valid_0's auc: 0.856369
[100]	valid_0's auc: 0.856435
[125]	valid_0's auc: 0.856416
[150]	valid_0's auc: 0.856413
Early stopping, best iteration is:
[101]	valid_0's auc: 0.856447
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856169
[50]	valid_0's auc: 0.856911
[75]	



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.857535
[50]	valid_0's auc: 0.858334
[75]	valid_0's auc: 0.858509
[100]	valid_0's auc: 0.85858
[125]	valid_0's auc: 0.858615
[150]	valid_0's auc: 0.85868
[175]	valid_0's auc: 0.858611
[200]	valid_0's auc: 0.858739
[225]	valid_0's auc: 0.858722
[250]	valid_0's auc: 0.858712
Early stopping, best iteration is:
[203]	valid_0's auc: 0.858742
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.85859
[50]	valid_0's auc: 0.859537
[75]	valid_0's auc: 0.859857
[100]	valid_0's auc: 0.860095
[125]	valid_0's auc: 0.860059
[150]	valid_0's auc: 0.860078
Early stopping, best iteration is:
[100]	valid_0's auc: 0.860095
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856865
[50]	valid_0's auc: 0.857657
[75]	valid_0's auc: 0.857965
[100]	valid_0's auc: 0.858092
[125]	valid_0's auc: 0.858086
[150]	valid_0's auc: 0.858067
Early stopping, best iteration is:
[115



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856586
[50]	valid_0's auc: 0.857529
[75]	valid_0's auc: 0.857731
[100]	valid_0's auc: 0.857734
[125]	valid_0's auc: 0.857701
Early stopping, best iteration is:
[88]	valid_0's auc: 0.857797
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856424
[50]	valid_0's auc: 0.857296
[75]	valid_0's auc: 0.857665
[100]	valid_0's auc: 0.857687
[125]	valid_0's auc: 0.857665
[150]	valid_0's auc: 0.857625
Early stopping, best iteration is:
[109]	valid_0's auc: 0.857723
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856601
[50]	valid_0's auc: 0.857538
[75]	valid_0's auc: 0.857948
[100]	valid_0's auc: 0.857944
[125]	valid_0's auc: 0.857981
[150]	valid_0's auc: 0.857954
Early stopping, best iteration is:
[118]	valid_0's auc: 0.858028
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856241
[50]	valid_0's auc: 0.857065
[75

In [129]:
print(f"CV AUC: {np.mean(cv_scores):0.5f}")

CV AUC: 0.85827


***
### Repeated cross validation

First make a CV split to find a good value for the number of boosting rounds, then validate your model on a different CV split

In [130]:
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=129)

In [131]:
best_iterations = list()

for train_idx,valid_idx in skf.split(train.loc[:,input_cols], train.loc[:,target]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    
    _model = lgb.train(
    train_set = train_dset,
    valid_sets = [valid_dset, ],
    num_boost_round = 10000,
    early_stopping_rounds = 50,
    params = model_params, 
    verbose_eval=25,
    )
    
    best_iterations.append(_model.best_iteration)



Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.854761
[50]	valid_0's auc: 0.855633
[75]	valid_0's auc: 0.855884
[100]	valid_0's auc: 0.856027
[125]	valid_0's auc: 0.856047
[150]	valid_0's auc: 0.856066
[175]	valid_0's auc: 0.856031
[200]	valid_0's auc: 0.856096
Early stopping, best iteration is:
[157]	valid_0's auc: 0.856111
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.856272
[50]	valid_0's auc: 0.857507
[75]	valid_0's auc: 0.858026
[100]	valid_0's auc: 0.858128
[125]	valid_0's auc: 0.858145
[150]	valid_0's auc: 0.858086
Early stopping, best iteration is:
[119]	valid_0's auc: 0.858155
Training until validation scores don't improve for 50 rounds
[25]	valid_0's auc: 0.857685
[50]	valid_0's auc: 0.858633
[75]	valid_0's auc: 0.859105
[100]	valid_0's auc: 0.859141
[125]	valid_0's auc: 0.859084
Early stopping, best iteration is:
[92]	valid_0's auc: 0.859192
Training until validation scores don't improve for 50 rounds
[

In [137]:
best_iteration = np.median(best_iterations)
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=19)

cv_scores = list()

for train_idx,valid_idx in skf.split(train.loc[:,input_cols], train.loc[:,target]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    
    model = lgb.train(
        train_set = train_dset,
        num_boost_round = int(best_iteration),
        params = model_params)
    
    preds = model.predict(X_valid)
    auc = metrics.roc_auc_score(y_valid, preds)
    cv_scores.append(auc)  



In [138]:
print(f"CV AUC: {np.mean(cv_scores):0.5f}")

CV AUC: 0.85824


***