In [37]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV


In [38]:
# Location of the pickled cross-validation fold definitions.
folds_file = '../data/folds.pickle'

# NOTE: unpickling can execute arbitrary code; only load fold files from a trusted source.
with open(folds_file, mode='rb') as folds_handle:
    # Each element is later unpacked as (inner folds, test indices).
    outer_cross_val = pickle.load(folds_handle)


# Baseline
Running LightGBM on all regions. We limit the depth of the trees so that we get the simplest possible model, since we have almost no data.

In [39]:
# One entry is appended to each list per (outer split, inner fold) combination.
X_train_list, y_train_list, X_val_list, y_val_list = [], [], [], []

# NOTE(review): `X` and `y` are not defined in any cell shown in this notebook;
# they must already exist in the kernel -- confirm where they are loaded.
for split in outer_cross_val:
    # test_index is unpacked but not used in this cell.
    inner_cross_val, test_index = split
    for train_index, val_index in inner_cross_val:
        # Features are selected by row position; labels are indexed with the same arrays.
        X_train = X.iloc[train_index, :]
        y_train = y[train_index]
        X_val = X.iloc[val_index, :]
        y_val = y[val_index]

        X_train_list.append(X_train)
        y_train_list.append(y_train)
        X_val_list.append(X_val)
        y_val_list.append(y_val)

# After the loop, X_train / y_train / X_val / y_val still hold the LAST inner fold;
# the tuning cells below reuse those leftover variables.

In [40]:
# Estimate the number of boosting rounds with LightGBM's built-in cross-validation.
# `lgb` is imported in the notebook's top import cell.
#
# X_train / y_train are the variables left over from the fold-building loop, i.e. the
# training part of the last inner fold only.
data_train = lgb.Dataset(
    X_train,
    np.float32(y_train == 'HR+'),  # 1.0 where the label is 'HR+', 0.0 otherwise
)

# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction) only
# takes effect when a bagging frequency > 0 is also set; none is set here -- confirm intent.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'nthread': 4,
    'learning_rate': 0.1,
    'num_leaves': 30,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# NOTE(review): the `early_stopping_rounds` keyword and the 'auc-mean' result key match
# the LightGBM version that produced the saved output; verify both before upgrading.
cv_results = lgb.cv(
    params,
    data_train,
    num_boost_round=1000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='auc',
    early_stopping_rounds=50,
    seed=0,
)

# The length of the per-round AUC history is used as the number of estimators.
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

[LightGBM] [Info] Number of positive: 21, number of negative: 23
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2142
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 610
[LightGBM] [Info] Number of positive: 20, number of negative: 24
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2142
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 610
[LightGBM] [Info] Number of positive: 22, number of negative: 22
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2142
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 610
[LightGBM] [Info] Number of positive: 19, number of negative: 25
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2142
[LightGBM] [Info] Number of data po

best n_estimators: 44
best cv score: 0.6714285714285715


In [41]:
%%time
# Update n_estimators (44, from the CV cell), then search max_depth and num_leaves.
# `lgb` and `GridSearchCV` are imported in the notebook's top import cell.
params_test1 = {
    'max_depth': range(3, 8, 1),
    'num_leaves': range(5, 100, 5),
}

# max_depth=6 is only a starting value: the grid above overrides it for every candidate.
estimator = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=6,
    bagging_fraction=0.8,
    feature_fraction=0.8,
)

gsearch1 = GridSearchCV(
    estimator,
    param_grid=params_test1,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
# Unlike the lgb.cv cell, the search is fitted on the raw labels in y_train
# (no 'HR+' -> 1.0 conversion).
gsearch1.fit(X_train, y_train)
print(gsearch1.best_params_)

{'max_depth': 3, 'num_leaves': 5}
CPU times: user 998 ms, sys: 194 ms, total: 1.19 s
Wall time: 10.6 s


In [23]:
%%time
# step3: update max_depth and num_leaves, search max_bin and min_data_in_leaf
params_test2 = dict(
    max_bin=range(5, 256, 10),
    min_data_in_leaf=range(1, 102, 10),
)

# Settings held fixed during this search (max_depth / num_leaves from the previous step).
step3_fixed = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    bagging_fraction=0.8,
    feature_fraction=0.8,
)
estimator = lgb.LGBMClassifier(**step3_fixed)

gsearch2 = GridSearchCV(
    estimator=estimator,
    param_grid=params_test2,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch2.fit(X_train, y_train)
print(gsearch2.best_params_)

{'max_bin': 5, 'min_data_in_leaf': 1}
CPU times: user 2.86 s, sys: 397 ms, total: 3.26 s
Wall time: 30.5 s


In [25]:
%%time
# step4: update max_bin and min_data_in_leaf, search feature_fraction, bagging_fraction, bagging_freq
# NOTE(review): per the LightGBM parameter docs, bagging_freq=0 disables bagging, so for
# that grid value every bagging_fraction candidate should behave the same -- confirm.
fraction_grid = [0.6, 0.7, 0.8, 0.9, 1.0]
params_test3 = dict(
    feature_fraction=list(fraction_grid),
    bagging_fraction=list(fraction_grid),
    bagging_freq=range(0, 81, 10),
)

# Settings held fixed during this search (max_bin / min_data_in_leaf from the previous step).
step4_fixed = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    max_bin=5,
    min_data_in_leaf=1,
)
estimator = lgb.LGBMClassifier(**step4_fixed)

gsearch3 = GridSearchCV(
    estimator=estimator,
    param_grid=params_test3,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch3.fit(X_train, y_train)
print(gsearch3.best_params_)

















[LightGBM] [ignored. Current value: bagging_freq=10




[Lightample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7


[LightGBM]olsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9


[LightGBM] =20 will be ignored. Current value: min_data_in_leaf=1
[Laction=0.7


 subsample=1.0 will be ignored. Current value: bagging_fraction=1.0












ta_in_leaf=1
















{'bagging_fraction': 0.6, 'bagging_freq': 0, 'feature_fraction': 0.9}
CPU times: user 3.66 s, sys: 514 ms, total: 4.17 s
Wall time: 1min 5s


In [26]:
%%time
# step5: update feature_fraction, bagging_fraction, bagging_freq; search lambda_l1 and lambda_l2
# Both regularisation terms are searched over the same candidate values.
regularisation_grid = [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
params_test4 = dict(
    lambda_l1=list(regularisation_grid),
    lambda_l2=list(regularisation_grid),
)

# Settings held fixed during this search (sampling parameters from the previous step).
step5_fixed = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    max_bin=5,
    min_data_in_leaf=1,
    bagging_fraction=0.6,
    bagging_freq=0,
    feature_fraction=0.9,
)
estimator = lgb.LGBMClassifier(**step5_fixed)

gsearch4 = GridSearchCV(
    estimator=estimator,
    param_grid=params_test4,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch4.fit(X_train, y_train)
print(gsearch4.best_params_)











ubsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM]=0.9
[Lightample=1.0 will be ignored. Current value: bagging_fraction=1.0












{'lambda_l1': 0.9, 'lambda_l2': 0.0}
CPU times: user 1.99 s, sys: 297 ms, total: 2.29 s
Wall time: 28 s


In [27]:
%%time
# step6: update lambda_l1 and lambda_l2, search min_split_gain
params_test5 = dict(
    min_split_gain=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

# Settings held fixed during this search (regularisation from the previous step).
step6_fixed = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    max_bin=5,
    min_data_in_leaf=1,
    bagging_fraction=0.6,
    bagging_freq=0,
    feature_fraction=0.9,
    lambda_l1=0.9,
    lambda_l2=0,
)
estimator = lgb.LGBMClassifier(**step6_fixed)

gsearch5 = GridSearchCV(
    estimator=estimator,
    param_grid=params_test5,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch5.fit(X_train, y_train)
print(gsearch5.best_params_)

{'min_split_gain': 0.1}
CPU times: user 855 ms, sys: 142 ms, total: 997 ms
Wall time: 3.42 s


In [33]:
# We substitute the tuned parameters into the model and score it on the validation
# part of the last inner fold (X_val / y_val left over from the fold-building loop).
# `lgb` and `metrics` (sklearn.metrics) are imported in the notebook's top import cell.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    max_bin=5,
    min_data_in_leaf=1,
    bagging_fraction=0.6,
    bagging_freq=0,
    feature_fraction=0.9,
    lambda_l1=0.9,
    lambda_l2=0,
    min_split_gain=0.1,
)
# Labels are binarised: 1.0 where the label is 'HR+', 0.0 otherwise.
model.fit(X_train, np.float32(y_train == 'HR+'))
y_pre = model.predict(X_val)
print("acc:", metrics.accuracy_score(np.float32(y_val == 'HR+'), y_pre))


acc: 0.8333333333333334


In [42]:
# Reference point: the same train/validation fold, but with LightGBM's default parameters.
model = lgb.LGBMClassifier()

# Binarise the labels: 1.0 where the label is 'HR+', 0.0 otherwise.
y_train_binary = np.float32(y_train == 'HR+')
y_val_binary = np.float32(y_val == 'HR+')

model.fit(X_train, y_train_binary)
y_pre = model.predict(X_val)

default_accuracy = metrics.accuracy_score(y_val_binary, y_pre)
print("acc:", default_accuracy)


acc: 0.5










In [43]:
# Evaluate the tuned configuration on every (train, validation) pair collected earlier
# and report mean / standard deviation of accuracy and log-loss across folds.
tuned_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    n_estimators=44,
    max_depth=3,
    num_leaves=5,
    max_bin=5,
    min_data_in_leaf=1,
    bagging_fraction=0.6,
    bagging_freq=0,
    feature_fraction=0.9,
    lambda_l1=0.9,
    lambda_l2=0,
    min_split_gain=0.1,
)

accuracy_list, av_logloss_list = [], []

fold_data = zip(X_train_list, y_train_list, X_val_list, y_val_list)
for X_train, y_train, X_val, y_val in fold_data:
    # A fresh model per fold, trained on binarised labels (1.0 for 'HR+').
    model = lgb.LGBMClassifier(**tuned_params)
    model.fit(X_train, np.float32(y_train == 'HR+'))

    # Column 1 of predict_proba is taken as the positive-class probability;
    # hard predictions come from Python's round() applied to each probability.
    y_pred = model.predict_proba(X_val)[:, 1]
    predictions = list(map(round, y_pred))

    is_hr_plus = y_val == 'HR+'
    accuracy = accuracy_score(is_hr_plus, predictions)
    av_logloss = log_loss(y_true=is_hr_plus, labels=[True, False], y_pred=y_pred)

    accuracy_list.append(accuracy)
    av_logloss_list.append(av_logloss)

print(np.mean(accuracy_list), np.std(accuracy_list))
print(np.mean(av_logloss_list), np.std(av_logloss_list))















0.7035714285714287 0.17249585094750633
0.5682598090471788 0.17597558671977798
