In [1]:
import os
import gc
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

from lightgbm import LGBMClassifier
import optuna
from prunedcv import PrunedCV

from codes.utils import import_data, drop_columns, cross_val_score_auc, reduce_mem_usage
from codes.fe_browser import latest
from codes.fe_emails import proton, mappings, labeling
from codes.fe_cards import stats
from codes.fe_date import dates
from codes.fe_relatives import divisions
from codes.fe_categorical import pairs, wtf
from codes.prepro import prepro
from codes.fe_users import users_stats

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
DATA_PATH = '../input/'
SEARCH_PARAMS = True
N_FOLD = 8

In [3]:
train, test, sample_submission = import_data(DATA_PATH)

### Some Feature Engineering

drop columns, count encoding, aggregation, fillna

In [4]:
train, test = users_stats(train, test)

train, test = drop_columns(train, test)

train, test = latest(train, test)

train, test = proton(train, test)

train['nulls1'] = train.isna().sum(axis=1)
test['nulls1'] = test.isna().sum(axis=1)

train, test = mappings(train, test)
train, test = labeling(train, test)

train, test = stats(train, test)

train, test = divisions(train, test)

train, test = dates(train, test)

train, test = pairs(train, test)
train, test = wtf(train, test)

y_train = train['isFraud'].copy()


X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test

#fill in mean for floats
X_train, X_test = prepro(X_train, X_test)

X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_data = pd.concat([train, test])


50 features are going to be dropped for being useless
Mem. usage decreased to 1003.32 Mb (12.2% reduction)
Mem. usage decreased to 845.63 Mb (12.2% reduction)


### Model and training

In [12]:
model = LGBMClassifier(metric='auc',
                       n_estimators=1000,
                       boosting_type='gbdt')

In [13]:
prun = PrunedCV(N_FOLD, 0.02, minimize=False)

In [18]:
def objective(trial):
    
    joblib.dump(study, 'study.pkl') 
    
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 1500), 
        'max_depth': trial.suggest_int('max_depth', 10, 1500), 
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 10, 3000000), 
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 100000), 
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.00000000001, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.0001, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.000001, 10.0)  
    }
    
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 300, 310), 
#         'max_depth': trial.suggest_int('max_depth', 150, 160), 
#         'subsample_for_bin': trial.suggest_int('subsample_for_bin', 290000, 291000), 
#         'min_child_samples': trial.suggest_int('min_child_samples', 75, 82), 
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.990, 0.993),
#         'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.55, 0.58),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.02, 0.03)  
#     }
    
    
    model.set_params(**params)

    return prun.cross_val_score(model, 
                                X_train, 
                                y_train, 
                                metric='auc', 
                                shuffle=True, 
                                random_state=42)

In [None]:
SEARCH_PARAMS = False

In [20]:
if SEARCH_PARAMS:
    if os.path.isfile('study.pkl'):
        study = joblib.load('study.pkl')
    else:
        study = optuna.create_study()

    study.optimize(objective, timeout=60*60*1)
    joblib.dump(study, 'study.pkl')
    best_params = study.best_params
    
else:
    
    best_params = {'num_leaves': 302,
                 'max_depth': 157,
                 'subsample_for_bin': 290858,
                 'min_child_samples': 79,
                 'reg_alpha': 0.9919573524807885,
                 'colsample_bytree': 0.5653288564015742,
                 'learning_rate': 0.028565794309535042}

[I 2019-09-05 13:54:38,914] Finished trial#3 resulted in value: -0.9131293322211296. Current best value is -0.9769934641400203 with parameters: {'num_leaves': 303, 'max_depth': 150, 'subsample_for_bin': 290623, 'min_child_samples': 76, 'reg_alpha': 0.9902037380820305, 'colsample_bytree': 0.5786227667694329, 'learning_rate': 0.02016609575045021}.
[I 2019-09-05 13:56:47,128] Finished trial#4 resulted in value: -0.9146594691052718. Current best value is -0.9769934641400203 with parameters: {'num_leaves': 303, 'max_depth': 150, 'subsample_for_bin': 290623, 'min_child_samples': 76, 'reg_alpha': 0.9902037380820305, 'colsample_bytree': 0.5786227667694329, 'learning_rate': 0.02016609575045021}.
[I 2019-09-05 13:58:46,345] Finished trial#5 resulted in value: -0.9145228225543517. Current best value is -0.9769934641400203 with parameters: {'num_leaves': 303, 'max_depth': 150, 'subsample_for_bin': 290623, 'min_child_samples': 76, 'reg_alpha': 0.9902037380820305, 'colsample_bytree': 0.5786227667694

KeyboardInterrupt: 

In [22]:
model = LGBMClassifier(metric='auc',
                       n_estimators=1000,
                       boosting_type='gbdt')

model.set_params(**best_params)

cross_val_score_auc(model,
                    X_train,
                    y_train,
                    n_fold=N_FOLD,
                    stratify=True,
                    shuffle=True,
                    random_state=42,
                    predict=True,
                    X_test=X_test,
                    submission=sample_submission)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

ROC accuracy: 0.9751879082829373, Train: 0.9999352358261676
ROC accuracy: 0.9787100821002697, Train: 0.9999107349898608
ROC accuracy: 0.9778540984278778, Train: 0.9999137696701055
ROC accuracy: 0.9783780060223369, Train: 0.9999091975176261
ROC accuracy: 0.9761992615163245, Train: 0.999916894750745
ROC accuracy: 0.9766395096557384, Train: 0.9999281378200259
ROC accuracy: 0.9777909388317408, Train: 0.9999049420126224
ROC accuracy: 0.9776969661329578, Train: 0.999904317276327



0.9773070963712729

In [23]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.5786227667694329,
 'importance_type': 'split',
 'learning_rate': 0.02016609575045021,
 'max_depth': 150,
 'min_child_samples': 76,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_leaves': 303,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.9902037380820305,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 290623,
 'subsample_freq': 0,
 'metric': 'auc'}

In [None]:
# ROC accuracy: 0.9751879082829373, Train: 0.9999352358261676
# ROC accuracy: 0.9787100821002697, Train: 0.9999107349898608
# ROC accuracy: 0.9778540984278778, Train: 0.9999137696701055
# ROC accuracy: 0.9783780060223369, Train: 0.9999091975176261
# ROC accuracy: 0.9761992615163245, Train: 0.999916894750745
# ROC accuracy: 0.9766395096557384, Train: 0.9999281378200259
# ROC accuracy: 0.9777909388317408, Train: 0.9999049420126224
# ROC accuracy: 0.9776969661329578, Train: 0.999904317276327

# 0.9773070963712729