In [1]:
import os
import gc
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

from lightgbm import LGBMClassifier
import optuna
from prunedcv import PrunedCV

from codes.utils import import_data, drop_columns, cross_val_score_auc, reduce_mem_usage
from codes.fe_browser import latest
from codes.fe_emails import proton, mappings, labeling
from codes.fe_cards import stats
from codes.fe_date import dates
from codes.fe_relatives import divisions
from codes.fe_categorical import pairs, wtf
from codes.prepro import prepro
from codes.fe_users import users_stats

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Directory passed to import_data() when loading the input frames.
DATA_PATH = '../input/'
# True  -> run the Optuna hyper-parameter search further down;
# False -> skip the search and use the hard-coded best_params instead.
SEARCH_PARAMS = False
# Number of cross-validation folds; used by PrunedCV and cross_val_score_auc.
N_FOLD = 8

In [3]:
# import_data (project helper) returns three objects for DATA_PATH, bound here to
# the training frame, the test frame and the sample-submission template.
train, test, sample_submission = import_data(DATA_PATH)

### Some Feature Engineering

This section drops columns, adds count encodings and aggregations, and fills missing values.

In [4]:
# --- Stage 1: user statistics, column pruning, browser and e-mail features ---
# Every helper takes (train, test) and hands back the updated pair; the order
# of the calls is kept exactly as in the original pipeline.
train, test = users_stats(train, test)
train, test = drop_columns(train, test)
train, test = latest(train, test)
train, test = proton(train, test)

# Per-row count of missing values, taken at this point of the pipeline
# (after proton(), before the e-mail mappings/labelling below).
train['nulls1'] = train.isna().sum(axis='columns')
test['nulls1'] = test.isna().sum(axis='columns')

# --- Stage 2: remaining feature-engineering helpers ---
train, test = mappings(train, test)
train, test = labeling(train, test)
train, test = stats(train, test)
train, test = divisions(train, test)
train, test = dates(train, test)
train, test = pairs(train, test)
train, test = wtf(train, test)

# --- Stage 3: split off the target and build the model matrices ---
y_train = train['isFraud'].copy()
X_train = train.drop(columns='isFraud')
X_test = test.copy()

# The combined frames are no longer needed once X_*/y_train exist.
del train, test

# Per the original author's note, prepro fills in the mean for float columns.
X_train, X_test = prepro(X_train, X_test)

# Shrink both matrices (the helper reports the memory reduction it achieved).
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_data = pd.concat([train, test])


50 features are going to be dropped for being useless
Mem. usage decreased to 1003.32 Mb (12.2% reduction)
Mem. usage decreased to 845.63 Mb (12.2% reduction)


### Model and training

In [5]:
# Baseline LightGBM classifier: gradient-boosted trees, 2000 estimators,
# AUC as the metric and is_unbalance switched on.
model = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=2000,
    metric='auc',
    is_unbalance=True,
)

In [6]:
# Pruned cross-validation helper used by objective() for every trial.
# Positional arguments: N_FOLD folds and 0.02 (presumably the pruning
# tolerance — TODO confirm against the prunedcv documentation); minimize=False.
prun = PrunedCV(N_FOLD, 0.02, minimize=False)

def objective(trial):
    """Optuna objective: score one sampled LightGBM configuration with pruned CV.

    Parameters
    ----------
    trial
        The Optuna trial object; used only to draw hyper-parameter suggestions.

    Returns
    -------
    Whatever ``prun.cross_val_score`` returns for the sampled model.

    Notes
    -----
    Depends on notebook-level globals: ``study`` (must exist before
    ``study.optimize`` calls this function), ``prun``, ``X_train`` and
    ``y_train``.
    """
    # Checkpoint the study at the start of every trial so a long search that is
    # interrupted can be picked up again from 'study.pkl'.
    joblib.dump(study, 'study.pkl')

    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 5000),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 1000, 3000000),
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 100000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.00000000001, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.0001, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.000001, 10.0)
    }

    # NOTE(review): this is a fresh default-constructed classifier, so only the
    # sampled params differ from LightGBM defaults; the notebook-level `model`
    # settings (n_estimators, is_unbalance, ...) are not applied here.
    model = LGBMClassifier()
    model.set_params(**params)

    # Fix: the target was referenced as the undefined name `y`; the notebook
    # stores it as `y_train`, so every trial raised NameError on a fresh kernel.
    return prun.cross_val_score(model,
                                X_train,
                                y_train,
                                metric='auc',
                                shuffle=True,
                                random_state=42)

In [7]:
if not SEARCH_PARAMS:
    # Search disabled: fall back to the hard-coded parameter set.
    best_params = {
        'num_leaves': 302,
        'max_depth': 157,
        'n_estimators': 1200,
        'subsample_for_bin': 290858,
        'min_child_samples': 79,
        'reg_alpha': 1.0919573524807885,
        'colsample_bytree': 0.5653288564015742,
        'learning_rate': 0.028565794309535042,
    }
else:
    # Resume a previously pickled study when one is on disk, otherwise start fresh.
    study = joblib.load('study.pkl') if os.path.isfile('study.pkl') else optuna.create_study()

    # Run the search for at most seven hours, then persist the study and
    # take its best parameter set.
    study.optimize(objective, timeout=7 * 60 * 60)
    joblib.dump(study, 'study.pkl')
    best_params = study.best_params

In [8]:
# Display the parameters of the baseline `model` built earlier; best_params
# have not been applied to it at this point.
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 2000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'auc',
 'is_unbalance': True}

In [9]:
# Rebuild the baseline classifier and overlay the selected hyper-parameters;
# any key present in best_params replaces the constructor value.
model = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=2000,
    metric='auc',
    is_unbalance=True,
).set_params(**best_params)

# Stratified, shuffled N_FOLD cross-validation; predict=True together with
# X_test and the submission template asks the helper to produce test predictions.
cross_val_score_auc(
    model,
    X_train,
    y_train,
    n_fold=N_FOLD,
    stratify=True,
    shuffle=True,
    random_state=42,
    predict=True,
    X_test=X_test,
    submission=sample_submission,
)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# ROC accuracy: 0.9751335550235447, Train: 0.9999994424014916
# ROC accuracy: 0.979350658169819, Train: 0.9999995681855991
# ROC accuracy: 0.978078995160897, Train: 0.9999994764541557
# ROC accuracy: 0.9785157994968532, Train: 0.9999991995960142
# ROC accuracy: 0.9763520549904333, Train: 0.999999053180651
# ROC accuracy: 0.9769807209581446, Train: 0.9999992919944019
# ROC accuracy: 0.977527478618695, Train: 0.9999993639818112
# ROC accuracy: 0.9786526157982463, Train: 0.9999991182307375

# Mean of the eight fold ROC scores above: 0.9775739847770791