In [1]:
import os
import gc
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

from lightgbm import LGBMClassifier
import optuna
from prunedcv import PrunedCV

from codes.utils import import_data, drop_columns, cross_val_score_auc, reduce_mem_usage
from codes.fe_browser import latest
from codes.fe_emails import proton, mappings, labeling
from codes.fe_cards import stats
from codes.fe_date import dates
from codes.fe_relatives import divisions
from codes.fe_categorical import pairs, wtf
from codes.prepro import prepro
from codes.fe_users import users_stats

from sklearn.feature_selection import RFECV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Notebook-wide settings, collected in one place.
DATA_PATH = '../input/'  # location handed to import_data
N_FOLD = 8               # passed to PrunedCV and as n_fold to cross_val_score_auc
SEARCH_PARAMS = True     # True: run the Optuna search; False: use the stored parameters

In [3]:
# Load the inputs from DATA_PATH. `import_data` (codes.utils) returns three
# objects, unpacked here as the train frame, the test frame and the sample
# submission that is later handed to cross_val_score_auc.
train, test, sample_submission = import_data(DATA_PATH)

### Some Feature Engineering

This section drops columns, applies count encoding and aggregations, and fills missing values (`fillna`).

In [4]:
# Every feature-engineering helper takes the (train, test) pair and returns the
# updated pair. The helpers run in exactly this order; the per-row null count
# is taken in between, right after `proton` and before `mappings`.
for _step in (users_stats, drop_columns, latest, proton):
    train, test = _step(train, test)

# Number of missing values per row, over the columns present at this point.
train['nulls1'] = train.isna().sum(axis='columns')
test['nulls1'] = test.isna().sum(axis='columns')

for _step in (mappings, labeling, stats, divisions, dates, pairs, wtf):
    train, test = _step(train, test)
del _step

# Separate the target from the training features; the test frame is copied as is.
X_train = train.drop(columns='isFraud')
y_train = train['isFraud'].copy()
X_test = test.copy()

# Release the names of the original frames; only X_train / y_train / X_test
# are used from here on.
del train, test

# fill in mean for floats (original author's note on what `prepro` does)
X_train, X_test = prepro(X_train, X_test)

# Shrink both frames one after the other; the helper prints the reduction.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_data = pd.concat([train, test])


50 features are going to be dropped for being useless
Mem. usage decreased to 1003.32 Mb (12.2% reduction)
Mem. usage decreased to 845.63 Mb (12.2% reduction)


In [5]:
# columns = list(set(
# ['C{}'.format(i) for i in range(1,15)] \
# + ['D{}'.format(i) for i in range(1,16)] \
# + ['V' + str(i) for i in range(1,340)]))

# for col in columns:
#     if col in X_train.columns:
#         X_train[col + '_' + 'trx'] = X_train.TransactionAmt * X_train[col]
#         X_test[col + '_' + 'trx'] = X_test.TransactionAmt * X_test[col]

### Model and training

In [6]:
# Overwrite +inf and then -inf with the sentinel -1, in place, first in the
# training frame and then in the test frame.
for _frame in (X_train, X_test):
    for _infinite in (np.inf, -np.inf):
        _frame[_frame == _infinite] = -1

# Remove the loop names so they do not keep an extra reference to the frames
# once X_train / X_test are rebound later in the notebook.
del _frame, _infinite

In [7]:
# Remove the TransactionDT and TransactionAmt columns from both feature frames
# before feature selection and modelling.
#
# errors='ignore' keeps the cell re-runnable: executing it a second time (when
# the columns are already gone) is a no-op instead of raising KeyError. The
# frames are rebound to the result rather than mutated with inplace=True.
_raw_columns = ['TransactionDT', 'TransactionAmt']
X_train = X_train.drop(columns=_raw_columns, errors='ignore')
X_test = X_test.drop(columns=_raw_columns, errors='ignore')

In [8]:
# Hyper-parameters applied to the estimator that drives feature selection.
# `best_params` stays defined afterwards and is reused further down when the
# parameter search does not replace it.
best_params = dict(
    num_leaves=302,
    max_depth=157,
    subsample_for_bin=290858,
    min_child_samples=79,
    reg_alpha=0.9919573524807885,
    colsample_bytree=0.5653288564015742,
    learning_rate=0.028565794309535042,
)

mod = LGBMClassifier(boosting_type='gbdt', metric='auc')
mod.set_params(**best_params)

# Recursive feature elimination with cross-validation: 25 features are removed
# per step, at least 150 are kept, and candidates are compared with 4-fold
# ROC AUC. verbose=1 produces the "Fitting estimator with N features." lines.
rfe = RFECV(
    estimator=mod,
    step=25,
    min_features_to_select=150,
    cv=4,
    scoring='roc_auc',
    verbose=1,
)
rfe.fit(X_train, y_train)

# Keep only the selected features in both sets.
# NOTE(review): transform presumably returns plain arrays, so column names are
# no longer available after this point - confirm nothing downstream needs them.
X_train = rfe.transform(X_train)
X_test = rfe.transform(X_test)

Fitting estimator with 431 features.
Fitting estimator with 406 features.
Fitting estimator with 381 features.
Fitting estimator with 356 features.
Fitting estimator with 331 features.
Fitting estimator with 306 features.
Fitting estimator with 281 features.
Fitting estimator with 256 features.
Fitting estimator with 231 features.
Fitting estimator with 206 features.
Fitting estimator with 181 features.
Fitting estimator with 156 features.
Fitting estimator with 131 features.
Fitting estimator with 106 features.
Fitting estimator with 81 features.
Fitting estimator with 56 features.
Fitting estimator with 431 features.
Fitting estimator with 406 features.
Fitting estimator with 381 features.
Fitting estimator with 356 features.
Fitting estimator with 331 features.
Fitting estimator with 306 features.
Fitting estimator with 281 features.
Fitting estimator with 256 features.
Fitting estimator with 231 features.
Fitting estimator with 206 features.
Fitting estimator with 181 features.
Fit

In [9]:
# Base classifier shared by the Optuna objective and the final cross-validation;
# both call set_params on this same instance before fitting.
model = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=1000,
    metric='auc',
)

In [15]:
# Toggle for the hyper-parameter search below.
# Respect a value that is already defined (the configuration cell near the top
# sets it) instead of unconditionally forcing True, which would silently
# override a deliberate SEARCH_PARAMS = False and start the long search anyway.
# Only when this cell is run with no prior definition does it default to True.
SEARCH_PARAMS = globals().get('SEARCH_PARAMS', True)

In [16]:
# Pruned cross-validation helper used by the Optuna objective.
# Positional arguments are N_FOLD and 0.02 - presumably the number of folds and
# the pruning tolerance; TODO confirm against the prunedcv documentation.
# minimize=False marks the metric (AUC) as one to maximise. NOTE(review): the
# trial values logged by Optuna are negative, so the score appears to be
# returned negated for a minimising study - confirm.
prun = PrunedCV(N_FOLD, 0.02, minimize=False)

In [17]:
def objective(trial):
    """Optuna objective: score one sampled LightGBM configuration with pruned CV.

    Uses these notebook globals: ``study`` (checkpointed to disk), ``model``
    (reconfigured in place), ``prun``, ``X_train`` and ``y_train``.

    Parameters
    ----------
    trial
        Optuna trial object; its ``suggest_int`` / ``suggest_loguniform``
        methods provide the hyper-parameter values.

    Returns
    -------
    The value returned by ``prun.cross_val_score`` for the sampled parameters.
    """
    # Checkpoint the study at the start of every trial so an interrupted search
    # can be resumed from 'study.pkl'.
    joblib.dump(study, 'study.pkl')

    # Search space as (name, low, high). Integer parameters are sampled first,
    # then the log-uniform ones, in the order listed.
    # A narrower space was used earlier and is kept here for reference:
    #   num_leaves 300-310, max_depth 150-160, subsample_for_bin 290000-291000,
    #   min_child_samples 75-82, reg_alpha 0.990-0.993,
    #   colsample_bytree 0.55-0.58, learning_rate 0.02-0.03.
    integer_ranges = (
        ('num_leaves', 10, 1500),
        ('max_depth', 10, 1500),
        ('subsample_for_bin', 10, 3000000),
        ('min_child_samples', 2, 100000),
    )
    log_uniform_ranges = (
        ('reg_alpha', 1e-11, 10.0),
        ('colsample_bytree', 1e-4, 1.0),
        ('learning_rate', 1e-6, 10.0),
    )

    sampled = {}
    for name, low, high in integer_ranges:
        sampled[name] = trial.suggest_int(name, low, high)
    for name, low, high in log_uniform_ranges:
        sampled[name] = trial.suggest_loguniform(name, low, high)

    # The shared estimator is reconfigured in place for this trial.
    model.set_params(**sampled)

    score = prun.cross_val_score(
        model,
        X_train,
        y_train,
        metric='auc',
        shuffle=True,
        random_state=42,
    )
    return score

In [None]:
if not SEARCH_PARAMS:
    # Search skipped: fall back to the stored parameter set.
    best_params = {
        'num_leaves': 302,
        'max_depth': 157,
        'subsample_for_bin': 290858,
        'min_child_samples': 79,
        'reg_alpha': 0.9919573524807885,
        'colsample_bytree': 0.5653288564015742,
        'learning_rate': 0.028565794309535042,
    }
else:
    # Resume the checkpointed study when one exists, otherwise start a new one.
    # NOTE(review): joblib.load unpickles the file - only load a 'study.pkl'
    # that this notebook produced itself.
    study = joblib.load('study.pkl') if os.path.isfile('study.pkl') else optuna.create_study()

    # Run trials for up to 12 hours (timeout is given in seconds), then persist
    # the final state and take the best parameters found.
    study.optimize(objective, timeout=12 * 60 * 60)
    joblib.dump(study, 'study.pkl')
    best_params = study.best_params

[I 2019-09-06 19:03:19,290] Finished trial#1 resulted in value: -0.8781856921688671. Current best value is -0.8781856921688671 with parameters: {'num_leaves': 528, 'max_depth': 1168, 'subsample_for_bin': 2068159, 'min_child_samples': 56081, 'reg_alpha': 0.83937678064185, 'colsample_bytree': 0.006678515656431696, 'learning_rate': 0.8799825205929812}.
[I 2019-09-06 19:04:12,883] Finished trial#2 resulted in value: -0.8549639254593047. Current best value is -0.8781856921688671 with parameters: {'num_leaves': 528, 'max_depth': 1168, 'subsample_for_bin': 2068159, 'min_child_samples': 56081, 'reg_alpha': 0.83937678064185, 'colsample_bytree': 0.006678515656431696, 'learning_rate': 0.8799825205929812}.
[I 2019-09-06 19:06:49,213] Finished trial#3 resulted in value: -0.8828669994139955. Current best value is -0.8828669994139955 with parameters: {'num_leaves': 498, 'max_depth': 1259, 'subsample_for_bin': 2883343, 'min_child_samples': 47979, 'reg_alpha': 0.6309799889159281, 'colsample_bytree': 0.

[I 2019-09-06 19:57:11,485] Finished trial#23 resulted in value: -0.9075310511838979. Current best value is -0.9563322847235526 with parameters: {'num_leaves': 1499, 'max_depth': 763, 'subsample_for_bin': 528261, 'min_child_samples': 19159, 'reg_alpha': 2.7619228387137966e-06, 'colsample_bytree': 0.8840719353100626, 'learning_rate': 0.1643925815426876}.
[I 2019-09-06 19:58:20,683] Finished trial#24 resulted in value: -0.8468890640832109. Current best value is -0.9563322847235526 with parameters: {'num_leaves': 1499, 'max_depth': 763, 'subsample_for_bin': 528261, 'min_child_samples': 19159, 'reg_alpha': 2.7619228387137966e-06, 'colsample_bytree': 0.8840719353100626, 'learning_rate': 0.1643925815426876}.
[I 2019-09-06 19:58:43,438] Finished trial#25 resulted in value: -0.6310749025162395. Current best value is -0.9563322847235526 with parameters: {'num_leaves': 1499, 'max_depth': 763, 'subsample_for_bin': 528261, 'min_child_samples': 19159, 'reg_alpha': 2.7619228387137966e-06, 'colsample

[I 2019-09-06 22:55:01,506] Finished trial#46 resulted in value: -0.8447702258791796. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree': 0.187934424563022, 'learning_rate': 0.06975277636212471}.
[I 2019-09-06 23:25:24,615] Finished trial#47 resulted in value: -0.973900000889306. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree': 0.187934424563022, 'learning_rate': 0.06975277636212471}.
[I 2019-09-06 23:26:13,517] Finished trial#48 resulted in value: -0.5348944234121003. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree': 

[I 2019-09-07 02:54:26,159] Finished trial#69 resulted in value: -0.9232238505756701. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree': 0.187934424563022, 'learning_rate': 0.06975277636212471}.
[I 2019-09-07 03:09:24,300] Finished trial#70 resulted in value: -0.9695292568118152. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree': 0.187934424563022, 'learning_rate': 0.06975277636212471}.
[I 2019-09-07 03:10:43,398] Finished trial#71 resulted in value: -0.9527680204906781. Current best value is -0.9750170068588928 with parameters: {'num_leaves': 854, 'max_depth': 1373, 'subsample_for_bin': 1793070, 'min_child_samples': 99, 'reg_alpha': 6.553205857697253e-10, 'colsample_bytree':

In [None]:
# Apply the chosen hyper-parameters to the shared estimator.
model.set_params(**best_params)

# Options for the final cross-validation run: N_FOLD folds, stratified and
# shuffled with a fixed seed, with prediction switched on.
cv_settings = dict(
    n_fold=N_FOLD,
    stratify=True,
    shuffle=True,
    random_state=42,
    predict=True,
)

# The call stays the last expression of the cell so its result is displayed.
# The test features and the sample submission are passed for the predict step.
cross_val_score_auc(
    model,
    X_train,
    y_train,
    X_test=X_test,
    submission=sample_submission,
    **cv_settings,
)

In [19]:
# ROC accuracy: 0.9752166854560683, Train: 0.9999880028138726
# ROC accuracy: 0.978549489713329, Train: 0.9999846402519508
# ROC accuracy: 0.9775330875670358, Train: 0.9999857411401932
# ROC accuracy: 0.9779064734264544, Train: 0.9999820012259492
# ROC accuracy: 0.9759618973923397, Train: 0.9999879506811296
# ROC accuracy: 0.9760439850075724, Train: 0.999982252164232
# ROC accuracy: 0.9777317260455965, Train: 0.9999817054370517
# ROC accuracy: 0.9770691545023485, Train: 0.9999847023569826


# 0.9770015623888431