In [1]:
import os
import gc
import numpy as np
import pandas as pd
import joblib
from datetime import datetime

from lightgbm import LGBMClassifier
import optuna
from prunedcv import PrunedCV

from codes.utils import import_data, drop_columns, cross_val_score_auc, reduce_mem_usage
from codes.fe_browser import latest
from codes.fe_emails import proton, mappings, labeling
from codes.fe_cards import stats
from codes.fe_date import dates
from codes.fe_relatives import divisions
from codes.fe_categorical import pairs, wtf
from codes.prepro import prepro
from codes.fe_users import users_stats

from sklearn.feature_selection import RFECV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Relative path handed to import_data() to locate the input files.
DATA_PATH = '../input/'

# Switches for the two expensive, optional stages. With both off the notebook
# uses the hard-coded hyper-parameters and the column list in 'columns.pkl'.
SEARCH_PARAMS, SEARCH_FEATURES = False, False

# Fold count shared by the pruned CV helper and the final cross-validation.
N_FOLD = 8

In [3]:
# Load the train and test frames plus the sample submission from DATA_PATH.
(train,
 test,
 sample_submission) = import_data(DATA_PATH)

### Some Feature Engineering

Steps in this section: dropping columns, count encoding, aggregation, and filling missing values (`fillna`).

In [4]:
# Every feature-engineering helper takes (train, test) and returns the updated
# pair, so the helpers are applied as ordered pipelines.
for step in (users_stats, drop_columns, latest, proton):
    train, test = step(train, test)

# Per-row count of missing values, taken at this exact point of the pipeline
# (after `proton`, before `mappings`).
train['nulls1'] = train.isna().sum(axis=1)
test['nulls1'] = test.isna().sum(axis=1)

for step in (mappings, labeling, stats, divisions, dates, pairs, wtf):
    train, test = step(train, test)

# Split the target off the training frame.
y_train = train['isFraud'].copy()
X_train = train.drop(columns='isFraud')
X_test = test.copy()

# The combined frames are no longer needed once the X/y splits exist.
del train, test

# Original note on this step: "fill in mean for floats".
# NOTE(review): confirm the exact behaviour in codes.prepro.
X_train, X_test = prepro(X_train, X_test)

# Shrink both frames' memory footprint (train first, then test).
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_data = pd.concat([train, test])


50 features are going to be dropped for being useless
Mem. usage decreased to 1003.32 Mb (12.2% reduction)
Mem. usage decreased to 845.63 Mb (12.2% reduction)


### Model and training

In [5]:
# Overwrite +inf and -inf cells with the sentinel -1, in place, for the
# training frame first and then the test frame.
for frame in (X_train, X_test):
    for infinite in (np.inf, -np.inf):
        frame[frame == infinite] = -1

In [6]:
# Feature selection: either run recursive feature elimination and cache the
# chosen column names in 'columns.pkl', or reuse the cached list.
if SEARCH_FEATURES:
    # Hyper-parameters for the estimator that drives the elimination.
    best_params = {'num_leaves': 302,
                   'max_depth': 157,
                   'subsample_for_bin': 290858,
                   'min_child_samples': 79,
                   'reg_alpha': 0.9919573524807885,
                   'colsample_bytree': 0.5653288564015742,
                   'learning_rate': 0.028565794309535042}
    mod = LGBMClassifier(metric='auc',
                         boosting_type='gbdt')
    mod.set_params(**best_params)

    # Drop 25 features per round, keep at least 150, score by ROC AUC on 4 folds.
    rfe = RFECV(mod, step=25, min_features_to_select=150, cv=4, scoring='roc_auc', verbose=1)
    rfe.fit(X_train, y_train)

    # The support mask is positional and aligned with the frame the selector
    # was fitted on, so it must index X_train's columns (not X_test's, whose
    # column order is not guaranteed to match).
    columns = list(X_train.columns[rfe.get_support()])
    joblib.dump(columns, 'columns.pkl')
else:
    # NOTE: 'columns.pkl' is a pickle; only load a file produced by this notebook.
    columns = joblib.load('columns.pkl')
    # Force-include TransactionAmt, but never twice: a duplicated label would
    # make `.loc[:, columns]` return the column two times.
    if 'TransactionAmt' not in columns:
        columns.append('TransactionAmt')

# Restrict both frames to the selected columns (selection is by name).
X_train = X_train.loc[:, columns]
X_test = X_test.loc[:, columns]

In [7]:
# Shared LightGBM classifier; `objective` and the final CV cell reconfigure it
# through set_params().
model = LGBMClassifier(boosting_type='gbdt', n_estimators=1000, metric='auc')

In [8]:
# Pruned cross-validation helper that `objective` uses to score each trial.
# NOTE(review): the positional arguments are presumably the number of folds
# (N_FOLD) and the pruning tolerance (0.02) -- confirm against the prunedcv
# documentation. minimize=False marks the score as higher-is-better.
prun = PrunedCV(N_FOLD, 0.02, minimize=False)

In [9]:
def objective(trial):
    """Optuna objective: score one sampled LightGBM configuration.

    Draws seven hyper-parameters from ``trial``, applies them to the shared
    ``model`` and returns whatever ``prun.cross_val_score`` reports for the
    'auc' metric on shuffled folds (random_state=42).

    Relies on the notebook globals ``study``, ``model``, ``prun``,
    ``X_train`` and ``y_train``.
    """
    # Checkpoint the study at the start of every trial; the search cell
    # reloads 'study.pkl' when it exists.
    joblib.dump(study, 'study.pkl')

    # (name, low, high) specs, sampled in this exact order.
    integer_space = (
        ('num_leaves', 10, 1500),
        ('max_depth', 10, 1500),
        ('subsample_for_bin', 10, 3000000),
        ('min_child_samples', 2, 100000),
    )
    log_uniform_space = (
        ('reg_alpha', 1e-11, 10.0),
        ('colsample_bytree', 1e-4, 1.0),
        ('learning_rate', 1e-6, 10.0),
    )
    # Alternative narrow ranges kept from the original notebook (disabled):
    # num_leaves 300-310, max_depth 150-160, subsample_for_bin 290000-291000,
    # min_child_samples 75-82, reg_alpha 0.990-0.993,
    # colsample_bytree 0.55-0.58, learning_rate 0.02-0.03.

    sampled = {}
    for name, low, high in integer_space:
        sampled[name] = trial.suggest_int(name, low, high)
    for name, low, high in log_uniform_space:
        sampled[name] = trial.suggest_loguniform(name, low, high)

    model.set_params(**sampled)

    score = prun.cross_val_score(model,
                                 X_train,
                                 y_train,
                                 metric='auc',
                                 shuffle=True,
                                 random_state=42)
    return score

In [10]:
if not SEARCH_PARAMS:
    # Search disabled: use the hard-coded hyper-parameters.
    best_params = dict(
        num_leaves=302,
        max_depth=157,
        subsample_for_bin=290858,
        min_child_samples=79,
        reg_alpha=0.9919573524807885,
        colsample_bytree=0.5653288564015742,
        learning_rate=0.028565794309535042,
    )
else:
    # Continue a checkpointed study when one is on disk, otherwise start fresh.
    # `study` must stay a notebook-level name because `objective` pickles it.
    study = joblib.load('study.pkl') if os.path.isfile('study.pkl') else optuna.create_study()

    # Run trials for up to 12 hours, then persist the study and read the winner.
    study.optimize(objective, timeout=12 * 60 * 60)
    joblib.dump(study, 'study.pkl')
    best_params = study.best_params

In [11]:
# Configure the shared classifier with the chosen hyper-parameters.
model.set_params(**best_params)

# Options for the final stratified, shuffled CV run, which is also handed the
# test frame and the sample submission (predict=True).
cv_options = dict(
    n_fold=N_FOLD,
    stratify=True,
    shuffle=True,
    random_state=42,
    predict=True,
    X_test=X_test,
    submission=sample_submission,
)

# Keep the call as the cell's last expression so its return value is displayed.
cross_val_score_auc(model, X_train, y_train, **cv_options)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

ROC accuracy: 0.9745746256909068, Train: 0.9999864637887476
ROC accuracy: 0.9781481255938009, Train: 0.9999883082895623
ROC accuracy: 0.9778578756016881, Train: 0.9999818318499949
ROC accuracy: 0.9780804082043367, Train: 0.9999800269478806
ROC accuracy: 0.9765007886820439, Train: 0.9999855370460508
ROC accuracy: 0.9764588549506961, Train: 0.9999851612763766
ROC accuracy: 0.9778002052659194, Train: 0.9999820951007937
ROC accuracy: 0.9766845328192199, Train: 0.9999853038456252




0.9770131771010764

In [12]:
# ROC accuracy: 0.9747431963385, Train: 0.9999885797125879
# ROC accuracy: 0.9783240331977164, Train: 0.9999843937860894
# ROC accuracy: 0.9776021473477676, Train: 0.999985213046599
# ROC accuracy: 0.9776277016948994, Train: 0.9999816344110959
# ROC accuracy: 0.9763353593387132, Train: 0.9999850285854255
# ROC accuracy: 0.975954065269458, Train: 0.9999858101613444
# ROC accuracy: 0.9777313673449186, Train: 0.9999735340342005
# ROC accuracy: 0.9765956112785853, Train: 0.99998214722258


# 0.9768641852263198