In [1]:
import os
import gc
import numpy as np
import pandas as pd
import joblib

from lightgbm import LGBMClassifier
import optuna

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score

from ieee import utils
from ieee import fe_browser
from ieee import fe_emails
from ieee import fe_cards
from ieee import fe_date
from ieee import fe_relatives
from ieee import fe_categorical
from ieee import prepro

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.
W0812 22:55:14.534492 4602312128 deprecation_wrapper.py:119] From /Users/piotrgabrys/.pyenv/versions/miniconda3-4.3.30/lib/python3.6/site-packages/optuna/integration/tensorflow.py:7: The name tf.train.SessionRunHook is deprecated. Please use tf.estimator.SessionRunHook instead.



In [2]:
# Directory containing the competition input files (relative to the notebook);
# handed to utils.import_data when the data is loaded.
DATA_PATH = '../input/'

In [3]:
# Load the train frame, test frame and sample submission through the project
# helper. File names and parsing options live in ieee.utils and are not visible here.
train, test, sample_submission = utils.import_data(DATA_PATH)

### Some Feature Engineering

This section drops unused columns, then applies count encoding, aggregations, and missing-value filling (fillna).

In [4]:
# Project helper applied to both frames; the recorded output of this cell
# reports that 50 features are dropped.
train, test = utils.drop_columns(train, test)

50 features are going to be dropped for being useless


In [5]:
# Browser feature step from the project's fe_browser module.
# NOTE(review): presumably flags whether the browser is a recent version —
# confirm in fe_browser.latest.
train, test = fe_browser.latest(train, test)

In [6]:
# First email transform from the project's fe_emails module.
# NOTE(review): presumably a ProtonMail indicator — confirm in fe_emails.proton.
train, test = fe_emails.proton(train, test)

# Per-row count of missing values, measured at this exact point of the
# pipeline (after `proton`, before the remaining email transforms).
for frame in (train, test):
    frame['nulls1'] = frame.isna().sum(axis=1)
del frame  # do not keep an extra reference to the test frame alive

# Remaining email transforms; each one takes and returns both frames.
train, test = fe_emails.mappings(train, test)
train, test = fe_emails.labeling(train, test)

In [7]:
# Card feature step from the project's fe_cards module.
# NOTE(review): presumably per-card aggregate statistics — confirm in fe_cards.stats.
train, test = fe_cards.stats(train, test)

In [8]:
# Relative-feature step from the project's fe_relatives module.
# NOTE(review): presumably ratio (division) features — confirm in fe_relatives.divisions.
train, test = fe_relatives.divisions(train, test)

In [9]:
# Date feature step from the project's fe_date module (details are defined in
# fe_date.dates and not visible here).
train, test = fe_date.dates(train, test)

In [10]:
# Categorical feature steps from the project's fe_categorical module, applied
# in this order: `pairs` first, then `wtf`.
# NOTE(review): `pairs` presumably combines categorical columns pairwise; the
# purpose of `wtf` cannot be inferred from its name — confirm both in fe_categorical.
train, test = fe_categorical.pairs(train, test)
train, test = fe_categorical.wtf(train, test)

In [11]:
# Features: every column except the label; the test frame is copied unchanged.
X_train = train.drop(labels='isFraud', axis='columns')
X_test = test.copy()

# Label column, copied so it does not share data with `train`.
y_train = train['isFraud'].copy()

# The combined frames are not needed once X / y exist.
del train
del test

# Project preprocessing applied to both feature frames.
# (Original note: "fill in mean for floats" — the behaviour lives in prepro.prepro.)
X_train, X_test = prepro.prepro(X_train, X_test)

### Model and training

In [12]:
# Work on a copy of the sample submission with every prediction zeroed;
# the training loop fills in the fold-averaged predictions.
submission = sample_submission.assign(isFraud=0)

In [13]:
# Single shared estimator instance: the tuning objective and the final training
# loop both reconfigure it in place through model.set_params(...).
model = LGBMClassifier(metric='auc')

In [14]:
def objective(trial):
    """Optuna objective: negated mean ROC AUC of a 4-fold CV for one sampled configuration.

    Relies on notebook globals: `study` (checkpointed on entry), `model`
    (reconfigured in place) and `X_train` / `y_train`.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.

    Returns:
        The negative of the mean cross-validated ROC AUC, so that a study
        which minimises its objective favours a higher AUC.
    """
    # Checkpoint the study before the expensive CV run starts.
    joblib.dump(study, 'study.pkl')

    # Values are sampled top to bottom, i.e. in key order.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 200),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 2000, 300_000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 1000),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-0),
    }
    model.set_params(**params)

    fold_scores = cross_val_score(model, X_train, y_train, cv=4, scoring='roc_auc')
    return -np.mean(fold_scores)

In [15]:
# Resume a previous search if a checkpoint exists, otherwise start a new study.
# NOTE: joblib.load unpickles the file, so 'study.pkl' must come from a trusted source.
if os.path.isfile('study.pkl'):
    study = joblib.load('study.pkl')
else:
    study = optuna.create_study()

try:
    # Time-boxed search: 8 hours.
    study.optimize(objective, timeout=60*60*8)
finally:
    # `objective` only checkpoints when a trial *starts*, so without this final
    # dump the trials completed since the last checkpoint (including the very
    # last one) would never reach disk. Also runs if the search is interrupted.
    joblib.dump(study, 'study.pkl')

In [16]:
# Show the hyperparameters of the best trial found so far; the final training
# loop applies exactly this dict to the model.
print(study.best_params)

{'num_leaves': 116, 'max_depth': 78, 'n_estimators': 273, 'subsample_for_bin': 87043, 'min_child_samples': 1000, 'reg_alpha': 0.9028845416568297, 'colsample_bytree': 0.6967754069828626, 'learning_rate': 0.0442974258725275}


In [17]:
n_fold = 8
# Fixed seed so the shuffled fold assignment (and therefore the reported
# scores and the blended predictions) can be reproduced on a re-run.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

# The tuned hyperparameters are identical for every fold, so apply them once.
model.set_params(**study.best_params)

# Accumulate the blend locally and assign it at the end: re-running this cell
# then overwrites the column instead of stacking new predictions on old ones.
test_pred = 0.0

for train_index, valid_index in folds.split(X_train):
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    model.fit(X_train_, y_train_)
    del X_train_, y_train_  # release the fold's training slice before predicting
    pred = model.predict_proba(X_test)[:, 1]
    val = model.predict_proba(X_valid)[:, 1]
    del X_valid
    # The printed value is the fold's ROC AUC; the label text is kept unchanged
    # so it matches the output already recorded in the notebook.
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val, y_valid
    # Each fold contributes an equal 1/n_fold share to the final prediction.
    test_pred = test_pred + pred / n_fold
    del pred

submission['isFraud'] = test_pred

ROC accuracy: 0.9582742592595882
ROC accuracy: 0.9637357240766808
ROC accuracy: 0.9583961402329393
ROC accuracy: 0.9582335030618865
ROC accuracy: 0.9594001456607438
ROC accuracy: 0.9619460730980097
ROC accuracy: 0.9609497665977594
ROC accuracy: 0.9577101737225144


In [18]:
# Write the blended predictions to disk.
# NOTE(review): to_csv also writes the index by default. This is only the desired
# layout if utils.import_data loaded the sample submission with the ID column as
# its index — confirm, otherwise the file gains an extra unnamed column.
submission.to_csv('submission.csv')