In [0]:
!pip install optuna


Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/6c/32/b8de89bd281c9799365a374daa3f66b94f61363be24a8647a38aa5498cfb/optuna-1.2.0.tar.gz (146kB)
[K     |████████████████████████████████| 153kB 3.3MB/s 
[?25hCollecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/e0/e9/359dbb77c35c419df0aedeb1d53e71e7e3f438ff64a8fdb048c907404de3/alembic-1.4.1.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 10.3MB/s 
[?25hCollecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/90/e4/624f02aa2fcbf6efcd9d6bf90f92836a2ae46bc4376a824e317d10506fc8/cliff-3.0.0-py3-none-any.whl (79kB)
[K     |████████████████████████████████| 81kB 7.8MB/s 
[?25hCollecting colorlog
  Downloading https://files.pythonhosted.org/packages/00/0d/22c73c2eccb21dd3498df7d22c0b1d4a30f5a5fb3feb64e1ce06bc247747/colorlog-4.1.0-py2.py3-none-any.whl
Collecting Mako
[?25l  Downloading https://files.pythonhosted.org/packages/50/78/f6ade1e18aebda570eed3

In [0]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import optuna
import functools
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc,accuracy_score,confusion_matrix,f1_score

In [0]:
train_transaction = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(train.shape)
print(test.shape)

y_train = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity

# Drop target, fill in NaNs
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test

X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   

FileNotFoundError: ignored

In [0]:
(X_train,X_eval,y_train,y_eval) = train_test_split(X_train,y_train,test_size=0.2,random_state=0)

In [0]:
def opt(X_train, y_train, X_test, y_test, trial):
    #param_list
    n_estimators = trial.suggest_int('n_estimators', 0, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    #learning_rate = trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01)
    scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1)

    xgboost_tuna = xgb.XGBClassifier(
        random_state=42, 
        tree_method='gpu_hist',
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        #learning_rate = learning_rate,
        scale_pos_weight = scale_pos_weight,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
    )
    xgboost_tuna.fit(X_train, y_train)
    tuna_pred_test = xgboost_tuna.predict(X_test)
    
    return (1.0 - (accuracy_score(y_test, tuna_pred_test)))

In [0]:
study = optuna.create_study()
study.optimize(functools.partial(opt, X_train, y_train, X_eval, y_eval), n_trials=100)

[I 2019-07-18 16:50:13,501] Finished trial#0 resulted in value: 0.011040742371388856. Current best value is 0.011040742371388856 with parameters: {'n_estimators': 841, 'max_depth': 19, 'min_child_weight': 2, 'scale_pos_weight': 58, 'subsample': 0.6, 'colsample_bytree': 0.8}.
[I 2019-07-18 16:50:36,315] Finished trial#1 resulted in value: 0.2977105699867918. Current best value is 0.011040742371388856 with parameters: {'n_estimators': 841, 'max_depth': 19, 'min_child_weight': 2, 'scale_pos_weight': 58, 'subsample': 0.6, 'colsample_bytree': 0.8}.
[I 2019-07-18 16:51:15,938] Finished trial#2 resulted in value: 0.012505503437531762. Current best value is 0.011040742371388856 with parameters: {'n_estimators': 841, 'max_depth': 19, 'min_child_weight': 2, 'scale_pos_weight': 58, 'subsample': 0.6, 'colsample_bytree': 0.8}.
[I 2019-07-18 16:52:02,652] Finished trial#3 resulted in value: 0.030251972770684477. Current best value is 0.011040742371388856 with parameters: {'n_estimators': 841, 'max_d

In [0]:
study.best_params

{'n_estimators': 874,
 'max_depth': 11,
 'min_child_weight': 3,
 'scale_pos_weight': 13,
 'subsample': 0.7,
 'colsample_bytree': 0.7}

In [0]:
clf = xgb.XGBClassifier(tree_method='gpu_hist',**study.best_params)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=3, missing=None, n_estimators=874, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=13, seed=None,
              silent=None, subsample=0.7, tree_method='gpu_hist', verbosity=1)

In [0]:
sample_submission['isFraud'] = clf.predict_proba(X_test)[:,1]
sample_submission.to_csv('submission.csv')