<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Lode-data" data-toc-modified-id="Lode-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Lode data</a></span></li><li><span><a href="#Params" data-toc-modified-id="Params-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Params</a></span></li><li><span><a href="#Fitting" data-toc-modified-id="Fitting-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Fitting</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
# import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Lode data

In [2]:
def loaddata():
    root_dir = os.path.abspath(os.path.join(os.getcwd(),'..'))
    data_dir = root_dir + '/dataset'
    train = pd.read_csv(data_dir + '/train.csv', index_col=0)
    test = pd.read_csv(data_dir + '/test.csv', index_col=0)

    return train, test

In [3]:
data, test = loaddata()
x_data, y_data = data.iloc[:,1:], data['target']

In [4]:
print('x_data.shape: ',x_data.shape)
print('y_data.shape: ',y_data.shape)

x_data.shape:  (200000, 200)
y_data.shape:  (200000,)


# Params

In [5]:
config = {
    "seed":2019,
    "k_folds":5,
    "early_stopping_rounds":100
}

params = {
    "learning_rate": 0.1,
    "n_estimators": 10000,
    "max_depth": 3,
    "min_child_weight": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "alpha": 0,
    "lambda": 1,
    "objective": "gpu:binary:logistic",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric":"auc"
}

# Fitting

In [None]:
folds = StratifiedKFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)

auc_list = list()
prediction = np.zeros(len(test))
score_df = pd.DataFrame()
for i, (train_idx, valid_idx) in enumerate(folds.split(X=x_data, y=y_data)):
    x_train, y_train = x_data.iloc[train_idx, :], y_data.iloc[train_idx]
    x_valid, y_valid = x_data.iloc[valid_idx, :], y_data.iloc[valid_idx]
    model = xgb.XGBClassifier(**params)
    model.fit(x_train, y_train, 
              eval_set=[(x_train, y_train), (x_valid, y_valid)],  
              early_stopping_rounds=config['early_stopping_rounds'], 
              verbose=200)
    
    prob = model.predict_proba(x_valid, ntree_limit=model.best_iteration)[:,1]

    auc = metrics.roc_auc_score(y_true=y_valid, y_score=prob)
    auc_list.append(auc)
    print('{} fold validation AUC: {}'.format(i, auc))
    
    proba = model.predict_proba(test, ntree_limit=model.best_iteration)[:,1]
    prediction += proba 
    
    score_df = pd.DataFrame()
    feature_score =  model.get_booster().get_score(importance_type='gain')
    score_df.loc[:,'feature'] = list(feature_score.keys())
    score_df.loc[:,'importance'] = list(feature_score.values())
    fold_score_df = pd.concat([fold_score_df, score_df])

prediction /=  folds.n_splits

print('='*100)
print('AUC_LIST')
print(auc_list)

print('-'*100)
print('Mean AUC: {}'.format(np.mean(auc_list)))

[0]	validation_0-auc:0.606667	validation_1-auc:0.605384
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[200]	validation_0-auc:0.893565	validation_1-auc:0.863872
[400]	validation_0-auc:0.920488	validation_1-auc:0.884266
[600]	validation_0-auc:0.933232	validation_1-auc:0.891317
[800]	validation_0-auc:0.94086	validation_1-auc:0.894669
[1000]	validation_0-auc:0.946581	validation_1-auc:0.896181
[1200]	validation_0-auc:0.95128	validation_1-auc:0.896701
[1400]	validation_0-auc:0.955494	validation_1-auc:0.896823
Stopping. Best iteration:
[1320]	validation_0-auc:0.953897	validation_1-auc:0.896909

0 fold validation AUC: 0.8968917536770719


In [None]:
# root_dir = os.path.abspath(os.path.join(os.getcwd(),'..'))
# submit_dir = root_dir + '/submission'
# id = len(os.listdir(submit_dir))
# pd.DataFrame({'ID_code':test.index, 'target':proba}).to_csv('{}/s{}.csv'.format(submit_dir, id),index=False)