In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import ensemble
from scipy.stats import spearmanr

import time
import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
# from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold

import utils

In [2]:
# Load the training set, then split off the label column and discard the
# row identifier so that only feature columns remain in df_train.
df_train = pd.read_csv('../data/train.csv')
target = df_train.pop('target')
df_train = df_train.drop(columns=['ID_code'])

In [3]:
# Load the test set; the ID column is kept aside for the submission file and
# removed from the feature frame.
test_file = '../data/test.csv'
df_test = pd.read_csv(test_file)
test_id = df_test['ID_code']
df_test = df_test.drop(columns=['ID_code'])

In [13]:
# Booster settings handed to xgb.train in the cross-validation cell.
params = dict(
    max_depth=6,
    colsample_bytree=0.3,
    learning_rate=0.01,
    objective='binary:logistic',
    eval_metric='auc',
    subsample=0.5,
    n_jobs=8,
)

In [27]:
%%time
# Stratified K-fold training: fit one booster per fold with early stopping on
# the held-out fold, and average the per-fold predictions on the test set.
fold_n = 5  # number of folds; also the divisor used to average test predictions

folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
y_pred_xgb = np.zeros(len(df_test))

# The test matrix is the same for every fold, so build it once up front.
dtest = xgb.DMatrix(df_test)

# The loop index must NOT reuse the name `fold_n`: doing so overwrote the fold
# count, so predictions were divided by 0, 1, 2, ... instead of by 5
# (fold 0 produced inf, silently, because warnings are suppressed).
for fold_idx, (train_index, valid_index) in enumerate(folds.split(df_train, target)):
    print('Fold', fold_idx, 'started at', time.ctime())

    dtrain = xgb.DMatrix(df_train.iloc[train_index], label=target.iloc[train_index])
    dvalid = xgb.DMatrix(df_train.iloc[valid_index], label=target.iloc[valid_index])

    # Both sets are logged; the training log shows 'eval-auc' (the last entry)
    # is the metric that drives early stopping.
    evallist = [(dtrain, 'train'), (dvalid, 'eval')]
    xgb_model = xgb.train(params, dtrain, num_boost_round=5000,
                          evals=evallist, verbose_eval=300, early_stopping_rounds=200)

    # Predict with the trees up to the best iteration and add this fold's
    # equal share of the final average.
    # NOTE(review): `ntree_limit` / `best_ntree_limit` are the API this notebook
    # was written against; newer xgboost releases appear to replace them with
    # `iteration_range` / `best_iteration` -- confirm before upgrading.
    y_pred_xgb += xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit) / fold_n

Fold 0 started at Wed Mar  6 03:46:21 2019
[0]	train-auc:0.610269	eval-auc:0.611468
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[300]	train-auc:0.865255	eval-auc:0.824235
[600]	train-auc:0.908248	eval-auc:0.856637
[900]	train-auc:0.932121	eval-auc:0.873129
[1200]	train-auc:0.946429	eval-auc:0.882266
[1500]	train-auc:0.956497	eval-auc:0.887872
[1800]	train-auc:0.963834	eval-auc:0.891435
[2100]	train-auc:0.969633	eval-auc:0.893996
[2400]	train-auc:0.974105	eval-auc:0.895711
[2700]	train-auc:0.977761	eval-auc:0.897128
[3000]	train-auc:0.980848	eval-auc:0.898145
[3300]	train-auc:0.983401	eval-auc:0.898606
[3600]	train-auc:0.985625	eval-auc:0.899218
[3900]	train-auc:0.987527	eval-auc:0.899694
[4200]	train-auc:0.989197	eval-auc:0.899906
[4500]	train-auc:0.990675	eval-auc:0.900207
[4800]	train-auc:0.991893	eval-auc:0.900423
[4999]	train-auc:0.992632	eval-auc:0.900544
Fold 1 started at Wed Mar  6 

In [28]:
# Pair every test ID with its fold-averaged XGBoost prediction and write the
# result out as the submission CSV (without the index column).
submission_xgb = pd.DataFrame({"ID_code": test_id}).assign(target=y_pred_xgb)
submission_xgb.to_csv('../results/submission_xgb.csv', index=False)

In [None]:
## Benchmarks and things to try
# Reference cross-validation (CV) scores by model:

# LightGBM            0.89804
# XGBoost             0.89712
# FeedForward Net     0.87236
# Logistic Regression 0.89096

# https://www.kaggle.com/bogorodvo/starter-code-saving-and-loading-lgb-xgb-cb/output

In [None]:
# why insight
# your background
# career goals

