In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import zarr
import glob
import time
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from bayes_opt import BayesianOptimization

# Seed NumPy's global RNG so NumPy-driven randomness in later cells is repeatable.
np.random.seed(1337)

In [None]:
# Input arrays are read from the current working directory.
TRAIN_FEATURES_FILE = '2D_train0-1500_features_masks.npy'
TRAIN_LABELS_FILE = '2D_train0-1500_labels_masks.npy'
VALID_FEATURES_FILE = '2D_valid1500-1595_features_masks.npy'
VALID_LABELS_FILE = '2D_valid1500-1595_labels_masks.npy'
STAGE2_FEATURES_FILE = '2D_stg2data_masks.npy'

# Split used to fit the models.
X_train = np.load(TRAIN_FEATURES_FILE)
y_train = np.load(TRAIN_LABELS_FILE)

# Held-out split (file names say "valid"); later cells use it as the
# early-stopping eval set and for scoring.
X_test = np.load(VALID_FEATURES_FILE)
y_test = np.load(VALID_LABELS_FILE)

# Features handed to make_submit() in the last cell.
stg2_feats = np.load(STAGE2_FEATURES_FILE)

In [None]:
def xgb_train_bo(max_depth, min_child_weight, subsample, colsample_bytree):
    """Objective function for the Bayesian search over XGBoost settings.

    Fits an ``xgb.XGBRegressor`` on the notebook-level ``X_train``/``y_train``
    with early stopping monitored on ``X_test``/``y_test``, then scores the
    model on that same held-out split.

    Parameters
    ----------
    max_depth, min_child_weight : float
        Continuous values proposed by the optimiser; truncated with ``int()``.
    subsample, colsample_bytree : float
        Sampling fractions; clamped to the closed interval [0, 1].

    Returns
    -------
    float
        The negated log loss, so that a maximiser ends up minimising the loss.
    """
    def _unit_clamp(value):
        # Keep a proposed fraction inside [0, 1].
        return max(min(value, 1), 0)

    hyperparams = dict(
        max_depth=int(max_depth),
        min_child_weight=int(min_child_weight),
        subsample=_unit_clamp(subsample),
        colsample_bytree=_unit_clamp(colsample_bytree),
        n_estimators=5000,
        learning_rate=0.05,
        nthread=6,
        seed=1337,
    )
    booster = xgb.XGBRegressor(**hyperparams)

    # n_estimators is only an upper bound: fitting stops once the eval-set
    # logloss has not improved for 50 rounds.
    booster.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=10,
        eval_metric='logloss',
        early_stopping_rounds=50,
    )

    # Predict with the best iteration found by early stopping.
    held_out_pred = booster.predict(X_test, ntree_limit=booster.best_ntree_limit)
    loss = log_loss(y_test, held_out_pred)
    print('Logloss on test dataset:', loss)
    return -loss


# (lower, upper) bounds for each keyword argument of xgb_train_bo.
search_space = {
    'max_depth': (7, 14),
    'min_child_weight': (20, 40),
    'subsample': (0.4, 0.7),
    'colsample_bytree': (0.4, 0.6),
}
xgbBO = BayesianOptimization(xgb_train_bo, search_space)

# Evaluation budget forwarded to maximize(); every evaluation trains a model.
init_points = 30
num_iter = 50

xgbBO.maximize(init_points=init_points, n_iter=num_iter)

# Presumably the best objective value found, i.e. the negated logloss
# returned by xgb_train_bo -- confirm against the installed bayes_opt version.
best_value = xgbBO.res['max']['max_val']
print('XGB: %f' % best_value)

In [None]:
def xgb_train(X_train, X_valid, y_train, y_valid):
    """Fit an XGBoost regressor with a fixed set of hyper-parameters.

    Parameters
    ----------
    X_train, y_train
        Features and targets the model is fitted on.
    X_valid, y_valid
        Evaluation split whose logloss drives early stopping.

    Returns
    -------
    xgb.XGBRegressor
        The fitted estimator.
    """
    fixed_params = {
        'max_depth': 10,
        'n_estimators': 5000,
        'min_child_weight': 38,
        'learning_rate': 0.05,
        'nthread': 6,
        'subsample': 0.40,
        'colsample_bytree': 0.40,
        'seed': 1337,
    }
    regressor = xgb.XGBRegressor(**fixed_params)

    # n_estimators is a ceiling; training halts after 50 rounds without an
    # improvement of the eval-set logloss.
    watch_list = [(X_valid, y_valid)]
    regressor.fit(
        X_train, y_train,
        eval_set=watch_list,
        verbose=10,
        eval_metric='logloss',
        early_stopping_rounds=50,
    )
    return regressor

def make_submit(clf, X_test,
                sample_path='/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage2_sample_submission.csv',
                out_path='stage2_sub_2DUNet_optimXGB_ver2_05858.csv'):
    """Write a submission CSV from the model's predictions.

    Reads the sample submission, overwrites its ``cancer`` column with the
    predictions for ``X_test``, saves the result without the index and prints
    the first rows.

    Parameters
    ----------
    clf
        Fitted model exposing ``predict(X, ntree_limit=...)`` and a
        ``best_ntree_limit`` attribute.
    X_test
        Features to predict on; must yield one prediction per row of the
        sample submission.
    sample_path : str, optional
        Location of the sample submission CSV. Defaults to the original
        author's absolute path; pass your own when running elsewhere.
    out_path : str, optional
        Where the submission is written. Defaults to the original file name.

    Returns
    -------
    None
    """
    submission = pd.read_csv(sample_path)
    # Use the best iteration found by early stopping, not all fitted trees.
    pred = clf.predict(X_test, ntree_limit=clf.best_ntree_limit)
    submission['cancer'] = pred
    submission.to_csv(out_path, index=False)
    print(submission.head())

In [None]:
# The split held in X_test / y_test is passed as the early-stopping eval set.
clf = xgb_train(X_train=X_train, X_valid=X_test, y_train=y_train, y_valid=y_test)

In [None]:
# Predict on the stage-2 features and write the submission CSV.
make_submit(clf=clf, X_test=stg2_feats)