# Imports

In [17]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

import sklearn
from sklearn import metrics
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data

In [2]:
train = pd.read_csv("../data/processed/SMOTE_df.csv")
validation = pd.read_csv("../data/processed/validation.csv")
test = pd.read_csv("../data/processed/test.csv")

In [3]:
y_train = train['readmitted_rescaled']
X_train = train.drop('readmitted_rescaled', axis=1)

y_val = validation['readmitted_rescaled']
X_val = validation.drop('readmitted_rescaled', axis=1)

y_test = test['readmitted_rescaled']
X_test = test.drop('readmitted_rescaled', axis=1)

In [4]:
def rename_column_names(colname):
    colname = "_".join(colname.split('-'))
    colname = "_".join(colname.split(','))
    colname = "_".join(colname.split('<'))
    colname = "_".join(colname.split('/'))
    colname = "_".join(colname.split('.'))
    colname = "_".join(colname.split('['))
    colname = "_".join(colname.split(']'))
    colname = "".join(colname.split(')'))
    colname = "_".join(colname.split('('))
    return colname
  

In [5]:
new_train_colnames = list(map(lambda x: rename_column_names(x), X_train.columns.tolist()))
new_val_colnames = list(map(lambda x: rename_column_names(x), X_val.columns.tolist()))
new_test_colnames = list(map(lambda x: rename_column_names(x), X_test.columns.tolist()))

In [6]:
X_train.columns = new_train_colnames
X_val.columns = new_val_colnames
X_test.columns = new_test_colnames

# Base Model

In [7]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_val, label=y_val)

In [26]:
dtest = xgb.DMatrix(X_test, label=y_test)

In [8]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', 'eval_metric': 'auc'}
#param['eval_metric'] = "auc"

In [9]:
# specify validations set to watch performance
watchlist = [(dvalidation, 'eval'), (dtrain, 'train')]

In [10]:
num_round = 25
bst = xgb.train(param, dtrain, num_round, watchlist)

[0]	eval-auc:0.574939	train-auc:0.623693
[1]	eval-auc:0.59001	train-auc:0.670517
[2]	eval-auc:0.579831	train-auc:0.708186
[3]	eval-auc:0.553883	train-auc:0.736572
[4]	eval-auc:0.543521	train-auc:0.767972
[5]	eval-auc:0.562488	train-auc:0.77669
[6]	eval-auc:0.571813	train-auc:0.786016
[7]	eval-auc:0.561704	train-auc:0.813818
[8]	eval-auc:0.567376	train-auc:0.820668
[9]	eval-auc:0.574203	train-auc:0.842267
[10]	eval-auc:0.568558	train-auc:0.868621
[11]	eval-auc:0.570229	train-auc:0.887012
[12]	eval-auc:0.568799	train-auc:0.898338
[13]	eval-auc:0.563253	train-auc:0.901407
[14]	eval-auc:0.561046	train-auc:0.904827
[15]	eval-auc:0.562571	train-auc:0.907242
[16]	eval-auc:0.563237	train-auc:0.911668
[17]	eval-auc:0.561447	train-auc:0.914021
[18]	eval-auc:0.563128	train-auc:0.916966
[19]	eval-auc:0.562175	train-auc:0.919093
[20]	eval-auc:0.563764	train-auc:0.921542
[21]	eval-auc:0.563141	train-auc:0.923222
[22]	eval-auc:0.564054	train-auc:0.924796
[23]	eval-auc:0.569048	train-auc:0.926022
[24]

## Base model performance

In [105]:
# print("Base Model AUC: {:.2f}".format(
#                  bst.best_score,
#                  bst.best_iteration+1))


In [None]:
# Base model predictions

In [75]:
preds = bst.predict(dvalidation)
print(preds)
labels = dvalidation.get_label()
print(labels)

[0.14651231 0.29021257 0.53440356 ... 0.1798375  0.05700677 0.14533229]
[0. 1. 0. ... 0. 0. 0.]


In [82]:
print('error={:.2f}' .format(sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
       float(len(preds))))

error=0.13


In [87]:
bst.save_model('../models/0001.model')
# dump model
bst.dump_model('../models/0001_dump.raw.txt')
# dump model with feature map
bst.dump_model('../models/0001_dump.nice.txt')

# Hyperparameter Tuning

The first parameter we will look at is not part of the default params dictionary, but will be passed as a standalone argument to the training method. This parameter is called **"num_boost_round"** and corresponds to the number of boosting rounds or trees to build. Its optimal value highly depends on the other parameters, and thus it should be re-tuned each time you update a parameter.

Tuning it together with all parameters can be done in a grid-search, but it requires a lot of computational effort.

Fortunately XGBoost provides a nice way to find the best number of rounds whilst training. Since trees are built sequentially, instead of fixing the number of rounds at the beginning, we can test our model at each step and see if adding a new tree/round improves performance.

To do so, we define a validation dataset and a metric that is used to assess performance at each round. If performance haven’t improved for N rounds (N is defined by the variable **"early_stopping_round"**), we stop the training and keep the best number of boosting rounds. 

We will pass a **"num_boost_round"** which corresponds to the maximum number of boosting rounds that we allow. We set it to a large value hoping to find the optimal number of rounds before reaching it, if we haven't improved performance on our test dataset in early_stopping_round rounds

## Number Boost Round and Early Stopping Rounds

In [12]:
# Define a default parameters to begin with as a "param" dictionary
param = {'max_depth': 6, 
         'min_child_weight': 1,
         'eta': 0.3, 
         'subsample': 1,
         'colsample_bytree': 1,
         'objective': 'binary:logistic'}

In [13]:
# Add the evaluation metric to the param dictionary
param['eval_metric'] = "auc"

# set "num_boost_rounds" to a large number
num_boost_round = 999

In [98]:
model_001 = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=[(dvalidation, "Validation")], early_stopping_rounds=10)

[0]	Validation-auc:0.54458
Will train until Validation-auc hasn't improved in 10 rounds.
[1]	Validation-auc:0.552465
[2]	Validation-auc:0.554492
[3]	Validation-auc:0.558288
[4]	Validation-auc:0.567015
[5]	Validation-auc:0.573127
[6]	Validation-auc:0.57403
[7]	Validation-auc:0.573055
[8]	Validation-auc:0.573107
[9]	Validation-auc:0.575311
[10]	Validation-auc:0.576368
[11]	Validation-auc:0.578053
[12]	Validation-auc:0.575459
[13]	Validation-auc:0.579737
[14]	Validation-auc:0.57889
[15]	Validation-auc:0.579709
[16]	Validation-auc:0.580606
[17]	Validation-auc:0.582377
[18]	Validation-auc:0.584218
[19]	Validation-auc:0.585463
[20]	Validation-auc:0.584919
[21]	Validation-auc:0.586799
[22]	Validation-auc:0.587925
[23]	Validation-auc:0.589869
[24]	Validation-auc:0.589816
[25]	Validation-auc:0.591163
[26]	Validation-auc:0.588882
[27]	Validation-auc:0.586725
[28]	Validation-auc:0.584564
[29]	Validation-auc:0.585482
[30]	Validation-auc:0.584084
[31]	Validation-auc:0.583593
[32]	Validation-auc:0.5

In [99]:
print("Best AUC: {:.2f} with {} rounds".format(
                 model_001.best_score,
                 model_001.best_iteration+1))

Best AUC: 0.59 with 26 rounds


The training stopped before reaching the maximum number of boosting rounds (999), that’s because after the 26th tree, adding more rounds did not lead to improvements of AUC on the validation dataset.

The AUC is 0.59 with default parameters and an optimal number of boosting rounds, on the validation dataset.

# Using XGBoost’s CV

In order to tune the other hyperparameters, we will use the cv function from XGBoost. It allows us to run cross-validation on our training dataset and returns a mean AUC score.

We need to pass it:
* **params**: our dictionary of parameters.

* **our dtrain matrix**.

* **num_boost_round**: number of boosting rounds. We will use a large number again and count on **early_stopping_rounds** to find the optimal number of rounds before reaching the maximum.

* **seed**: random seed. It's important to set a seed here, to ensure we are using the same folds for each step so we can properly compare the scores with different parameters.

* **nfold**: the number of folds to use for cross-validation

* **metrics**: the metrics to use to evaluate our model, here we use **AUC**.

There is no need to pass a test dataset here. It’s because the cross-validation function is splitting the train dataset into nfolds and iteratively keeps one of the folds for test purposes. 

## Cross validation score with default parameters

In [107]:
cv_results = xgb.cv(param,
                    dtrain,
                    num_boost_round=num_boost_round,
                    seed=42,
                    nfold=5,
                    metrics={"auc"},
                    early_stopping_rounds=10)
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.623693,0.000864,0.623666,0.003424
1,0.670518,0.000914,0.670510,0.003660
2,0.708491,0.000616,0.708416,0.004882
3,0.737613,0.000677,0.737446,0.007267
4,0.768219,0.000864,0.768418,0.005216
...,...,...,...,...
125,0.959062,0.002151,0.955588,0.001537
126,0.959073,0.002123,0.955649,0.001500
127,0.959115,0.002123,0.955695,0.001516
128,0.959163,0.002164,0.955745,0.001510


In [111]:
print("Test-AUC-Mean {}".format(cv_results["test-auc-mean"].max()))

Test-AUC-Mean 0.9557624


## max_depth and min_child_weight

These parameters add constraints on the architecture of the trees and prevent overfitting. Tuning these parameters find a good trade-off between model bias and variance.

**max_depth** - is the maximum number of nodes allowed from the root to the farthest leaf of a tree. Deeper trees can model more complex relationships by adding more nodes, but as we go deeper, splits become less relevant and are sometimes only due to noise, causing the model to overfit.

**min_child_weight** -  is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees, but again, more likely to overfit.

In [120]:
# Define a grid for these two parameters
param_grid = [(max_depth, min_child_weight)
              for max_depth in range(9, 12) for min_child_weight in range(5,8)]

In [123]:
# Define initial best params and AUC
max_auc = float("-Inf")
best_params = None

for max_depth, min_child_weight in tqdm(param_grid):
    print("CV with max_depth = {} and min_child_weight = {}".format(
        max_depth, min_child_weight))
    
    # Update our parameters
    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight
    
    # Run CV
    cv_results = xgb.cv(param,
                        dtrain,
                        num_boost_round=num_boost_round,
                        seed=42,
                        nfold=3,
                        metrics={"auc"},
                        early_stopping_rounds=10,
                        as_pandas=True)
    # Update best MAE
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth, min_child_weight)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

  0%|          | 0/9 [00:00<?, ?it/s]

CV with max_depth = 9 and min_child_weight = 5


 11%|█         | 1/9 [03:54<31:15, 234.38s/it]

	AUC 0.9599016666666667 for 29 rounds
CV with max_depth = 9 and min_child_weight = 6


 22%|██▏       | 2/9 [08:05<27:54, 239.25s/it]

	AUC 0.959372 for 35 rounds
CV with max_depth = 9 and min_child_weight = 7


 33%|███▎      | 3/9 [12:14<24:14, 242.45s/it]

	AUC 0.9602056666666666 for 32 rounds
CV with max_depth = 10 and min_child_weight = 5


 44%|████▍     | 4/9 [16:31<20:32, 246.56s/it]

	AUC 0.960337 for 28 rounds
CV with max_depth = 10 and min_child_weight = 6


 56%|█████▌    | 5/9 [21:30<17:29, 262.50s/it]

	AUC 0.9589059999999999 for 32 rounds
CV with max_depth = 10 and min_child_weight = 7


 67%|██████▋   | 6/9 [25:18<12:36, 252.01s/it]

	AUC 0.9600673333333334 for 25 rounds
CV with max_depth = 11 and min_child_weight = 5


 78%|███████▊  | 7/9 [29:48<08:34, 257.45s/it]

	AUC 0.9604739999999999 for 31 rounds
CV with max_depth = 11 and min_child_weight = 6


 89%|████████▉ | 8/9 [32:54<03:55, 235.96s/it]

	AUC 0.9604316666666666 for 18 rounds
CV with max_depth = 11 and min_child_weight = 7


100%|██████████| 9/9 [36:29<00:00, 243.29s/it]

	AUC 0.960932 for 23 rounds
Best params: 11, 7, AUC: 0.960932





We get the best score with a max_depth of 11 and min_child_weight of 7, so let's update our param dict:

In [14]:
param['max_depth'] = 11
param['min_child_weight'] = 7

## Subsample and Colsample_bytrees

These parameters control the sampling of the dataset that is done at each boosting round.

Instead of using the whole training set every time, we can build a tree on slightly different data at each step, which makes it less likely to overfit to a single sample or feature.

**subsample** corresponds to the fraction of observations (the rows) to subsample at each step. By default it is set to 1 meaning that we use all rows.
**colsample_bytree** corresponds to the fraction of features (the columns) to use. By default it is set to 1 meaning that we will use all features.

In [129]:
[i/10. for i in range(7, 11)]

[0.7, 0.8, 0.9, 1.0]

In [130]:
param_grid = [(subsample, colsample)
              for subsample in [i/10. for i in range(7, 11)]
              for colsample in [i/10. for i in range(7, 11)]]

In [131]:
# Define initial best params and AUC
max_auc = float("-Inf")
best_params = None


# iterate over the sample space
for subsample, colsample in tqdm(param_grid ):
    print("CV with subsample = {} and colsample = {}".format(
        subsample, colsample))
    
    # Update our parameters
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample
    
    # Run CV
    cv_results = xgb.cv(param,
                        dtrain,
                        num_boost_round=num_boost_round,
                        seed=42,
                        nfold=3,
                        metrics={"auc"},
                        early_stopping_rounds=10,
                        as_pandas=True)
    # Update best MAE
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (subsample, colsample)
print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

  0%|          | 0/16 [00:00<?, ?it/s]

CV with subsample = 0.7 and colsample = 0.7


  6%|▋         | 1/16 [09:12<2:18:09, 552.62s/it]

	AUC 0.9581206666666667 for 31 rounds
CV with subsample = 0.7 and colsample = 0.8


 12%|█▎        | 2/16 [14:31<1:52:33, 482.41s/it]

	AUC 0.9592156666666667 for 32 rounds
CV with subsample = 0.7 and colsample = 0.9


 19%|█▉        | 3/16 [18:14<1:27:41, 404.69s/it]

	AUC 0.9589196666666666 for 17 rounds
CV with subsample = 0.7 and colsample = 1.0


 25%|██▌       | 4/16 [22:02<1:10:18, 351.52s/it]

	AUC 0.9596766666666667 for 18 rounds
CV with subsample = 0.8 and colsample = 0.7


 31%|███▏      | 5/16 [26:46<1:00:44, 331.32s/it]

	AUC 0.959078 for 27 rounds
CV with subsample = 0.8 and colsample = 0.8


 38%|███▊      | 6/16 [32:03<54:30, 327.06s/it]  

	AUC 0.9593223333333333 for 32 rounds
CV with subsample = 0.8 and colsample = 0.9


 44%|████▍     | 7/16 [35:45<44:21, 295.74s/it]

	AUC 0.959512 for 18 rounds
CV with subsample = 0.8 and colsample = 1.0


 50%|█████     | 8/16 [39:49<37:21, 280.22s/it]

	AUC 0.9599773333333333 for 20 rounds
CV with subsample = 0.9 and colsample = 0.7


 56%|█████▋    | 9/16 [43:08<29:50, 255.78s/it]

	AUC 0.9595546666666667 for 25 rounds
CV with subsample = 0.9 and colsample = 0.8


 62%|██████▎   | 10/16 [46:47<24:28, 244.76s/it]

	AUC 0.959197 for 25 rounds
CV with subsample = 0.9 and colsample = 0.9


 69%|██████▉   | 11/16 [50:23<19:39, 235.97s/it]

	AUC 0.9601259999999999 for 22 rounds
CV with subsample = 0.9 and colsample = 1.0


 75%|███████▌  | 12/16 [56:26<18:16, 274.11s/it]

	AUC 0.9605743333333333 for 26 rounds
CV with subsample = 1.0 and colsample = 0.7


 81%|████████▏ | 13/16 [1:00:18<13:04, 261.65s/it]

	AUC 0.9599286666666668 for 28 rounds
CV with subsample = 1.0 and colsample = 0.8


 88%|████████▊ | 14/16 [1:03:37<08:05, 242.63s/it]

	AUC 0.9595053333333334 for 24 rounds
CV with subsample = 1.0 and colsample = 0.9


 94%|█████████▍| 15/16 [1:07:14<03:54, 235.00s/it]

	AUC 0.959482 for 20 rounds
CV with subsample = 1.0 and colsample = 1.0


100%|██████████| 16/16 [1:11:05<00:00, 266.62s/it]

	AUC 0.960932 for 23 rounds
Best params: 1.0, 1.0, AUC: 0.960932





In [15]:
param['subsample'] = 1.
param['colsample_bytree'] = 1.

## ETA

The ETA parameter controls the learning rate. It corresponds to the shrinkage of the weights associated to features after each round, in other words it defines the amount of "correction" we make at each step
In practice, having a lower eta makes our model more robust to overfitting thus, usually, the lower the learning rate, the best. But with a lower eta, we need more boosting rounds, which takes more time to train, sometimes for only marginal improvements. Let's try a couple of values here, and time them with the notebook command:bbb

In [18]:
# Define initial best params and AUC
max_auc = float("-Inf")
best_params = None


for eta in tqdm([.3, .2, .1, .05, .01, .005]):
    print("CV with eta={}".format(eta))
    # We update our parameters
    param['eta'] = eta
    # Run and time CV
    %time 
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={"auc"},
        early_stopping_rounds=10
    )
    # Update best score
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print("\tAUC {} for {} rounds\n".format(mean_auc, boost_rounds))
    if mean_auc < max_auc:
        max_auc = mean_auc
        best_params = eta
print("Best params: {}, AUC: {}".format(best_params, max_auc))

  0%|          | 0/6 [00:00<?, ?it/s]

CV with eta=0.3
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


 17%|█▋        | 1/6 [10:29<52:26, 629.36s/it]

	AUC 0.9644202 for 43 rounds

CV with eta=0.2
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


 33%|███▎      | 2/6 [2:25:46<3:11:42, 2875.72s/it]

	AUC 0.9649552 for 58 rounds

CV with eta=0.1
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


 50%|█████     | 3/6 [3:47:39<2:54:20, 3486.82s/it]

	AUC 0.9655692 for 109 rounds

CV with eta=0.05
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


 67%|██████▋   | 4/6 [5:32:17<2:24:08, 4324.10s/it]

	AUC 0.9653751999999999 for 184 rounds

CV with eta=0.01
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


 83%|████████▎ | 5/6 [10:03:13<2:11:43, 7903.95s/it]

	AUC 0.96549 for 996 rounds

CV with eta=0.005
CPU times: user 4 µs, sys: 7 µs, total: 11 µs
Wall time: 29.8 µs


100%|██████████| 6/6 [25:32:58<00:00, 15329.70s/it] 

	AUC 0.9642986 for 998 rounds

Best params: None, AUC: -inf





In [19]:
param['eta'] = 0.1

## Tuning Results Hyperparameters

In [20]:
param

{'max_depth': 11,
 'min_child_weight': 7,
 'eta': 0.1,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'objective': 'binary:logistic',
 'eval_metric': 'auc'}

# Final Model

In [23]:
best_model = xgb.train(
    param,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dvalidation, "Validation")],
    early_stopping_rounds=20)

[0]	Validation-auc:0.553303
Will train until Validation-auc hasn't improved in 20 rounds.
[1]	Validation-auc:0.557079
[2]	Validation-auc:0.559656
[3]	Validation-auc:0.562852
[4]	Validation-auc:0.562811
[5]	Validation-auc:0.562513
[6]	Validation-auc:0.562838
[7]	Validation-auc:0.564833
[8]	Validation-auc:0.564798
[9]	Validation-auc:0.567778
[10]	Validation-auc:0.566211
[11]	Validation-auc:0.566508
[12]	Validation-auc:0.567766
[13]	Validation-auc:0.569518
[14]	Validation-auc:0.570784
[15]	Validation-auc:0.571355
[16]	Validation-auc:0.572737
[17]	Validation-auc:0.574034
[18]	Validation-auc:0.574788
[19]	Validation-auc:0.575801
[20]	Validation-auc:0.576442
[21]	Validation-auc:0.5771
[22]	Validation-auc:0.577161
[23]	Validation-auc:0.577532
[24]	Validation-auc:0.577915
[25]	Validation-auc:0.578445
[26]	Validation-auc:0.577873
[27]	Validation-auc:0.58054
[28]	Validation-auc:0.581033
[29]	Validation-auc:0.580472
[30]	Validation-auc:0.580667
[31]	Validation-auc:0.580733
[32]	Validation-auc:0.5

In [24]:
best_model.save_model("../models/best_model.model")

In [27]:
loaded_model = xgb.Booster()
loaded_model.load_model("../models/best_model.model")
# And use it for predictions.
loaded_model.predict(dtest)

array([0.08488293, 0.08948976, 0.06540319, ..., 0.06093495, 0.32176277,
       0.21019623], dtype=float32)

In [2]:
params = {
          "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
          "n_estimators"     : [5, 10, 25, 50],
          "max_depth"        : [3, 4, 5, 6, 8, 10, 12, 15],
          "min_child_weight" : [1, 3, 5, 7],
          "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
          "subsample"        : [i/100.0 for i in range(75,90,5)],
          "colsample_bytree" : [0.3, 0.4, 0.5 , 0.7]
         }

In [49]:
# Create the XGB classifier, xgb_model.
xgb_model = XGBClassifier()
# List the default parameters.
print(xgb_model.get_xgb_params())

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'nthread': 1, 'objective': 'binary:logistic', 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': 0, 'subsample': 1, 'verbosity': 1}


In [51]:
xgb_model.fit(X_train, y_train, eval_metric=['error', 'auc'], eval_set=[((X_val, y_val))])

[0]	validation_0-error:0.101686	validation_0-auc:0.528765
[1]	validation_0-error:0.101686	validation_0-auc:0.528765
[2]	validation_0-error:0.101686	validation_0-auc:0.541381
[3]	validation_0-error:0.101686	validation_0-auc:0.552055
[4]	validation_0-error:0.101686	validation_0-auc:0.542922
[5]	validation_0-error:0.101686	validation_0-auc:0.551868
[6]	validation_0-error:0.101686	validation_0-auc:0.558648
[7]	validation_0-error:0.101686	validation_0-auc:0.565723
[8]	validation_0-error:0.101686	validation_0-auc:0.56875
[9]	validation_0-error:0.215173	validation_0-auc:0.56832
[10]	validation_0-error:0.208163	validation_0-auc:0.571801
[11]	validation_0-error:0.208252	validation_0-auc:0.572184
[12]	validation_0-error:0.208252	validation_0-auc:0.571489
[13]	validation_0-error:0.198846	validation_0-auc:0.574799
[14]	validation_0-error:0.207897	validation_0-auc:0.574451
[15]	validation_0-error:0.200887	validation_0-auc:0.572805
[16]	validation_0-error:0.200621	validation_0-auc:0.575015
[17]	vali

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [55]:
y_pred = xgb_model.predict(X_test)  #predicts
print('confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('classification report')
print(metrics.classification_report(y_test, y_pred))
print("-----------------------------------------------------------------------------------------")
print("Accuracy is :")
print(metrics.accuracy_score(y_test, y_pred))
print('Area under the curve : %f' % (metrics.roc_auc_score(y_test, y_pred)))

confusion matrix
[[12059   738]
 [ 1141   149]]
classification report
              precision    recall  f1-score   support

           0       0.91      0.94      0.93     12797
           1       0.17      0.12      0.14      1290

    accuracy                           0.87     14087
   macro avg       0.54      0.53      0.53     14087
weighted avg       0.85      0.87      0.86     14087

-----------------------------------------------------------------------------------------
Accuracy is :
0.8666146092141691
Area under the curve : 0.528917


In [22]:
X_train.columns.tolist()


['number_diagnoses_clip',
 'number_outpatient_log1p',
 'number_emergency_log1p',
 'number_inpatient_log1p',
 'num_procedures_log1p',
 'num_medications_log1p',
 'num_lab_procedures_log1p',
 'norm_time_in_hospital',
 'race_AfricanAmerican',
 'race_Asian',
 'race_Caucasian',
 'race_Hispanic',
 'race_Other',
 'gender_Male',
 'gender_Unknown/Invalid',
 'age_[10-20)',
 'age_[20-30)',
 'age_[30-40)',
 'age_[40-50)',
 'age_[50-60)',
 'age_[60-70)',
 'age_[70-80)',
 'age_[80-90)',
 'age_[90-100)',
 'metformin_No',
 'metformin_Steady',
 'metformin_Up',
 'repaglinide_No',
 'repaglinide_Steady',
 'repaglinide_Up',
 'nateglinide_No',
 'nateglinide_Steady',
 'nateglinide_Up',
 'chlorpropamide_No',
 'chlorpropamide_Steady',
 'chlorpropamide_Up',
 'glimepiride_No',
 'glimepiride_Steady',
 'glimepiride_Up',
 'glipizide_No',
 'glipizide_Steady',
 'glipizide_Up',
 'glyburide_No',
 'glyburide_Steady',
 'glyburide_Up',
 'tolbutamide_Steady',
 'pioglitazone_No',
 'pioglitazone_Steady',
 'pioglitazone_Up',
 

In [24]:
X_train.columns.str.contains('age').split("_")

AttributeError: 'numpy.ndarray' object has no attribute 'split'

In [53]:
def create_xgb_classifier(alg,
                          X_train,
                          X_test,
                          y_train,
                          y_test,
                          useTrainCV=True,
                          cv_folds=5,
                          early_stopping_rounds=50):
    if useTrainCV:
        alg_ = alg(objective='binary:logistic',
                   scale_pos_weight=1,
                   scoring='roc_auc')
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        RnS_ = RandomizedSearchCV(estimator=alg_,
                                  param_distributions=params,
                                  n_iter=30,
                                  scoring='auc',
                                  n_jobs=None,
                                  refit=True,
                                  cv=cv_folds,
                                  verbose=0,
                                  random_state=121,
                                  return_train_score=True)
        #fit
        RnS_.fit(X_train, y_train)

        # Evaluate on training data
        # Create an array of predictions directly using the best_estimator_ property
        trn_predictions = RnS_.best_estimator_.predict(xgtrain)
        trn_pred_proba = RnS_.best_estimator_.predict_proba(xgtrain)[:, 1]

        #Print Model report:
        print("\nModel Report")
        print("Accuracy : {0:.4f}".format(
            metrics.accuracy_score(y_train, trn_predictions)))
        print("AUC Score : {0:.4f}".format(
            metrics.roc_auc_score(y_train, trn_predictions)))
        print("Precision Score : {0:.4f}".format(
            metrics.precision_score(y_train, trn_predictions)))
        print("Recall Score : {0:.4f}".format(
            metrics.recall_score(y_train, trn_predictions)))

        # Evaluate on testing data
        test_predictions = RnS_.best_estimator_.predict(xgtest)
        test_pred_proba = RnS_.best_estimator_.predict_proba(xgtest)[:, 1]

        print("\nModel Report")
        print("Accuracy : {0:.4f}".format(
            metrics.accuracy_score(y_test, test_predictions)))
        print("AUC Score : {0:.4f}".format(
            metrics.roc_auc_score(y_test, test_predictions)))
        print("Precision Score : {0:.4f}".format(
            metrics.precision_score(y_test, test_predictions)))
        print("Recall Score : {0:.4f}".format(
            metrics.recall_score(y_test, test_predictions)))

        res = {}
        res['best_estimator'] = RnS_.best_estimator_
        res['best_results'] = RnS_.cv_results_
        res['best_params'] = RnS_.best_params_
        res['best_score'] = RnS_.best_score_

        return res

In [54]:
model_01 = create_xgb_classifier(XGBClassifier, X_train, X_test, features_, target_ )

NameError: name 'features_' is not defined