# **Humana Competition**

### XGBoost

In [59]:
from xgboost import XGBClassifier

weight = sum(y_train.values == 0) / sum(y_train.values != 0)
# fit model to training data
xgb_model = XGBClassifier(random_state = 0, scale_pos_weight = weight)
xgb_model.fit(X_train, y_train)

In [60]:
y_scores2 = xgb_model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores2) # if including all features

0.6666632982500618

## Tune Hyperparameters

In [62]:
from hyperopt import hp, fmin, Trials, tpe, STATUS_OK

In [84]:
space={'max_depth': hp.uniform("max_depth", 3, 10),
        'min_child_weight' : hp.uniform('min_child_weight', 1, 6)
    }

In [85]:
import sys
def objective(space):
    clf=XGBClassifier(
                    max_depth = int(space['max_depth']), min_child_weight=int(space['min_child_weight']),
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

best_hyperparams

In [91]:
best_hyperparams

{'max_depth': 3.4376209691740693, 'min_child_weight': 5.789854696000388}

In [93]:
xgb2 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79),scale_pos_weight = weight, seed = 0)
xgb2.fit(X_train, y_train)
y_scores3 = xgb2.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores3) # if including all features

0.7122744346557359

### hp 2： tune gamma

In [94]:
space={'gamma': hp.uniform("gamma", 0.1, 0.3)}

def objective2(space):
    clf=XGBClassifier(
                    max_depth = int(3.44), min_child_weight=int(5.79), gamma = space['gamma'],
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams2 = fmin(fn = objective2,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

best_hyperparams2

In [96]:
best_hyperparams2
# {'gamma': 0.1542296702055435}

{'gamma': 0.1542296702055435}

In [98]:
xgb3 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542296702055435, scale_pos_weight = weight, seed = 0)
xgb3.fit(X_train, y_train)
y_scores4 = xgb3.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores4) # does not change much

0.7122744346557359

### hp3: tune subsample

In [108]:
space={'subsample': hp.uniform("subsample", 0.6, 1)}

def objective3(space):
    clf=XGBClassifier(
                    max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542,
                    subsample = space['subsample'],
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams3 = fmin(fn = objective3,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

best_hyperparams3

In [109]:
xgb4 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542296702055435, 
                    subsample = 0.8932,
                    scale_pos_weight = weight, seed = 0)
xgb4.fit(X_train, y_train)
y_scores5 = xgb3.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores5) # does not change much

0.7122744346557359

### hp4: tune colsample_bytree

In [111]:
space={'colsample_bytree': hp.uniform("colsample_bytree", 0.6, 1)}

def objective4(space):
    clf=XGBClassifier(
                    max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542, 
                    subsample = 0.8932,
                    colsample_bytree = space['colsample_bytree'],
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams4 = fmin(fn = objective4,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 20,
                        trials = trials)

best_hyperparams4

In [118]:
xgb5 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542296702055435, 
                    subsample = 0.8932,  colsample_bytree = 0.9987,
                    scale_pos_weight = weight, seed = 0)
xgb5.fit(X_train, y_train)
y_scores6 = xgb5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores6) # does not change much

0.7013623816958193

### hp5: tune regulization

In [121]:
space={'reg_alpha': hp.uniform("reg_alpha", 0.00001, 100)}

def objective5(space):
    clf=XGBClassifier(
                    max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542, 
                    subsample = 0.8932,
                    colsample_bytree = 0.9987,
                    reg_alpha = space['reg_alpha'],
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams5 = fmin(fn = objective5,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

best_hyperparams5

In [127]:
#27.47506239406208

xgb6 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542, 
                    subsample = 0.8932,  colsample_bytree = 0.9987, reg_alpha = 27.4751,
                    scale_pos_weight = weight)
xgb6.fit(X_train, y_train)
y_scores7 = xgb5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores7) # does not change much

0.7013623816958193

### hp6: learning rate

In [129]:
space={'eta': hp.uniform("eta", 0.1, 1)}

def objective6(space):
    clf=XGBClassifier(
                    max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542, 
                    subsample = 0.8932,
                    colsample_bytree = 0.9987, reg_alpha = 27.4751,
                    eta = space['eta'],
                    scale_pos_weight = weight, seed = 0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict_proba(X_test)[:, 1]
    acc = roc_auc_score(y_test, pred)
    # print('AUC:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}

In [131]:
trials = Trials()

best_hyperparams6 = fmin(fn = objective6,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 20,
                        trials = trials)

best_hyperparams6

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]





  5%|▌         | 1/20 [00:07<02:15,  7.11s/trial, best loss: -0.7313330974902763]





 10%|█         | 2/20 [00:10<01:26,  4.78s/trial, best loss: -0.7313330974902763]





 15%|█▌        | 3/20 [00:12<01:04,  3.82s/trial, best loss: -0.7313330974902763]





 20%|██        | 4/20 [00:15<00:52,  3.31s/trial, best loss: -0.7313330974902763]





 25%|██▌       | 5/20 [00:20<00:57,  3.85s/trial, best loss: -0.7313330974902763]





 30%|███       | 6/20 [00:22<00:45,  3.24s/trial, best loss: -0.7313330974902763]





 35%|███▌      | 7/20 [00:25<00:40,  3.11s/trial, best loss: -0.7313330974902763]





 40%|████      | 8/20 [00:27<00:35,  2.96s/trial, best loss: -0.7313330974902763]





 45%|████▌     | 9/20 [00:29<00:28,  2.61s/trial, best loss: -0.7313330974902763]





 50%|█████     | 10/20 [00:32<00:28,  2.83s/trial, best loss: -0.7313330974902763]





 55%|█████▌    | 11/20 [00:36<00:27,  3.04s/trial, best loss: -0.7313330974902763]





 60%|██████    | 12/20 [00:41<00:28,  3.60s/trial, best loss: -0.7313330974902763]





 65%|██████▌   | 13/20 [00:46<00:28,  4.03s/trial, best loss: -0.7313330974902763]





 70%|███████   | 14/20 [00:48<00:20,  3.42s/trial, best loss: -0.7313330974902763]





 75%|███████▌  | 15/20 [00:50<00:15,  3.10s/trial, best loss: -0.7313330974902763]





 80%|████████  | 16/20 [00:53<00:11,  2.96s/trial, best loss: -0.7313330974902763]





 85%|████████▌ | 17/20 [00:56<00:09,  3.07s/trial, best loss: -0.7313330974902763]





 90%|█████████ | 18/20 [01:01<00:07,  3.58s/trial, best loss: -0.7313330974902763]





 95%|█████████▌| 19/20 [01:04<00:03,  3.44s/trial, best loss: -0.7313330974902763]





100%|██████████| 20/20 [01:07<00:00,  3.39s/trial, best loss: -0.7313330974902763]


{'eta': 0.15799590705766733}

In [133]:
xgb7 = XGBClassifier(max_depth = int(3.44), min_child_weight=int(5.79), gamma = 0.1542, 
                    subsample = 0.8932,  colsample_bytree = 0.9987, reg_alpha = 27.4751,
                    eta = 0.1580, scale_pos_weight = weight, seed = 0)
xgb7.fit(X_train, y_train)
y_scores8 = xgb5.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_scores8) # the score is higher if using google colab (see the other file)

0.7013623816958193