# XGBoost Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

In [2]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
def NN_pred(yhat):
    if yhat >= 0.5:
        return 1
    else:
        return 0

In [6]:
def eval_err(y, yhat):
    m = y.shape[0]
    incorrect = 0
    y = y.tolist()
    for i in range(m):
        if yhat[i] != y[i]:
            incorrect += 1
            
    incorrect = incorrect / m
    
    return incorrect 

In [7]:
def pred_output(prediction):
    ex = prediction.shape[0]
    output = []
    for i in range(ex):
        output.append(NN_pred(prediction[i]))
    
    return output

In [8]:
def sigmoid(z):

    calc = math.e**-z
    g = 1 / (1 + calc)

    return g

### Training Set 1 as set

In [2]:
train_set = pd.read_csv("../TrainTestSet/TrainSet1.csv")
train_set.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S,AgeFare,SibPar
0,0,3,1,22.0,1,0,7.25,0,1,159.5,1
1,1,1,0,38.0,1,0,71.2833,0,0,2708.7654,1
2,1,3,0,26.0,0,0,7.925,0,1,206.05,0
3,1,1,0,35.0,1,0,53.1,0,1,1858.5,1
4,0,3,1,35.0,0,0,8.05,0,1,281.75,0


### Creating x_train, y_train set

In [10]:
y = train_set['Survived']
x = train_set.drop(['Survived'], axis=1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [12]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

#### Scaling dataset

In [13]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.fit_transform(X_test, y_test)

### Creating and fitting the model on the training set 1

In [14]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
pred_xgb_model = xgb_model.predict(X_test)
pred_xgb_model_train = xgb_model.predict(X_train)

### Model train set 1 Results

In [15]:
print('Test Accuracy: %f'%(np.mean(pred_xgb_model == y_test) * 100))
print('Train Accuracy: %f'%(np.mean(pred_xgb_model_train == y_train) * 100))

Test Accuracy: 78.089888
Train Accuracy: 98.171589


In [16]:
error_test = eval_err(y_test, pred_xgb_model)
error_train = eval_err(y_train, pred_xgb_model_train)
print(f"error test:  {error_test :0.3f}")
print(f"error train: {error_train :0.3f}")

error test:  0.219
error train: 0.018


In [17]:
print(classification_report(y_test, pred_xgb_model))
print(confusion_matrix(y_test, pred_xgb_model))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       107
           1       0.74      0.70      0.72        71

    accuracy                           0.78       178
   macro avg       0.77      0.77      0.77       178
weighted avg       0.78      0.78      0.78       178

[[89 18]
 [21 50]]


### Using GridSearch to improve model (currently overfitting)

In [18]:
xgb_model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

### Parameters Grid

In [19]:
param_grid = {
    "max_depth": [3, 6, 9],
    "learning_rate": [0.1, 0.3, 0.5, 1],
    "gamma": [0, 0.01, 0.05],
    "reg_lambda": [0.5, 1, 5, 10],
    "scale_pos_weight": [0.5, 1, 5],
    "subsample": [0.8, 1, 1.2],
    "colsample_bytree": [0.5, 1, 2]
}

In [20]:
grid_cv = GridSearchCV(xgb_model, param_grid, n_jobs=-1, cv=3, scoring='roc_auc')

### Fitting data to parameterized model

In [21]:
grid_cv.fit(X_train, y_train)



GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012, max_bin=256,
                                     max_ca...
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, predictor='auto',
                                     rand

In [22]:
grid_cv.best_score_

0.8663460257628263

In [23]:
grid_cv.best_params_

{'colsample_bytree': 1,
 'gamma': 0.01,
 'learning_rate': 0.1,
 'max_depth': 9,
 'reg_lambda': 10,
 'scale_pos_weight': 0.5,
 'subsample': 0.8}

### Fitting model with new parameters

In [24]:
final_model_XGB = xgb.XGBClassifier(**grid_cv.best_params_, objective="binary:logistic")

In [25]:
final_model_XGB.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.01, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=10, ...)

In [26]:
pred_xg_test = final_model_XGB.predict(X_test)
pred_xg_train = final_model_XGB.predict(X_train)

## XGBoost model Results

In [27]:
error_test = eval_err(y_test, pred_xg_test)
error_train = eval_err(y_train, pred_xg_train)
print(f"error test:  {error_test :0.3f}")
print(f"error train: {error_train :0.3f}")

error test:  0.174
error train: 0.142


In [41]:
print('Train Accuracy: %f'%(np.mean(pred_xg_train == y_train) * 100))
print('Test Accuracy: %f'%(np.mean(pred_xg_test == y_test) * 100))

Train Accuracy: 85.794655
Test Accuracy: 82.584270


In [42]:
print(classification_report(y_test, pred_xg_test))
print(confusion_matrix(y_test, pred_xg_test))

              precision    recall  f1-score   support

           0       0.79      0.97      0.87       107
           1       0.93      0.61      0.74        71

    accuracy                           0.83       178
   macro avg       0.86      0.79      0.80       178
weighted avg       0.85      0.83      0.82       178

[[104   3]
 [ 28  43]]
