# XGBoost (Extreme Gradient Boosting)

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load seaborn's 'titanic' example dataset and preview the first rows.
dataset_name = 'titanic'
df = sns.load_dataset(dataset_name)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Discard every row that has a missing value in any column.
# NOTE(review): the preview above shows `deck` empty for rows 0, 2 and 4, and
# those rows are gone from X.head() further down; the train/test supports
# (127 + 55) show only 182 rows survive. The model uses just pclass, sex, age
# and survived, so df.dropna(subset=[...]) on those columns would keep far
# more data -- not changed here because it would alter every recorded output.
df = df.dropna()

## Data Pre-Processing

In [4]:
# Feature matrix: passenger class, sex and age.
# NOTE(review): the next cell assigns X from the same three columns again,
# so this cell is redundant and can be removed.
X = df[['pclass', 'sex', 'age']]

In [5]:
# Select the model features as an explicit, independent copy of df's columns.
# The following cell overwrites X['sex'] in place; without .copy() pandas
# treats X as derived from df and raises SettingWithCopyWarning (silenced in
# this notebook by warnings.filterwarnings('ignore')).
X = df[['pclass', 'sex', 'age']].copy()

# Binarizer used by the following cell to turn the string 'sex' column into integers.
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [6]:
# Replace the string 'sex' column with its binarized encoding; the X.head()
# output further down shows the column as 0/1 integers (row 1, a female
# passenger in the raw preview, is encoded as 0).
sex_encoded = lb.fit_transform(X['sex'])
X['sex'] = sex_encoded

In [7]:
# Target vector: the 'survived' column (0 alongside alive == 'no' and
# 1 alongside alive == 'yes' in the raw preview).
y = df['survived']

In [8]:
# Preview the feature matrix after encoding 'sex'.
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [9]:
# Preview the target labels; the index matches X.head() above.
y.head()

1     1
3     1
6     0
10    1
11    1
Name: survived, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All yields the same split, and therefore the same metrics,
# on every run. (The outputs recorded in this notebook came from an unseeded
# split and will differ from a fresh run.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True, cv=10):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Parameters
    ----------
    clf : object
        Already-fitted classifier exposing ``predict``; it is also handed to
        ``cross_val_score`` when ``train`` is truthy.
    X_train, y_train
        Training features and labels. Evaluated when ``train`` is truthy.
    X_test, y_test
        Held-out features and labels. Evaluated when ``train`` is falsy.
    train : bool, default True
        Truthy: report on the training data and additionally print the mean
        and standard deviation of cross-validated accuracy on that data.
        Falsy: report on the test data only.
    cv : int, default 10
        Forwarded as ``cv`` to ``cross_val_score``; only used when ``train``
        is truthy.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Pick the split to evaluate. A plain ``else`` is used so that any falsy
    # ``train`` value selects the test report instead of printing nothing.
    if train:
        header, X_eval, y_eval = "Train Results:\n", X_train, y_train
    else:
        header, X_eval, y_eval = "Test Results:\n", X_test, y_test

    # Predict once and reuse the result for all three metrics.
    y_pred = clf.predict(X_eval)

    print(header)
    print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_eval, y_pred)))
    print("Classification Report: \n {} \n".format(classification_report(y_eval, y_pred)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_eval, y_pred)))

    if train:
        # Cross-validated accuracy on the training data only.
        res = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

## XGBoost

In [15]:
import xgboost as xgb

In [21]:
# Hyper-parameters for the gradient-boosted tree classifier.
# NOTE(review): with these settings the recorded outputs show 0.93 training
# accuracy against 0.68 mean 10-fold CV accuracy, i.e. the model overfits the
# 127 training rows; fewer trees and/or a smaller learning rate are worth trying.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 10000,
    'learning_rate': 0.3,
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

In [22]:
# Train on the training split; the value returned by fit() is the cell's
# displayed output (the estimator repr shown below).
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [23]:
# Training-set report, followed by cross-validated accuracy on the training data.
print_score(
    clf=xgb_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train=True,
)

Train Results:

Accuracy Score: 0.9291

Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.83      0.89        42
           1       0.92      0.98      0.95        85

   micro avg       0.93      0.93      0.93       127
   macro avg       0.93      0.90      0.92       127
weighted avg       0.93      0.93      0.93       127
 

Confusion Matrix: 
 [[35  7]
 [ 2 83]] 

Average Accuracy: 	 0.6779
Accuracy SD: 		 0.0765


In [24]:
# Held-out test-set report (no cross-validation in this mode).
print_score(
    clf=xgb_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    train=False,
)

Test Results:

Accuracy Score: 0.8364

Classification Report: 
               precision    recall  f1-score   support

           0       0.72      0.76      0.74        17
           1       0.89      0.87      0.88        38

   micro avg       0.84      0.84      0.84        55
   macro avg       0.81      0.82      0.81        55
weighted avg       0.84      0.84      0.84        55
 

Confusion Matrix: 
 [[13  4]
 [ 5 33]] 

