# Gradient Boosting / Gradient Boosting Machine (GBM)

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df.dropna(inplace=True)

## Data Pre-Processing

In [4]:
X = df[['pclass', 'sex', 'age']]

In [5]:
X = df[['pclass', 'sex', 'age']]
X = df[['pclass', 'sex', 'age']]
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [6]:
X['sex'] = lb.fit_transform(X['sex'])

In [7]:
y=df['survived']

In [8]:
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [9]:
y.head()

1     1
3     1
6     0
10    1
11    1
Name: survived, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {} \n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {} \n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

## Gradient Boosting

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
gbc_clf = GradientBoostingClassifier()
gbc_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [17]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9528

Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.87      0.92        38
           1       0.95      0.99      0.97        89

   micro avg       0.95      0.95      0.95       127
   macro avg       0.96      0.93      0.94       127
weighted avg       0.95      0.95      0.95       127
 

Confusion Matrix: 
 [[33  5]
 [ 1 88]] 

Average Accuracy: 	 0.8050
Accuracy SD: 		 0.0681


In [18]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.7818

Classification Report: 
               precision    recall  f1-score   support

           0       0.76      0.62      0.68        21
           1       0.79      0.88      0.83        34

   micro avg       0.78      0.78      0.78        55
   macro avg       0.78      0.75      0.76        55
weighted avg       0.78      0.78      0.78        55
 

Confusion Matrix: 
 [[13  8]
 [ 4 30]] 



## Grid Search

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [20]:
gbc_clf = GradientBoostingClassifier(random_state=42)

In [21]:
params_grid = {"max_depth": [3, None],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10]}

In [22]:
grid_search = GridSearchCV(gbc_clf, params_grid,
                          n_jobs=-1, cv=5,
                          verbose=1, scoring="accuracy")

In [23]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    9.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, None], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [24]:
grid_search.best_score_

0.8031496062992126

In [25]:
grid_search.best_estimator_.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [26]:
print_score(grid_search, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9134

Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.74      0.84        38
           1       0.90      0.99      0.94        89

   micro avg       0.91      0.91      0.91       127
   macro avg       0.93      0.86      0.89       127
weighted avg       0.92      0.91      0.91       127
 

Confusion Matrix: 
 [[28 10]
 [ 1 88]] 

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Average Accuracy: 	 0.7666
Accuracy SD: 		 0.1003


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.2s finished


In [27]:
print_score(grid_search, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.7818

Classification Report: 
               precision    recall  f1-score   support

           0       0.76      0.62      0.68        21
           1       0.79      0.88      0.83        34

   micro avg       0.78      0.78      0.78        55
   macro avg       0.78      0.75      0.76        55
weighted avg       0.78      0.78      0.78        55
 

Confusion Matrix: 
 [[13  8]
 [ 4 30]] 

