# Boosting (Hypothesis Boosting)

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides warnings raised by pandas or
# scikit-learn in later cells; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the 'titanic' example dataset through seaborn and preview its first five rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Drop, in place, every row that has a missing value in ANY column.
# NOTE(review): the preview above shows `deck` missing for rows 0, 2 and 4, and those rows no
# longer appear in X.head() further down. The model only uses pclass, sex, age and survived,
# so rows are being discarded because of columns that are never used. Consider restricting
# the filter with dropna(subset=[...]) to keep more data.
df.dropna(inplace=True)

## Data Pre-Processing

In [4]:
# Select the three model features: passenger class, sex and age.
# NOTE(review): the next cell assigns X from the same columns again, so this cell is redundant.
X = df[['pclass', 'sex', 'age']]

In [5]:
# Feature matrix: passenger class, sex and age. The explicit .copy() gives X its own data, so
# the column assignment in the next cell modifies X only and does not raise pandas'
# SettingWithCopyWarning (which the global warnings filter would otherwise hide).
X = df[['pclass', 'sex', 'age']].copy()
from sklearn import preprocessing
# Encoder used in the next cell to turn the two-valued `sex` column into 0/1.
lb = preprocessing.LabelBinarizer()

In [6]:
# Replace the string `sex` column with LabelBinarizer's numeric encoding
# (the X.head() output below shows row 1, a female passenger, encoded as 0).
X['sex'] = lb.fit_transform(X['sex'])

In [7]:
# Target labels: the `survived` column (0/1 integers, see y.head() below).
y=df['survived']

In [8]:
# Preview the feature matrix after encoding `sex`.
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [9]:
# Preview the target; its index should line up with X.head() above.
y.head()

1     1
3     1
6     0
10    1
11    1
Name: survived, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle, and therefore
# every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Parameters
    ----------
    clf : estimator
        Already-fitted classifier exposing ``predict``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    train : bool, default True
        If truthy, report on the training split and additionally print the mean
        and standard deviation of 10-fold cross-validated accuracy computed on
        the training data. Any falsy value reports on the test split instead.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if train:
        header, features, labels = "Train Results:\n", X_train, y_train
    else:
        header, features, labels = "Test Results:\n", X_test, y_test

    # Predict once and reuse the result for all three metrics instead of
    # re-running the (potentially expensive) ensemble for each of them.
    predictions = clf.predict(features)

    print(header)
    print("Accuracy Score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report: \n {} \n".format(classification_report(labels, predictions)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(labels, predictions)))

    if train:
        # Cross-validation gives a less optimistic estimate than the
        # resubstitution accuracy printed above.
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [19]:
# AdaBoost with library defaults; the fitted repr below reports base_estimator=None,
# n_estimators=50, learning_rate=1.0 and algorithm='SAMME.R'.
ada_clf = AdaBoostClassifier()

In [20]:
# Fit on the training split only; the test split stays held out for evaluation.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [21]:
# Training-set metrics plus 10-fold cross-validated accuracy on the training data.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.8661

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.66      0.76        41
           1       0.86      0.97      0.91        86

   micro avg       0.87      0.87      0.87       127
   macro avg       0.88      0.81      0.83       127
weighted avg       0.87      0.87      0.86       127
 

Confusion Matrix: 
 [[27 14]
 [ 3 83]] 

Average Accuracy: 	 0.6861
Accuracy SD: 		 0.0867


In [22]:
# Held-out test-set metrics for the same fitted model.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.6545

Classification Report: 
               precision    recall  f1-score   support

           0       0.46      0.33      0.39        18
           1       0.71      0.81      0.76        37

   micro avg       0.65      0.65      0.65        55
   macro avg       0.59      0.57      0.57        55
weighted avg       0.63      0.65      0.64        55
 

Confusion Matrix: 
 [[ 6 12]
 [ 7 30]] 



## AdaBoost with Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier 

In [29]:
# Boost random forests instead of the default base estimator.
# NOTE(review): this requests up to 1000 boosting rounds, each fitting a 1000-tree forest, and
# print_score(train=True) runs 10-fold cross-validation on top of that, so expect a very
# long runtime on a fresh run. Confirm these sizes are intentional.
ada_clf = AdaBoostClassifier(RandomForestClassifier(n_estimators=1000),n_estimators=1000)

In [30]:
# Fit the boosted-forest ensemble on the training split.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=1000, random_state=None)

In [26]:
# Training-set metrics plus 10-fold cross-validated accuracy for the boosted forest.
# NOTE(review): this cell's execution count (26) is lower than that of the cells that build
# and fit the model above (29, 30), so the output shown here may come from an earlier model.
# Re-run top to bottom before relying on these numbers.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 0.9606

Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        41
           1       0.97      0.98      0.97        86

   micro avg       0.96      0.96      0.96       127
   macro avg       0.96      0.95      0.95       127
weighted avg       0.96      0.96      0.96       127
 

Confusion Matrix: 
 [[38  3]
 [ 2 84]] 

Average Accuracy: 	 0.7473
Accuracy SD: 		 0.0913


In [31]:
# Held-out test-set metrics for the boosted forest.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.7091

Classification Report: 
               precision    recall  f1-score   support

           0       0.55      0.67      0.60        18
           1       0.82      0.73      0.77        37

   micro avg       0.71      0.71      0.71        55
   macro avg       0.68      0.70      0.69        55
weighted avg       0.73      0.71      0.72        55
 

Confusion Matrix: 
 [[12  6]
 [10 27]] 

