In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn import datasets,metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Boosting algorithms (with a random forest baseline) on the breast cancer data

In [17]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn.
cancer = datasets.load_breast_cancer()

In [4]:
# Show the dataset's built-in description text.
print(cancer["DESCR"])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [5]:
# Feature matrix and label vector of the breast cancer dataset.
x, y = cancer.data, cancer.target
# Display both shapes to confirm rows line up between features and labels.
(x.shape, y.shape)

((569, 30), (569,))

In [6]:
# Standardise every feature column of x.
# NOTE(review): the statistics are computed over all rows of x; no train/test
# split has happened yet at this point in the notebook.
scaler = StandardScaler().fit(x)
x_scale = scaler.transform(x)
x_scale

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [7]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb

In [9]:
# Split the RAW features first so the held-out rows never influence preprocessing.
# Same test_size / random_state / stratify as before, so the partition of rows
# is unchanged; only the scaling statistics differ.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1, stratify=y
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits. Fitting on all rows would leak test-set mean/std into training.
train_scaler = StandardScaler().fit(x_train_raw)
x_train = train_scaler.transform(x_train_raw)
x_test = train_scaler.transform(x_test_raw)

# Sanity check: every row ended up in exactly one split.
assert len(x_train) + len(x_test) == len(x)

In [11]:
# Hyper-parameters shared by all four ensembles so the comparison is like-for-like.
ensemble_params = {"n_estimators": 200, "random_state": 1}
# The three boosting models additionally share the same learning rate.
boosting_params = {**ensemble_params, "learning_rate": 0.01}

rfc = RandomForestClassifier(**ensemble_params)
abc = AdaBoostClassifier(**boosting_params)
gbc = GradientBoostingClassifier(**boosting_params)
xgb_clf = xgb.XGBClassifier(**boosting_params)

In [12]:
# Fit the scikit-learn ensembles in the same order as before.
for model in (rfc, abc, gbc):
    model.fit(x_train, y_train)

# Fitted last and left as the final expression so the cell still displays
# the XGBoost estimator's parameters.
xgb_clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# Test-set accuracy of each fitted model, printed one per line.
fitted_models = (
    ('Random Forest:', rfc),
    ('Adaboost:', abc),
    ('GradientBoost:', gbc),
    ('XGBoost:', xgb_clf),
)
for label, model in fitted_models:
    print(label, model.score(x_test, y_test))

Random Forest: 0.9473684210526315
Adaboost: 0.9473684210526315
GradientBoost: 0.9736842105263158
XGBoost: 0.9649122807017544


Bagging with iris data

In [30]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [18]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [19]:
# Rebinds x and y (previously the breast cancer arrays) to the iris features
# and labels; everything below this cell works on iris.
x, y = iris.data, iris.target

In [21]:
# Shapes of the iris features and labels.
(x.shape, y.shape)

((150, 4), (150,))

In [22]:
# Inflate the iris data by repeating every row 200 times, so that fitting is
# slow enough for the timing comparison below to be meaningful.
# Built from iris.data / iris.target rather than from x / y: the original
# `x = np.repeat(x, ...)` multiplied the data again on every re-run of the cell,
# whereas this form always yields the same 200x-sized arrays.
x = np.repeat(iris.data, repeats=200, axis=0)
y = np.repeat(iris.target, repeats=200, axis=0)

In [23]:
# Shapes after the rows were repeated.
(x.shape, y.shape)

((30000, 4), (30000,))

Train without bagging (a single SVC fitted on all samples)

In [33]:
%%time
clf=SVC(kernel='linear',probability=True,class_weight='balanced')
clf.fit(x,y)
print('SVC:',clf.score(x,y))

SVC: 0.98
Wall time: 2.02 s


Train with bagging (10 SVCs, each fitted on 1/10 of the samples)

In [34]:
%%time
n_estimators=10
clf=BaggingClassifier(SVC(kernel='linear',probability=True,class_weight='balanced'),n_estimators=n_estimators,max_samples=1.0/n_estimators)
clf.fit(x,y)
print('SVC:',clf.score(x,y))

SVC: 0.98
Wall time: 1 s
