# Bagging

Temeli, bootstrap yöntemi ile oluşturulan birden fazla karar ağacının ürettiği tahminlerin bir araya getirilerek değerlendirilmesine dayanır.

1996, Breiman

-------

# 1. Model (Bagged)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Read the raw Hitters table; `hit` stays untouched as the original copy.
hit = pd.read_csv("Hitters.csv")

# Work on a copy from which every row containing a missing value is removed.
df = hit.copy().dropna()

df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [3]:
# One-hot encode the three categorical columns; each category becomes its
# own indicator column (e.g. League_A / League_N).
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

dms.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [4]:
# Numeric predictors only: remove the target and the raw categorical
# columns, then cast everything that is left to float64.
X_ = df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64")

X_.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0


In [5]:
# Final feature matrix: numeric predictors plus one indicator column per
# categorical variable (the complementary *_A / *_E dummies are left out).
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis = 1)

X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


In [6]:
# Regression target: the Salary column.
y = df["Salary"]

y.head()

1    475.0
2    480.0
3    500.0
4     91.5
5    750.0
Name: Salary, dtype: float64

In [7]:
# Hold out 25% of the rows as a test set; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [8]:
# Bagging ensemble with otherwise default settings; bootstrap_features=True
# asks for the features of each estimator to be drawn with replacement too.
# No random_state is passed, so the fitted trees differ from run to run.
bag_model = BaggingRegressor(bootstrap_features = True)

bag_model.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=True,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [9]:
# Number of base estimators the ensemble was configured with.
bag_model.n_estimators

10

In [10]:
bag_model.estimators_ # 10 distinct trees have been fitted.

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=766210766, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=1499816503, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_w

In [14]:
bag_model.estimators_samples_  # the sample indices drawn for each tree

[array([ 29, 126, 112, 160,  50,  77,   3, 116, 183, 160, 143, 111, 133,
        126,  63, 110,  91,  10, 138,  79,  23,  71,  73,  21,  93,  39,
        107,  55, 124,  70,  38, 120, 105,  68,  71,   2, 136,   6,  72,
         92,  16,  10,  91, 109, 146, 129, 115, 117, 170,  59,  84, 141,
          5,  26, 192,  78,  44, 133, 129, 191, 173,  44, 101, 100, 162,
         74, 127, 165,  32,  12, 158,  71,  47,  94, 192, 120,  36,  74,
         66, 182,  63, 138, 102,  70, 169,  34, 194, 105,  23, 136, 182,
         25, 188,  23, 182,  55, 149,  99,  18,  47, 127, 131,  48, 195,
          1,  53, 127,  23, 145,  19, 145, 123,  54, 161,  81,  79, 153,
        108, 163, 171, 179,  71, 167, 110, 102, 122,  52,  61,  31,  89,
        177, 111, 109,  80,  24, 123,  22, 140,  74,  63,   1,  25, 134,
         76,  91,  66,  96,  65,  77, 124, 165, 196, 134, 177, 146,  86,
        140,  10,  48, 149,   7,  56,  35,  91,  34, 172,  41, 104, 184,
        169, 120, 182, 140,  28, 184,  24, 163,  45

In [15]:
bag_model.estimators_features_  # the feature (independent variable) indices used by each tree

[array([ 4, 18,  1, 16, 16,  9,  4,  0,  7, 15,  3, 18,  6,  0, 15,  3,  5,
         8, 16]),
 array([ 3,  0,  5, 16,  5,  3, 13,  7,  5,  6, 17,  4, 18, 15, 16,  0,  1,
        14,  9]),
 array([ 4,  9,  4,  9,  1, 12,  7,  2, 12, 13,  5, 17, 18,  8, 14,  8,  7,
         9,  9]),
 array([ 2,  2, 12,  6, 10,  1,  1,  2,  1,  2,  4,  2,  0,  3,  9, 11,  8,
        17,  9]),
 array([12, 10,  3, 12,  6, 13,  5,  5, 14, 17, 15,  8, 13, 17,  3, 12, 16,
         9, 10]),
 array([ 6, 15,  0, 14, 11,  3,  1,  8, 13,  9,  9, 16, 18,  4,  7,  0,  9,
        18,  4]),
 array([10, 13,  6,  2, 15, 14, 17,  6,  8,  7,  0, 15,  1,  6,  6, 14, 17,
        18,  7]),
 array([ 7,  5,  2,  2,  3, 16, 13,  3,  9,  9,  3, 18, 10,  3,  4,  5, 16,
         9,  7]),
 array([ 7, 18,  7, 13, 15,  2, 18, 15, 11,  2, 17,  3, 13, 17, 17,  2, 11,
         2,  3]),
 array([13, 17,  0,  5, 10,  3, 11,  2, 15, 12, 16,  0,  7, 12, 15, 17, 13,
         3,  0])]

In [16]:
# Inspect the first fitted tree of the ensemble.
bag_model.estimators_[0]

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=766210766, splitter='best')

In [17]:
# Inspect the second fitted tree of the ensemble.
bag_model.estimators_[1]

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1499816503, splitter='best')

------

# 2. Tahmin (Bagged)

In [18]:
# Ensemble predictions for the held-out test rows.
bag_model.predict(X_test)

array([ 658.75  , 1009.6666, 1112.    ,  155.35  ,  574.3333,  317.75  ,
        215.5   ,  125.9   ,  695.    ,  557.    ,  923.5   ,  903.3333,
        460.    ,  158.1667,  380.5   ,  837.6666,  917.8333,   99.5   ,
        835.2   ,  246.5   ,  443.5   ,  962.75  ,  891.6666,  818.1667,
        351.2   ,   73.55  , 1008.0001,  431.    ,  555.5   ,   94.5   ,
         81.5   ,  750.5   ,  495.6667,  229.75  ,  268.1667,  828.5832,
       1224.8904,  238.35  ,  158.5   ,  481.    ,   86.75  ,  132.45  ,
        683.4999,  732.    , 1300.    ,  973.0833,  442.    ,  106.15  ,
        160.5   ,  805.3333,  799.9166,  503.5001,  786.5   ,  180.    ,
        104.45  ,  722.4166,  647.5   ,  644.1666,  262.6667, 1249.3071,
       1132.4738,  462.5   , 1038.    ,  417.5   ,  677.75  ,   78.5   ])

In [19]:
# Keep the ensemble's test-set predictions for scoring.
y_pred_test = bag_model.predict(X_test)

In [21]:
# Test RMSE of the bagged ensemble.
np.sqrt(mean_squared_error(y_test, y_pred_test))

347.4824455159753

In [29]:
# Score a single tree (index 1) on its own.
# NOTE: .fit() is called on the estimator object stored inside bag_model,
# so that tree is retrained on the full X_train / y_train (all columns)
# rather than on the bootstrap sample it got during bagging.
bir_y_pred = bag_model.estimators_[1].fit(X_train, y_train).predict(X_test)

# Score the prediction computed above (the original referenced an undefined
# name, `iki_y_pred`, which fails on a fresh kernel).
np.sqrt(mean_squared_error(y_test, bir_y_pred))  # a weak tree on its own :D

441.95505113795053

In [26]:
# Score a single tree (index 6) on its own.
# NOTE: .fit() is called on the estimator object stored inside bag_model,
# so that tree is retrained on the full X_train / y_train (all columns)
# rather than on the bootstrap sample it got during bagging.
alti_y_pred = bag_model.estimators_[6].fit(X_train, y_train).predict(X_test)

# Score the prediction computed above (the original referenced an undefined
# name, `yedi_y_pred`, which fails on a fresh kernel).
np.sqrt(mean_squared_error(y_test, alti_y_pred))  # a weak tree on its own :D

466.8847325687605

In [28]:
# Score a single tree (index 0) on its own; .fit() is called on the
# estimator stored inside bag_model, using the full X_train / y_train.
sifir_y_pred = bag_model.estimators_[0].fit(X_train, y_train).predict(X_test)

np.sqrt(mean_squared_error(y_test, sifir_y_pred))  # a weak tree on its own :D

464.198292420759

In [32]:
# Test RMSE of every tree in the ensemble, scored individually.
baggedList = []

# Iterate over the fitted estimators themselves instead of a hard-coded
# range(0, 10), so the loop follows whatever n_estimators was used.
# A dedicated local name is used for the per-tree prediction so the
# ensemble's `y_pred_test` is not overwritten.
# NOTE: .fit() retrains each stored tree on the full X_train / y_train.
for tree in bag_model.estimators_:
    tree_pred = tree.fit(X_train, y_train).predict(X_test)
    baggedList.append(np.sqrt(mean_squared_error(y_test, tree_pred)))

# Report one line per tree: "<index>.Ağaç : <rmse>".
for i, rmse in enumerate(baggedList):
    print("{}.Ağaç : {}".format(i, rmse))


0.Ağaç : 464.198292420759
1.Ağaç : 441.95505113795053
2.Ağaç : 512.5510627689754
3.Ağaç : 452.4953526696393
4.Ağaç : 500.33661495207105
5.Ağaç : 497.48516596744025
6.Ağaç : 466.8847325687605
7.Ağaç : 464.94118703895975
8.Ağaç : 453.1948378991027
9.Ağaç : 430.6566776123834


-------

# 3. Model Doğrulama / Model Tuning (Bagged)

In [33]:
# Fresh bagging model (features bootstrapped as well) that serves as the
# base estimator for the hyper-parameter search below.
bag_model = BaggingRegressor(bootstrap_features = True)

bag_model.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=True,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [36]:
# Candidate ensemble sizes: 2, 3, ..., 19.
bag_params = {"n_estimators" : range(2, 20)}

# Exhaustive search over the grid with 10-fold cross-validation; no
# `scoring` argument is given, so the search uses its default scoring.
bag_cv_model = GridSearchCV(bag_model, bag_params, cv = 10)

bag_cv_model.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=BaggingRegressor(base_estimator=None, bootstrap=True,
                                        bootstrap_features=True,
                                        max_features=1.0, max_samples=1.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False),
             iid='warn', n_jobs=None, param_grid={'n_estimators': range(2, 20)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [37]:
# Parameter combination that scored best in the cross-validated search.
bag_cv_model.best_params_

{'n_estimators': 19}

In [38]:
# final model
# n_estimators=19 is the value reported by the grid search; it is also the
# largest value in the searched range(2, 20).
# NOTE(review): bootstrap_features=True was set on the model that was
# grid-searched but is not passed here — confirm whether that is intended.

bag_tuned = BaggingRegressor(n_estimators = 19, random_state = 45)

bag_tuned.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=19,
                 n_jobs=None, oob_score=False, random_state=45, verbose=0,
                 warm_start=False)

In [39]:
# Test-set predictions of the tuned ensemble.
bag_tuned.predict(X_test)

array([ 803.52631579,  876.53505263, 1111.15789474,  130.39473684,
        528.42105263,  346.89473684,  199.84210526,  111.55263158,
        867.01752632,  526.66663158,  736.52631579,  979.98615789,
        554.47368421,  175.26315789,  375.65789474,  977.28068421,
       1003.37721053,  100.        ,  872.64910526,  267.89473684,
        386.75436842, 1075.65789474,  708.15789474,  727.193     ,
        462.63157895,   80.21052632,  979.95615789,  286.79826316,
        635.21931579,   90.05263158,   85.86842105,  696.97368421,
        489.42989474,  303.81578947,  258.        ,  729.86842105,
       1408.96363158,  220.5       ,  123.68421053,  637.85084211,
         81.        ,  130.23684211,  752.807     ,  706.85963158,
       1240.13157895,  730.57015789,  434.64915789,  102.71052632,
        180.89473684,  799.34210526,  871.53505263,  629.91231579,
        787.78068421,  203.28947368,   99.71052632,  863.54384211,
        683.50873684,  658.15784211,  206.05268421, 1298.34957

In [40]:
# Store the tuned model's test-set predictions ...
y_pred_test = bag_tuned.predict(X_test)

# ... and report their RMSE against the true salaries.
np.sqrt(mean_squared_error(y_test, y_pred_test))

332.4260086061929

---------------