# Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

# Load dataset

In [2]:
data=load_diabetes()

In [4]:
feature=data.data
target=data.target

# Train test split

In [50]:
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

In [27]:
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=32)

In [28]:
x_train.shape

(353, 10)

In [29]:
y_train.shape

(353,)

In [30]:
x_test.shape

(89, 10)

In [31]:
y_test.shape

(89,)

In [32]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [33]:
lr=LinearRegression()
dt=DecisionTreeRegressor()
knn=KNeighborsRegressor()

In [35]:
lr_score=cross_val_score(lr,x_train,y_train,cv=5,scoring='r2').mean()
dt_score=cross_val_score(dt,x_train,y_train,cv=5,scoring='r2').mean()
knn_score=cross_val_score(knn,x_train,y_train,cv=5,scoring='r2').mean()

In [37]:
print(f"LinearRegression {lr_score}")
print(f"DecessionTree {dt_score}")
print(f"Knn {knn_score}")

LinearRegression 0.4591961816869198
DecessionTree -0.17284882081371342
Knn 0.3175615152439928


**We see that `LinearRegression` can give best result now apply bagging**

In [43]:
bag=BaggingRegressor(estimator=dt)
bag_score=cross_val_score(bag,x_train,y_train,cv=5,scoring='r2').mean()

In [44]:
bag_score.mean()

0.331426752203053

In [45]:
bag2=BaggingRegressor(estimator=lr)
bag_score2=cross_val_score(bag2,x_train,y_train,cv=5,scoring='r2').mean()

In [47]:
bag_score2

0.45006327693014825

In [46]:
bag3=BaggingRegressor(estimator=knn)
bag_score3=cross_val_score(bag3,x_train,y_train,cv=5,scoring='r2').mean()

In [48]:
bag_score3

0.34054823932651174

**We see that using bagging with `DecessionTree` we will get `33% score`**

# Hyperparameter Tunning with `DecessionTree`

In [57]:
params={
    'n_estimators':[10,20,30,50,100],
    'estimator':[lr,dt,knn],
    'max_samples':[0.5,1.0],
    'bootstrap':[True,False],
    'bootstrap_features':[True,False],
    'oob_score':[True,False],
    'warm_start':[True,False]
}

In [58]:
bag_=BaggingRegressor()

In [59]:
grid_search=GridSearchCV(bag_,param_grid=params,scoring='r2',cv=5,n_jobs=-1,verbose=1)
grid_search

In [60]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


900 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Laptop House\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Laptop House\anaconda3\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Laptop House\anaconda3\Lib\site-packages\sklearn\ensemble\_bagging.py", line 338, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
           ^^^^^^^^^^^^^^^^

In [62]:
print(f"Best estimators is {grid_search.best_params_}")

Best estimators is {'bootstrap': True, 'bootstrap_features': False, 'estimator': LinearRegression(), 'max_samples': 1.0, 'n_estimators': 30, 'oob_score': False, 'warm_start': True}


In [63]:
print(f"Best score is {grid_search.best_score_:.4f}")

Best score is 0.4645


**We can see that we can get best result at `LinearRegression estimator`**

In [65]:
final=BaggingRegressor(estimator=lr,max_samples=1.0,n_estimators=30,oob_score=False,warm_start=True,bootstrap=True,bootstrap_features=False)
final

In [66]:
final.fit(x_train,y_train)

In [68]:
pre=final.predict(x_test)

In [70]:
r2_score(y_test,pre)

0.4554876221138515