<a href="https://colab.research.google.com/github/VIHAN-droid/AI-ML-PROJECT/blob/main/ENSEMBLE%20LEARNING/BAGGING/BAGGING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Synthetic classification data: 10,000 rows (samples) x 10 columns (features),
# 3 of which are informative.
# random_state pins the generated data so every score further down is
# reproducible after Restart & Run All.
x, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)

In [None]:
# Baseline: a single decision tree with default settings, used as the
# reference point for the bagging ensembles below.
# random_state fixes the tree's internal randomness so the score is repeatable.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
accuracy_score(y_test, y_pred)

0.946

# BAGGING

In [59]:
# Row sampling: each tree is trained on its own random sample of the rows.
#   estimator    -> the base algorithm every ensemble member uses
#   n_estimators -> how many base estimators are trained
#   max_samples  -> fraction of the training rows drawn for each estimator
#   bootstrap    -> True draws rows with replacement, so a row can repeat
#   random_state -> seeds the sampling so the result is reproducible
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [None]:
# Train the ensemble, then score its predictions on the held-out rows.
y_pred = bag.fit(x_train, y_train).predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.963

In [None]:
bag.estimators_samples_[0]  # indices of the training rows that the first estimator received

array([8637, 4787, 5956, ..., 1244, 6901, 8464])

In [None]:
# The cells above demonstrated ROW sampling.
# For COLUMN sampling use: max_samples=1.0, bootstrap=False, max_features=0.3, bootstrap_features=True

In [60]:
# Column sampling: every tree sees all rows (max_samples=1.0, bootstrap=False)
# but only 30% of the feature columns, drawn with replacement
# (bootstrap_features=True).
# random_state seeds the feature sampling so the score below is reproducible.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    max_features=0.3,
    bootstrap_features=True,
    random_state=42,
)

In [61]:
# Fit the column-sampling ensemble and measure accuracy on the test split.
y_pred = bag.fit(x_train, y_train).predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.929

# OOB SCORE

In [None]:
# With bootstrap sampling, each estimator is trained on only part of the training rows.
# The rows a given estimator never saw are its out-of-bag (OOB) samples; they can be used
# to evaluate the ensemble without setting aside a separate validation set.

In [None]:
# oob_score=True asks the ensemble to also score itself on the out-of-bag rows
# during fit(); the result is read back from `oob_score_`.
# random_state seeds the row sampling so that value is repeatable.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
bag.fit(x_train, y_train)
bag.oob_score_

0.9655555555555555

In [62]:
# Accuracy on the held-out test rows, for comparison with the OOB estimate.
# NOTE(review): the saved output (0.929) equals the column-sampling result, so
# this cell may have been executed against that model — re-run after the OOB cell.
y_pred = bag.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.929

# BAGGING TIPS

In [None]:
'''
1) Bagging (sampling with replacement) usually gives better results than pasting (sampling without replacement)
2) Good results usually come from sampling roughly 25% to 50% of the rows per estimator
3) Use column sampling when dealing with high-dimensional data
'''

# 4) To find good hyper-parameters, use GridSearchCV / RandomizedSearchCV

# GRID SEARCH CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the bagging ensemble: 3 * 4 * 2 * 4 = 96 combinations.
parameters = {
    'n_estimators': [50, 100, 500],
    'max_samples': [0.1, 0.4, 0.7, 1.0],
    'bootstrap': [True, False],
    'max_features': [0.1, 0.4, 0.7, 1.0],
}

In [88]:
# Exhaustive 3-fold cross-validated search over every combination in `parameters`.
# random_state on the base BaggingClassifier makes each candidate's score, and
# therefore the selected best_params_, reproducible between runs.
# n_jobs=-1 runs the many fits in parallel on all available cores.
search = GridSearchCV(
    estimator=BaggingClassifier(random_state=42),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)
search.fit(x_train, y_train)

In [89]:
search.best_estimator_  # the estimator the grid search selected as best

In [90]:
search.best_params_  # parameter combination that scored best in the search

{'bootstrap': True,
 'max_features': 0.7,
 'max_samples': 1.0,
 'n_estimators': 100}

In [91]:
search.best_score_  # mean cross-validated score (cv=3) of the best parameter combination shown above; not a test-set score

np.float64(0.9813650128115535)

# BAGGING REGRESSOR

In [71]:
from sklearn.datasets import load_wine

In [79]:
# Load the wine data as pandas objects. This rebinds x and y, replacing the
# synthetic classification data used in the earlier sections.
x, y = load_wine(as_frame=True, return_X_y=True)
x = x.iloc[:, :11]  # keep only the first 11 feature columns (alcohol ... hue)
x.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04


In [75]:
# Preview the first rows of the target.
# NOTE(review): the values look like integer class labels, yet the cells below
# fit regressors to them — confirm that treating the target as numeric is intended.
y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [77]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [83]:
# 90/10 split of the wine data (seeded), then three candidate regressors to compare.
# This rebinds x_train / x_test / y_train / y_test from the classification section.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=3)
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)  # seeded so the tree and its R^2 are reproducible
knn = KNeighborsRegressor()

In [84]:
# Fit each regressor on the training rows and predict the held-out rows.
y_pred1 = lr.fit(x_train, y_train).predict(x_test)
y_pred2 = dt.fit(x_train, y_train).predict(x_test)
y_pred3 = knn.fit(x_train, y_train).predict(x_test)

# One R^2 per line, in the order: linear regression, decision tree, k-NN.
print(
    *(r2_score(y_test, preds) for preds in (y_pred1, y_pred2, y_pred3)),
    sep="\n",
)

0.800278938641717
0.8461538461538461
0.6861538461538461


In [87]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with default hyper-parameters; only random_state is set, so
# the ensemble's random sampling (and therefore the R^2 below) is reproducible.
bag = BaggingRegressor(random_state=42)
bag.fit(x_train, y_train)
y_pred_bag = bag.predict(x_test)
r2_score(y_test, y_pred_bag)

0.9261538461538461

In [None]:
# Tip: GridSearchCV can be used here as well to find the best BaggingRegressor parameters