In [19]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,VotingRegressor,RandomForestRegressor,BaggingClassifier,BaggingRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from xgboost import XGBRegressor,XGBRFRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,SVR
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import StandardScaler
from seedtools import load_seed,dropper,mapper_auto

## CONTENTS 
-  BaggingClassifier (+variations)
- BaggingRegressor 


### About Hyperparamteres 
`boostrap = True` =>  row duplication <br /> 
<br />
`boostrap_features  = True` => feature duplication <br />
`max_samples :`
-  if float value => 1.0/0.25 use 100% rows , 0.25 rows 
-  if int value =>  700 , 400 use exactly 700 rows , 400 rows 

`max_features :`  or you can say ***columns***
-  if float value => 1.0/0.25 use 100% features , 0.25 features 
-  if int value =>  700 , 400 use exactly 700 features , 400 features 
- if  str like "log2" =>  log_2(n_features) | "sqrt" =>  sqrt(n_features)

### Bagging Tips
- Bagging generally gives better results than Pasting
- Good results come around the 25% to 50% row sampling mark
- Random patches and subspaces should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV


## BaggingClassifer 

In [20]:
df = load_seed("Social_Network_Ads.csv","v3",True).data

ss =  StandardScaler()
cols = ["Age","EstimatedSalary"]

for col  in cols:
    df[col]  = ss.fit_transform(df[col].values.reshape(-1,1))
    
x = df.drop(columns=["Purchased"]).values
y =  df["Purchased"].values

x_train,x_test,y_train,y_test =  train_test_split(x,y,test_size=0.2)
print("X train shape :",x_train.shape)
df.head(3)

X train shape : (320, 3)


Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,0,-1.781797,-1.490046,0
1,0,-0.253587,-1.460681,0
2,1,-1.113206,-0.78529,0


**Variation 1 : {bootstrap=True}**

In [21]:

bagging_C =  BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5, # 100% rows
    bootstrap=True, # row duplication,
    oob_score=True,
    max_features=1.0, # 100% columns 
    random_state=42
    
)
bagging_C.fit(x_train,y_train)
print("Score :",bagging_C.score(x_test,y_test))
print("OOB Score :",bagging_C.oob_score_) # we dont use r2 score for classification tasks 

Score : 0.925
OOB Score : 0.903125


**Variation 2 : {bootstrap=False}**

In [22]:
bagging2 =  BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5, # 50% samples
    bootstrap=False, # No row duplication 
    random_state=42
    
)
bagging2.fit(x_train,y_train)

print("Score :",bagging2.score(x_test,y_test))


Score : 0.9125


## BaggingRegressor

In [23]:
def score_model(model,model_name,given=False):
    if "model_" in (locals() or globals()):
        del model_
        
    
    model_ =  model() if given ==False else model 
    print(f"{model_name} :")
    model_.fit(x_train,y_train)
    score = model_.score(x_test,y_test)
    pred =  model_.predict(x_test)
    
    r2  = r2_score(y_test,pred)
    mse =  mean_squared_error(y_test,pred)
    
    print(f"|| Score : {score}")
    print(f"|| R2 Score : {r2}")
    print(f"|| MSE : {mse}")
    

In [24]:
cols_ =  ["odometer_value","engine_capacity"]
ss2 = StandardScaler()

cars  = load_seed("cars.csv","v2",quiet=True).data.dropna()[:9000]
for col  in cols_:
    cars[col]  = ss2.fit_transform(cars[col].values.reshape(-1,1))


x =  cars.drop(columns=['price_usd']).values
y =  cars["price_usd"].values
x_train,x_test,y_train,y_test =  train_test_split(x,y,test_size=0.2)
print("X train shape :",x_train.shape)
cars.head(3)

X train shape : (7200, 10)


Unnamed: 0,transmission,odometer_value,engine_fuel,engine_has_gas,engine_type,engine_capacity,has_warranty,state,drivetrain,price_usd,is_exchangeable
0,0,-0.375676,0,0,0,0.965421,0,0,0,10900.0,0
1,0,0.381466,0,0,0,1.826107,0,0,0,5000.0,1
2,0,1.229466,0,0,0,0.965421,0,0,0,2800.0,1


In [25]:
bgreg =  BaggingRegressor(random_state=1,bootstrap=True,bootstrap_features=False,max_features=1.0,max_samples=0.5)
score_model(bgreg,"Bagging_regressor",given=True)


score_model(DecisionTreeRegressor,"decision_tree_regressor")
score_model(RandomForestRegressor,"random_forest_regressor ")


Bagging_regressor :
|| Score : 0.5729725912400716
|| R2 Score : 0.5729725912400716
|| MSE : 12118118.130967489
decision_tree_regressor :
|| Score : 0.390125584918823
|| R2 Score : 0.390125584918823
|| MSE : 17306922.355335984
random_forest_regressor  :
|| Score : 0.5717453487076618
|| R2 Score : 0.5717453487076618
|| MSE : 12152944.630808042
