In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [15]:
# Synthetic regression problem: 10,000 samples, 10 features, 3 of them informative.
# random_state pins the generated data so the metrics below are reproducible
# after a kernel restart (the same seed, 42, is used for the split and the models).
X, y = make_regression(n_samples=10000, n_features=10, n_informative=3, random_state=42)

In [16]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Optional feature standardization, deliberately disabled: the code below sits
# inside a bare triple-quoted string, so none of it is executed.
# No cell shown in this notebook reads X_train_sc / X_test_sc.
# NOTE(review): if this is re-enabled, move the StandardScaler import to the
# imports cell at the top so all dependencies are declared in one place.
'''
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)
'''

In [23]:
# Baseline: a single decision tree with default hyperparameters (seeded),
# trained on the training split and scored on the held-out test split.
model_dt = DecisionTreeRegressor(random_state=42)
y_pred_dt = model_dt.fit(X_train, y_train).predict(X_test)

# Compute each metric once; RMSE is the square root of the MSE value.
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print("Decision Tree R2-Score: ", r2_dt)
print("Decision Tree MAE: ", mae_dt)
print("Decision Tree MSE: ", mse_dt)
print("Decision Tree RMSE: ", rmse_dt)

Decision Tree R2-Score:  0.9929727933548429
Decision Tree MAE:  6.283649468882851
Decision Tree MSE:  72.3250222630294
Decision Tree RMSE:  8.504411929288786


### **Bagging**

In [24]:
# Bagging: 500 decision trees, each fit on a bootstrap sample (drawn with
# replacement) sized at 50% of the training rows.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that works on both sides.
bag = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    random_state=42,
)

In [25]:
# Train the bagging ensemble on the training split. As the last expression in
# the cell, the call's return value is also what the notebook displays.
bag.fit(X_train, y_train)



In [26]:
# Predict the held-out test rows with the fitted bagging ensemble.
y_pred_bag = bag.predict(X_test)

In [27]:
# Test-set metrics for the bagged trees; RMSE is derived from the MSE value.
r2_bag = r2_score(y_test, y_pred_bag)
mae_bag = mean_absolute_error(y_test, y_pred_bag)
mse_bag = mean_squared_error(y_test, y_pred_bag)
rmse_bag = np.sqrt(mse_bag)

print("Bagged Tree R2-Score: ", r2_bag)
print("Bagged Tree MAE: ", mae_bag)
print("Bagged Tree MSE: ", mse_bag)
print("Bagged Tree RMSE: ", rmse_bag)

Bagged Tree R2-Score:  0.99745830018062
Bagged Tree MAE:  3.2638910073880316
Bagged Tree MSE:  26.159540384269384
Bagged Tree RMSE:  5.114639809827216


### **Random Forest**

In [28]:
# Random forest with 500 trees (seeded), trained on the training split and
# scored on the held-out test split.
model_rf = RandomForestRegressor(n_estimators=500, random_state=42)
y_pred_rf = model_rf.fit(X_train, y_train).predict(X_test)

# Compute each metric once; RMSE is the square root of the MSE value.
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print("RF R2-Score: ", r2_rf)
print("RF MAE: ", mae_rf)
print("RF MSE: ", mse_rf)
print("RF RMSE: ", rmse_rf)

RF R2-Score:  0.9979141823516834
RF MAE:  2.964903097636463
RF MSE:  21.467535461630444
RF RMSE:  4.633307184035011


### **Bagging using SVM**

In [29]:
# Bagging with support-vector regressors: 500 SVR models (default SVR settings),
# each fit on a bootstrap sample sized at 25% of the training rows.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that works on both sides.
bag_svm = BaggingRegressor(
    SVR(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [30]:
# Fit the SVR bagging ensemble and predict the held-out test rows.
y_pred_bagsvm = bag_svm.fit(X_train, y_train).predict(X_test)

# Compute each metric once; RMSE is the square root of the MSE value.
r2_bagsvm = r2_score(y_test, y_pred_bagsvm)
mae_bagsvm = mean_absolute_error(y_test, y_pred_bagsvm)
mse_bagsvm = mean_squared_error(y_test, y_pred_bagsvm)
rmse_bagsvm = np.sqrt(mse_bagsvm)

print("BR (SVR) R2-Score: ", r2_bagsvm)
print("BR (SVR) MAE: ", mae_bagsvm)
print("BR (SVR) MSE: ", mse_bagsvm)
print("BR (SVR) RMSE: ", rmse_bagsvm)



BR (SVR) R2-Score:  0.6752840698785681
BR (SVR) MAE:  41.4348620720922
BR (SVR) MSE:  3342.023091263138
BR (SVR) RMSE:  57.810233447575165


### **Pasting**

In [31]:
# Pasting: same ensemble class as bagging, but bootstrap=False draws each
# tree's 25% row subset without replacement. 500 decision trees, seeded.
# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that works on both sides.
pasting = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
)

In [32]:
# Fit the pasting ensemble and predict the held-out test rows.
y_pred_pasting = pasting.fit(X_train, y_train).predict(X_test)

# Compute each metric once; RMSE is the square root of the MSE value.
r2_pasting = r2_score(y_test, y_pred_pasting)
mae_pasting = mean_absolute_error(y_test, y_pred_pasting)
mse_pasting = mean_squared_error(y_test, y_pred_pasting)
rmse_pasting = np.sqrt(mse_pasting)

print("Pasting R2-Score: ", r2_pasting)
print("Pasting MAE: ", mae_pasting)
print("Pasting MSE: ", mse_pasting)
print("Pasting RMSE: ", rmse_pasting)



Pasting R2-Score:  0.9966829682329803
Pasting MAE:  3.7799551172311303
Pasting MSE:  34.139368387893875
Pasting RMSE:  5.84289041381865


Takeaways:

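- In this run, Random Forest scored slightly better than the bagged trees, which in turn scored better than pasting. Note that bagging used `max_samples=0.5` while pasting used `max_samples=0.25`, so the bagging-vs-pasting gap also reflects the different sample sizes, not only sampling with vs. without replacement.
- Good results were obtained with row sampling (`max_samples`) of around 25% to 50%.
- To find the best parameters, we need to do hyperparameter optimization.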