In [7]:
import pandas as pd
import numpy as np

# Location of the Boston housing table, fetched as whitespace-separated text.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a plain string literal is an invalid escape sequence and
# triggers a warning on current Python versions.
# skiprows=22 drops the first 22 lines of the file; header=None because no
# line is used as column names.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# The code below treats every record as two consecutive rows of raw_df:
#   - even rows contribute all of their columns,
#   - odd rows contribute their first two columns as further features,
#   - the third column of each odd row is the regression target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Aliases used by the modelling cells further down.
X_boston = data
Y_boston = target

print('Manually loaded dataset shape:', data.shape)
print('Target shape:', target.shape)


Manually loaded dataset shape: (506, 13)
Target shape: (506,)


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X_boston,
    Y_boston,
    train_size=0.80,
    test_size=0.20,
    random_state=123,
)
X_train, X_test, Y_train, Y_test = split_parts
# Shapes are printed in the order X_train, X_test, Y_train, Y_test.
print('Train/Test Sets Sizes : ', *(part.shape for part in split_parts))
     

Train/Test Sets Sizes :  (404, 13) (102, 13) (404,) (102,)


In [12]:
# Three baseline regressors, each with its default hyper-parameters.
lr = LinearRegression()
# Seeded so the tree is the same on every run: scikit-learn permutes the
# candidate features at each split, so an unseeded tree (and its R^2 score)
# can differ between executions. Re-run the cells below after this change so
# their recorded scores match the seeded model.
dt = DecisionTreeRegressor(random_state=123)
knn = KNeighborsRegressor()
     


In [15]:
# Train every baseline on the same training split.
for baseline in (lr, dt):
    baseline.fit(X_train, Y_train)
# Kept as the final bare expression so the cell still displays the fitted KNN model.
knn.fit(X_train, Y_train)
     

In [16]:
# Bare expression: constructs a KNeighborsRegressor from the keyword values
# below and leaves it as the cell's displayed value. Nothing is assigned or
# fitted here.
# NOTE(review): this reads like the echoed repr of `knn` rather than intended
# code - confirm whether the cell is needed.
KNeighborsRegressor(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,
    n_neighbors=5,
    p=2,
    weights='uniform',
)

In [17]:
# Test-set predictions, in order: linear regression, decision tree, k-nearest neighbours.
y_pred1, y_pred2, y_pred3 = (
    fitted.predict(X_test) for fitted in (lr, dt, knn)
)

In [18]:
# Report the test-set R^2 of each baseline, one line per model.
for caption, predicted in (
    ("R^2 score for LR", y_pred1),
    ("R^2 score for DT", y_pred2),
    ("R^2 score for KNN", y_pred3),
):
    print(caption, r2_score(Y_test, predicted))

R^2 score for LR 0.6592466510354087
R^2 score for DT 0.42808590041529637
R^2 score for KNN 0.5475962186976784


In [19]:
from sklearn.ensemble import BaggingRegressor

In [20]:
# Bagging ensemble with default settings and a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
bag_regressor = BaggingRegressor(random_state=1).fit(X_train, Y_train)
# Bare expression so the cell still displays the fitted ensemble.
bag_regressor

In [21]:
# Bare expression: constructs a BaggingRegressor from the keyword values below
# and leaves it as the cell's displayed value. Nothing is assigned or fitted
# here.
# NOTE(review): this reads like the echoed repr of `bag_regressor` rather than
# intended code - confirm whether the cell is needed.
BaggingRegressor(
    estimator=None,
    bootstrap=True,
    bootstrap_features=False,
    max_features=1.0,
    max_samples=1.0,
    n_estimators=10,
    n_jobs=None,
    oob_score=False,
    random_state=1,
    verbose=0,
    warm_start=False,
)

In [22]:
# Test-set predictions from the fitted bagging ensemble.
# NOTE(review): Y_preds is not referenced again in the cells visible here; the
# scores below come from bag_regressor.score(...) - confirm it is still needed.
Y_preds = bag_regressor.predict(X_test)

In [23]:
# Score the bagging ensemble on both splits first, then report the two values.
train_r2 = bag_regressor.score(X_train, Y_train)
test_r2 = bag_regressor.score(X_test, Y_test)
print('Training Coefficient of R^2 : %.3f' % train_r2)
print('Test Coefficient of R^2 : %.3f' % test_r2)

Training Coefficient of R^2 : 0.980
Test Coefficient of R^2 : 0.818


In [25]:
# Hyper-parameter grid for the bagging ensemble:
# 3 * 3 * 2 * 2 * 2 * 2 = 144 candidate combinations.
params = {
    # Base models to be bagged; None selects BaggingRegressor's default base
    # model (a decision tree, per the scikit-learn documentation).
    'estimator': [None, LinearRegression(), KNeighborsRegressor()],
    'n_estimators': [20, 50, 100],       # Number of base estimators in the ensemble
    'max_samples': [0.5, 1.0],           # Fraction of samples used for fitting each base estimator
    'max_features': [0.5, 1.0],          # Fraction of features used when training each base estimator
    'bootstrap': [True, False],          # Whether samples are drawn with replacement
    'bootstrap_features': [True, False]  # Whether features are drawn with replacement
}

# Perform grid search with 3-fold cross-validation.
# Parallelism is requested only on the outer search (n_jobs=-1 uses all
# available CPU cores): asking the inner BaggingRegressor for every core as
# well nests two worker pools and oversubscribes the machine without fitting
# anything faster. verbose=1 shows progress.
bagging_regressor_grid = GridSearchCV(
    estimator=BaggingRegressor(random_state=1),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit the model on the training data; with the default refit behaviour the
# best candidate is retrained on the full training set afterwards.
bagging_regressor_grid.fit(X_train, Y_train)

# Evaluate and print results.
# best_estimator_ is the refitted winner; best_score_ is that candidate's mean
# cross-validated score, so it is expected to differ from the train/test scores.
print('Train R^2 Score : %.3f' % bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f' % bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f' % bagging_regressor_grid.best_score_)
print('Best Parameters : ', bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.983
Test R^2 Score : 0.805
Best R^2 Score Through Grid Search : 0.871
Best Parameters :  {'bootstrap': True, 'bootstrap_features': False, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50}
