In [1]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error

In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

**Data Set Characteristics**

    Number of Instances: 20640

    Number of Attributes: 8 numeric, predictive attributes and the target

    Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    Missing Attribute Values: None

    Target:
      - MedHouseVal     median house value in block group

Here's the dataset in tabular format.

In [3]:
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df[housing.target_names[0]] = housing.target

df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [4]:
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0
)
regressor.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [7]:
errors = [mean_squared_error(y_test, y_pred) for y_pred in regressor.staged_predict(X_test)]
best_n_estimators = np.argmin(errors)

In [8]:
best_regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=best_n_estimators,
    learning_rate=1.0
)
best_regressor.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=2)

In [9]:
y_pred = best_regressor.predict(X_test)

mean_absolute_error(y_test, y_pred)

0.5991938794272984

In [10]:
comparison_table = pd.DataFrame(y_test, columns= ['Actual Values'])  

comparison_table["Predictions"] = y_pred

comparison_table

Unnamed: 0,Actual Values,Predictions
17053,3.088,2.828144
6842,1.839,1.274916
6797,2.178,1.274916
11650,3.159,2.828144
13363,1.813,1.274916
...,...,...
10266,2.269,2.828144
18745,1.001,1.598990
6984,2.197,2.004117
311,0.857,1.274916
