In [27]:
from sklearn.datasets import _california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor , HistGradientBoostingRegressor

In [28]:
housing = _california_housing.fetch_california_housing()

X = housing.data
y = housing.target
print(X.shape, y.shape)
print(housing.feature_names)
print(X[0])
print(y[0])

poly = PolynomialFeatures()
X = poly.fit_transform(X)
print(X.shape, y.shape)
print(housing.feature_names)
print(X[0])
print(y[0])

(20640, 8) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526
(20640, 45) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[ 1.00000000e+00  8.32520000e+00  4.10000000e+01  6.98412698e+00
  1.02380952e+00  3.22000000e+02  2.55555556e+00  3.78800000e+01
 -1.22230000e+02  6.93089550e+01  3.41333200e+02  5.81442540e+01
  8.52341905e+00  2.68071440e+03  2.12755111e+01  3.15358576e+02
 -1.01758920e+03  1.68100000e+03  2.86349206e+02  4.19761905e+01
  1.32020000e+04  1.04777778e+02  1.55308000e+03 -5.01143000e+03
  4.87780297e+01  7.15041572e+00  2.24888889e+03  1.78483245e+01
  2.64558730e+02 -8.53669841e+02  1.04818594e+00  3.29666667e+02
  2.61640212e+00  3.87819048e+01 -1.25140238e+02  1.03684000e+05
  8.22888889e+02  1.21973600e+04 -3.93580600e+04  6.53086420e+0

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=432) 
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape) 
# (16512, 8) (4128, 8)
# (16512,) (4128,)
print(X_train[0])
print(y_train[0])


(16512, 45) (4128, 45)
(16512,) (4128,)
[ 1.00000000e+00  2.14420000e+00  5.20000000e+01  3.94886364e+00
  1.03693182e+00  9.21000000e+02  2.61647727e+00  3.73400000e+01
 -1.21880000e+02  4.59759364e+00  1.11498400e+02  8.46715341e+00
  2.22338920e+00  1.97480820e+03  5.61025057e+00  8.00644280e+01
 -2.61335096e+02  2.70400000e+03  2.05340909e+02  5.39204545e+01
  4.78920000e+04  1.36056818e+02  1.94168000e+03 -6.33776000e+03
  1.55935240e+01  4.09470235e+00  3.63690341e+03  1.03321120e+01
  1.47450568e+02 -4.81287500e+02  1.07522760e+00  9.55014205e+02
  2.71310854e+00  3.87190341e+01 -1.26381250e+02  8.48241000e+05
  2.40977557e+03  3.43901400e+04 -1.12251480e+05  6.84595332e+00
  9.76992614e+01 -3.18896250e+02  1.39427560e+03 -4.55099920e+03
  1.48547344e+04]
1.889


In [31]:
LR = LinearRegression()
RF = RandomForestRegressor(n_estimators=100,n_jobs=-1)
GB = HistGradientBoostingRegressor()
for i in [RF,GB,LR]:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(i,"R2 score:", r2)

RandomForestRegressor(n_jobs=-1) R2 score: 0.8029138208552343
HistGradientBoostingRegressor() R2 score: 0.8373985228190206
LinearRegression() R2 score: 0.6610240200007315


In [37]:
for j in [0.1,0.01,0.001]:
    for i in range(100,500,50):
        model = HistGradientBoostingRegressor(
            max_iter=i,
            learning_rate=j,
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(f"Max Iter: {i}, R2 score: {r2}")

Max Iter: 100, R2 score: 0.8368068273373815
Max Iter: 150, R2 score: 0.8399068572907876
Max Iter: 200, R2 score: 0.843357237529702
Max Iter: 250, R2 score: 0.8486422309605395
Max Iter: 300, R2 score: 0.8412317551172275
Max Iter: 350, R2 score: 0.8405029307414225
Max Iter: 400, R2 score: 0.8474458081363965
Max Iter: 450, R2 score: 0.8423465584155829
Max Iter: 100, R2 score: 0.6175243415898763
Max Iter: 150, R2 score: 0.7069081625865046
Max Iter: 200, R2 score: 0.7503764985327077
Max Iter: 250, R2 score: 0.7793163279585917
Max Iter: 300, R2 score: 0.7949858981042686
Max Iter: 350, R2 score: 0.8078734397726941
Max Iter: 400, R2 score: 0.8161502079646676
Max Iter: 450, R2 score: 0.819231288260803
Max Iter: 100, R2 score: 0.11995179939287626
Max Iter: 150, R2 score: 0.17188766365790809
Max Iter: 200, R2 score: 0.22085870410870323
Max Iter: 250, R2 score: 0.26317003382166715
Max Iter: 300, R2 score: 0.3034149177134451
Max Iter: 350, R2 score: 0.3406481756574461
Max Iter: 400, R2 score: 0.373

In [39]:
model = HistGradientBoostingRegressor(
    max_iter=200,
    learning_rate=0.1,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Final Model - Max Iter: 200, R2 score: {r2}")

Final Model - Max Iter: 200, R2 score: 0.8421140374877238


In [40]:
import joblib
joblib.dump(model, 'california_housing_model.pkl')

['california_housing_model.pkl']