In [1]:
from sklearn.datasets import _california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor , HistGradientBoostingRegressor

In [2]:
housing = _california_housing.fetch_california_housing()

X = housing.data
y = housing.target
print(X.shape, y.shape)
print(housing.feature_names)
print(X[0])
print(y[0])

stand = StandardScaler()
X = stand.fit_transform(X)
print(X.shape, y.shape)
print(housing.feature_names)
print(X[0])
print(y[0])

(20640, 8) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526
(20640, 8) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
[ 2.34476576  0.98214266  0.62855945 -0.15375759 -0.9744286  -0.04959654
  1.05254828 -1.32783522]
4.526


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=432) 
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape) 
# (16512, 8) (4128, 8)
# (16512,) (4128,)
print(X_train[0])
print(y_train[0])


(16512, 8) (4128, 8)
(16512,) (4128,)
[-0.90877623  1.85618152 -0.59824914 -0.12606755 -0.44547977 -0.04373067
  0.79972754 -1.15313947]
1.889


In [4]:
LR = LinearRegression()
RF = RandomForestRegressor(n_estimators=100,n_jobs=-1)
GB = HistGradientBoostingRegressor()
for i in [RF,GB,LR]:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(i,"R2 score:", r2)

RandomForestRegressor(n_jobs=-1) R2 score: 0.8137391568770248
HistGradientBoostingRegressor() R2 score: 0.834831062696736
LinearRegression() R2 score: 0.6080229586580415


In [5]:
for j in [0.1,0.01,0.001]:
    for i in range(100,500,50):
        model = HistGradientBoostingRegressor(
            max_iter=i,
            learning_rate=j,
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(f"Max Iter: {i}, R2 score: {r2}")

Max Iter: 100, R2 score: 0.8346239865809286
Max Iter: 150, R2 score: 0.8422578651975886
Max Iter: 200, R2 score: 0.8426312592370117
Max Iter: 250, R2 score: 0.8447595290427172
Max Iter: 300, R2 score: 0.8432822808521558
Max Iter: 350, R2 score: 0.8485367781036859
Max Iter: 400, R2 score: 0.8419303899239382
Max Iter: 450, R2 score: 0.8432126036646933
Max Iter: 100, R2 score: 0.6156882619725507
Max Iter: 150, R2 score: 0.6986585117591486
Max Iter: 200, R2 score: 0.7436762437755471
Max Iter: 250, R2 score: 0.7711843824447855
Max Iter: 300, R2 score: 0.7883606114686348
Max Iter: 350, R2 score: 0.8015676754495111
Max Iter: 400, R2 score: 0.8129298500856872
Max Iter: 450, R2 score: 0.8160053709002189
Max Iter: 100, R2 score: 0.11937450745889844
Max Iter: 150, R2 score: 0.1692181997486557
Max Iter: 200, R2 score: 0.21837197944162046
Max Iter: 250, R2 score: 0.25984504560160704
Max Iter: 300, R2 score: 0.30147663205147457
Max Iter: 350, R2 score: 0.33742294122293937
Max Iter: 400, R2 score: 0.

In [8]:
model = HistGradientBoostingRegressor(
    max_iter=350,
    learning_rate=0.1,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Final Model - Max Iter: 200, R2 score: {r2}")

Final Model - Max Iter: 200, R2 score: 0.8430127935733467


In [10]:
import joblib
joblib.dump(model, 'california_housing_model_v2.pkl')

['california_housing_model_v2.pkl']