In [19]:
import np as np
import pandas as pd
from sklearn.datasets import _california_housing
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

data, target = _california_housing.fetch_california_housing(as_frame=True, return_X_y=True)

In [20]:
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.2, random_state=69)

# Base

In [90]:
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor

model = AdaBoostRegressor(random_state=69, learning_rate=1,n_estimators=100, base_estimator=HistGradientBoostingRegressor(learning_rate=0.1, max_iter=1000, max_depth=8, warm_start=True, max_leaf_nodes=None, max_bins=255))

_ = model.fit(data_train, target_train)

KeyboardInterrupt: 

In [None]:
print("R2 : "+str(model.score(data_test, target_test)*100))

## Search

In [22]:
def modelfit(alg):
    alg.fit(data_train, target_train)
    print("R2 : "+str(alg.score(data_test, target_test)*100))
    return alg.score(data_test, target_test)*100

### HistGradientBoost

In [34]:
model = HistGradientBoostingRegressor(
    random_state=69,
)

modelfit(model)

R2 : 82.88415603200865


82.88415603200865

In [36]:
from sklearn.model_selection import GridSearchCV

model = HistGradientBoostingRegressor(
    random_state=69,
    min_samples_leaf=20,
    max_depth=12,
    learning_rate=0.1,
)


parameters = {
    'max_iter':range(10,501,10),
}

clf = GridSearchCV(estimator=model, param_grid=parameters)

_ = clf.fit(data_train, target_train)
print(clf.best_params_)
print(clf.score(data_test, target_test)*100)

{'max_iter': 190}
83.84447277375038


In [39]:
from sklearn.model_selection import GridSearchCV

model = HistGradientBoostingRegressor(
    random_state=69,
    max_iter=190,
    learning_rate=0.1,
)


parameters = {
    'min_samples_leaf':range(10, 121, 10),
    'max_depth':range(10,21,2),
}

clf = GridSearchCV(estimator=model, param_grid=parameters)

_ = clf.fit(data_train, target_train)
print(clf.best_params_)
print(clf.score(data_test, target_test)*100)

{'max_depth': 12, 'min_samples_leaf': 50}
83.44065284718931


In [41]:
from sklearn.model_selection import GridSearchCV
import numpy as np

model = HistGradientBoostingRegressor(
    random_state=69,
    max_depth=12,
    min_samples_leaf=50,
    max_iter=190,
    learning_rate=0.1,
    warm_start=True
)


modelfit(model)

r = np.absolute(model.predict(data) - target)*100

print(str(np.mean(r)) + " +- " + str(np.std(r)))

R2 : 83.44065284718931
27.091312309030013 +- 30.137617107296307


### Ada Boost

In [42]:
from sklearn.ensemble import AdaBoostRegressor

m = AdaBoostRegressor(random_state=69
                      , base_estimator=model)
modelfit(m)

R2 : 83.73650435630721


83.73650435630721

In [45]:
m = AdaBoostRegressor(random_state=69,
                      base_estimator=model,
                      learning_rate=0.1)

parameters = {
    'n_estimators':range(40, 51, 2),
    'learning_rate':[0.1, 1],
}

clf = GridSearchCV(estimator=m, param_grid=parameters)

_ = clf.fit(data_train, target_train)
print(clf.best_params_)
print(clf.score(data_test, target_test)*100)

{'learning_rate': 0.1, 'n_estimators': 46}
84.33818618668829


In [17]:
m = AdaBoostRegressor(random_state=69,
                      base_estimator=model,
                      learning_rate=0.1,
                      n_estimators=22)

parameters = {
    'loss':["linear", "square", "exponential"],
}

clf = GridSearchCV(estimator=m, param_grid=parameters)

_ = clf.fit(data_train, target_train)
print(clf.best_params_)
print(clf.score(data_test, target_test)*100)

{'loss': 'linear'}
86.78349774145171


In [57]:
m = AdaBoostRegressor(random_state=69, learning_rate=1,n_estimators=115, base_estimator=HistGradientBoostingRegressor(learning_rate=0.1, max_iter=1000, max_depth=7, warm_start=True, max_leaf_nodes=60, random_state=69,  max_bins=255))

modelfit(m)

R2 : 85.1542366577762


85.1542366577762

In [58]:
import joblib
from joblib import dump, load
with open("IA/851542.joblib", 'wb') as fo:
    joblib.dump(m, fo)

In [7]:
import joblib

with open("IA/872215.joblib", 'rb') as fo:
    clf = joblib.load(fo)

In [55]:
r = np.absolute(m.predict(data) - target)*100

print(str(np.mean(r)) + " +- " + str(np.std(r)))

11.719514729009163 +- 17.784543854053656


In [8]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [16]:
import pandas as pd

clf.predict(pd.DataFrame(data={"MedInc": [8.3252], "HouseAge": [41], "AveRooms": [6.984127], "AveBedrms": [1.023810], "Population": [322], "AveOccup": [2.555556], "Latitude": [37.88], "Longitude": [-122.23]}))

array([4.42281801])