## Основные характеристики датасета

MedInc - медианный доход в районе

HouseAge - медианный возраст домов в районе

AveRooms - среднее количество комнат на домохозяйство

AveBedrms - среднее количество спален на домохозяйство

Population - население района

AveOccup - среднее количество жителей на домохозяйство

Latitude - географическая широта района

Longitude - географическая долгота района

MedHouseVal - медианная стоимость домов в районе в сотнях тысяч долларов (целевая переменная; в коде ниже этот столбец называется `target`)

## Подготовка

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Fixed seed passed to the train/test split and the ensemble estimators below
RANDOM_STATE = 123

In [None]:
# Load the dataset and assemble the features plus the target into a single
# DataFrame; the target is stored as the last column, named 'target'.
california = fetch_california_housing()
data = pd.DataFrame(california['data'], columns=california['feature_names'])
data['target'] = california['target']

In [None]:
# Preview the first rows of the assembled frame
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Count missing values per column
data.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


## Разделение на выборки

In [None]:
# Features are every column except 'target' (which is the last column);
# the target itself becomes y.
X = data.drop(columns='target')
y = data['target']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Scaling: the scaler's statistics are learned from the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Обучение моделей

In [None]:
# Bagging: pick the number of base estimators with a 3-fold grid search
bagging_params = {'n_estimators': [5, 10, 50, 100]}

grid_search = GridSearchCV(
    estimator=BaggingRegressor(random_state=RANDOM_STATE),
    param_grid=bagging_params,
    cv=3,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# Keep the estimator fitted with the winning parameters for evaluation later
bagging = grid_search.best_estimator_

Лучшие параметры: {'n_estimators': 100}


In [None]:
# Random forest: pick the number of trees with a 3-fold grid search
tree_params = {'n_estimators': [50, 100, 150, 200]}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid=tree_params,
    cv=3,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# Keep the estimator fitted with the winning parameters for evaluation later
random_forest = grid_search.best_estimator_

Лучшие параметры: {'n_estimators': 200}


In [None]:
# AdaBoost over an explicitly supplied DecisionTreeRegressor with default settings
# (no max_depth given); the number of boosting rounds is chosen by a 3-fold grid search.
adaboost_params = {'n_estimators': [30, 50, 100, 150]}

grid_search = GridSearchCV(
    estimator=AdaBoostRegressor(
        estimator=DecisionTreeRegressor(),
        random_state=RANDOM_STATE,
    ),
    param_grid=adaboost_params,
    cv=3,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# Keep the estimator fitted with the winning parameters for evaluation later
adaboost = grid_search.best_estimator_

Лучшие параметры: {'n_estimators': 150}


In [None]:
# AdaBoost with its default base estimator (no `estimator` argument passed),
# searched over the same `adaboost_params` grid as the previous AdaBoost run.
grid_search = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=RANDOM_STATE),
    param_grid=adaboost_params,
    cv=3,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# Keep the estimator fitted with the winning parameters for evaluation later
adaboost_limited_tree_depth = grid_search.best_estimator_

Лучшие параметры: {'n_estimators': 30}


In [None]:
# Gradient boosting: pick the number of boosting stages with a 3-fold grid search
gradient_params = {'n_estimators': [50, 100, 150, 200]}

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=RANDOM_STATE),
    param_grid=gradient_params,
    cv=3,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Лучшие параметры: {grid_search.best_params_}")
# Keep the estimator fitted with the winning parameters for evaluation later
gradient_boosting = grid_search.best_estimator_

Лучшие параметры: {'n_estimators': 200}



## Оценка моделей

In [None]:
# Predict on the scaled test set with each tuned model, in a fixed order that
# matches the names on the left-hand side.
y_pred_bagging, y_pred_rf, y_pred_adaboost, y_pred_adaboost_limited, y_pred_gb = (
    fitted_model.predict(X_test_scaled)
    for fitted_model in (
        bagging,
        random_forest,
        adaboost,
        adaboost_limited_tree_depth,
        gradient_boosting,
    )
)

In [None]:
# MAE on the test set for every model (lower is better)
for label, y_pred in [
    ("Bagging", y_pred_bagging),
    ("Random Forest", y_pred_rf),
    ("AdaBoost", y_pred_adaboost),
    ("AdaBoost (tree depth = 3)", y_pred_adaboost_limited),
    ("Gradient Boosting", y_pred_gb),
]:
    print(f"{label}: {mean_absolute_error(y_test, y_pred):.4f}")

Bagging: 0.3243
Random Forest: 0.3223
AdaBoost: 0.2902
AdaBoost (tree depth = 3): 0.7656
Gradient Boosting: 0.3376


In [None]:
# R^2 on the test set for every model (higher is better)
for label, y_pred in [
    ("Bagging", y_pred_bagging),
    ("Random Forest", y_pred_rf),
    ("AdaBoost", y_pred_adaboost),
    ("AdaBoost (tree depth = 3)", y_pred_adaboost_limited),
    ("Gradient Boosting", y_pred_gb),
]:
    print(f"{label}: {r2_score(y_test, y_pred):.4f}")

Bagging: 0.8127
Random Forest: 0.8142
AdaBoost: 0.8320
AdaBoost (tree depth = 3): 0.4136
Gradient Boosting: 0.8178


## Вывод

На основе анализа метрик MAE и R² лучшей моделью для данной задачи является AdaBoost с базовым DecisionTreeRegressor без ограничения глубины: на тестовой выборке она показала наименьшую среднюю абсолютную ошибку (0.2902) и наибольшее значение коэффициента детерминации (0.8320).

Однако по умолчанию в AdaBoost используется DecisionTreeRegressor с ограничением глубины дерева (max_depth=3), что сильно ухудшает результаты: MAE 0.7656 и R² 0.4136.

Остальные модели (Bagging, Random Forest и Gradient Boosting) показали примерно одинаковый результат: MAE около 0.32–0.34, R² около 0.81–0.82.