<a href="https://colab.research.google.com/github/Vioron/ds_belhard/blob/main/work05/work05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
!pip install -q catboost xgboost lightgbm

# Импорты

In [60]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
# Install a blanket "ignore" filter so library warnings do not clutter the cell outputs.
warnings.simplefilter("ignore")

# Single seed reused by the train/test split and by every estimator that accepts one.
RANDOM_STATE = 42

# 1) Загрузка датасета

In [70]:
url_xls = "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"

# Read the Excel sheet straight from the network; header=0 means the first row holds the column names.
df = pd.read_excel(url_xls, header=0)

# The raw headers contain doubled and trailing spaces (e.g. "Water  (component 4)...",
# "...megapascals) "). Collapse every whitespace run to one space and trim the ends so
# columns can be addressed by a predictable name.
df.columns = [" ".join(str(col).split()) for col in df.columns]

print("shape:", df.shape)
display(df.head())

# The target column may be named differently in some mirrors, so show the actual names.
print("Columns:", df.columns.tolist())

# In this dataset the target (concrete compressive strength) is the last column;
# everything before it is treated as a feature.
X = df.iloc[:, :-1].copy()
# astype already returns a new Series, so no extra copy is needed.
y = df.iloc[:, -1].astype(float)

# Fail early if the target has gaps: only the features are imputed further down.
assert y.notna().all(), "target column contains missing values"

shape: (1030, 9)


Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


Columns: ['Cement (component 1)(kg in a m^3 mixture)', 'Blast Furnace Slag (component 2)(kg in a m^3 mixture)', 'Fly Ash (component 3)(kg in a m^3 mixture)', 'Water  (component 4)(kg in a m^3 mixture)', 'Superplasticizer (component 5)(kg in a m^3 mixture)', 'Coarse Aggregate  (component 6)(kg in a m^3 mixture)', 'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)', 'Concrete compressive strength(MPa, megapascals) ']


# 2) Простая предобработка: разделение на train/test, импутация и масштабирование числовых признаков

In [62]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Names of all numeric columns in the training frame.
num_cols = list(X_train.select_dtypes(include="number").columns)

# Numeric pipeline: median imputation followed by standardisation.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Apply the numeric pipeline to the numeric columns selected above.
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipeline, num_cols)]
)

# 3) Определяем модели

In [63]:
# Candidate regressors keyed by display name: three linear baselines and three
# gradient-boosting libraries. Every estimator that accepts a seed gets RANDOM_STATE.
models = {
    "Ridge": Ridge(random_state=RANDOM_STATE),
    "Lasso": Lasso(random_state=RANDOM_STATE),
    "BayesianRidge": BayesianRidge(),
    "XGBoost": XGBRegressor(random_state=RANDOM_STATE, objective="reg:squarederror", verbosity=0),
    # Logging is silenced in the constructor, like the other two boosters, so the model
    # stays quiet even when it is fitted without a hyper-parameter grid.
    "LightGBM": LGBMRegressor(random_state=RANDOM_STATE, verbosity=-1),
    "CatBoost": CatBoostRegressor(random_state=RANDOM_STATE, verbose=0)
}

# 4) Сетки гиперпараметров для GridSearch (для тяжелых бустингов даём небольшие сетки)

In [68]:
def _model_params(**options):
    """Return ``options`` with every key prefixed by ``model__``.

    The prefix addresses the pipeline step named "model", which is how the
    training cell registers each estimator.
    """
    return {f"model__{key}": values for key, values in options.items()}


# Hyper-parameter grids for GridSearchCV, keyed by the same names as ``models``.
# The boosting libraries get deliberately small grids.
param_grids = {
    "Ridge": _model_params(alpha=[0.1, 1.0, 10.0]),
    "Lasso": _model_params(alpha=[0.001, 0.01, 0.1]),
    "BayesianRidge": _model_params(
        max_iter=[300, 500],
        tol=[1e-4, 1e-3],
        lambda_1=[1e-6, 1e-4, 1e-2],
        lambda_2=[1e-6, 1e-4, 1e-2],
        alpha_1=[1e-6, 1e-4],
        alpha_2=[1e-6, 1e-4],
    ),
    "XGBoost": _model_params(
        n_estimators=[100, 200],
        max_depth=[3, 6],
        learning_rate=[0.05, 0.1],
    ),
    "LightGBM": _model_params(
        n_estimators=[100, 200],
        max_depth=[-1, 7],
        learning_rate=[0.05, 0.1],
        verbosity=[-1],
    ),
    "CatBoost": _model_params(
        iterations=[200, 400],
        depth=[4, 6],
        learning_rate=[0.03, 0.1],
    ),
}

# 5) Обучаем модели

In [69]:
# Fit every candidate inside its own preprocessing pipeline and keep the resulting
# estimator under the model's display name.
best_models = {}
for name, base_model in models.items():
    print(f"\n>>> Обучение и подбор для: {name}")
    pipe = Pipeline(steps=[("preproc", preprocessor), ("model", base_model)])

    # A missing or empty grid means the pipeline is fitted once, without any search.
    grid_params = param_grids.get(name)
    if not grid_params:
        pipe.fit(X_train, y_train)
        best_models[name] = pipe
        print(" Обучено без GridSearch")
        continue

    # 4-fold grid search scored by negative MSE, using all available cores.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=grid_params,
        scoring="neg_mean_squared_error",
        cv=4,
        n_jobs=-1,
        verbose=0,
    )
    grid.fit(X_train, y_train)
    print(" Лучшие параметры:", grid.best_params_)
    best_models[name] = grid.best_estimator_


>>> Обучение и подбор для: Ridge
 Лучшие параметры: {'model__alpha': 1.0}

>>> Обучение и подбор для: Lasso
 Лучшие параметры: {'model__alpha': 0.001}

>>> Обучение и подбор для: BayesianRidge
 Лучшие параметры: {'model__alpha_1': 0.0001, 'model__alpha_2': 1e-06, 'model__lambda_1': 1e-06, 'model__lambda_2': 0.01, 'model__max_iter': 300, 'model__tol': 0.001}

>>> Обучение и подбор для: XGBoost
 Лучшие параметры: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}

>>> Обучение и подбор для: LightGBM
 Лучшие параметры: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200, 'model__verbosity': -1}

>>> Обучение и подбор для: CatBoost
 Лучшие параметры: {'model__depth': 6, 'model__iterations': 400, 'model__learning_rate': 0.1}


# 6) Оценка на тесте: RMSE, MAE, R2

In [66]:
# Test-set evaluation of every fitted model: RMSE, MAE and R2.
# RMSE is computed as sqrt(MSE); the original note calls this the "compatible" variant,
# presumably to avoid relying on a version-specific RMSE helper.

# Pad names to the longest model name (at least 10 characters, the previous fixed width)
# so the metric columns stay aligned for long names such as "BayesianRidge".
name_width = max([10, *(len(model_name) for model_name in best_models)])

results = []
for name, mdl in best_models.items():
    # Flatten the predictions to 1-D before they are passed to the metric functions.
    y_pred = np.ravel(mdl.predict(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append({"model": name, "RMSE": rmse, "MAE": mae, "R2": r2})
    print(f"{name:<{name_width}s} RMSE={rmse:.4f}  MAE={mae:.4f}  R2={r2:.4f}")

# One row per model, ordered from the lowest (best) RMSE to the highest.
results_df = pd.DataFrame(results, columns=["model", "RMSE", "MAE", "R2"]).sort_values("RMSE")
print("\nИтог (по RMSE, от лучшего к худшему):")
display(results_df.reset_index(drop=True))


Ridge      RMSE=9.7964  MAE=7.7518  R2=0.6276
Lasso      RMSE=9.7967  MAE=7.7460  R2=0.6275
BayesianRidge RMSE=9.7974  MAE=7.7626  R2=0.6275
XGBoost    RMSE=4.8911  MAE=3.5820  R2=0.9072
LightGBM   RMSE=4.4622  MAE=3.0837  R2=0.9227
CatBoost   RMSE=4.2668  MAE=2.7599  R2=0.9293

Итог (по RMSE, от лучшего к худшему):


Unnamed: 0,model,RMSE,MAE,R2
0,CatBoost,4.266828,2.759909,0.929347
1,LightGBM,4.462234,3.083707,0.922728
2,XGBoost,4.891075,3.581967,0.907162
3,Ridge,9.796411,7.751761,0.627564
4,Lasso,9.796663,7.74604,0.627545
5,BayesianRidge,9.79736,7.762594,0.627492
