## Настройки Colab

In [None]:
# Configure the git user identity (needs to be done once per environment)
!git config --global user.email "nabludatellip@gmail.com"
!git config --global user.name "ProninPV"

In [None]:
# Clone the project repository and switch into the cloned directory
!git clone https://github.com/ProninPV/ml-regression_concrete-strength.git
%cd ml-regression_concrete-strength


In [None]:
# From the repository root: disable rebase-on-pull for this repo,
# then pull the `modeling` branch from origin
%cd /content/ml-regression_concrete-strength
!git config pull.rebase false
!git pull origin modeling

In [None]:
# Create the merge commit
!git add .
!git commit -m "Merge remote changes"

# Then push to the `modeling` branch
!git push origin modeling

In [None]:
%cd /content/ml-regression_concrete-strength

# Stage the modeling notebook
!git add notebooks/05_Modeling_colab.ipynb

# Commit it
!git commit -m "Update modeling notebook"

# Then push (note: this cell contains no push command)


In [None]:
%cd /content/ml-regression_concrete-strength

# If there are changes, stage everything and commit
!git add .
!git commit -m "Your commit message"

# Then push (note: this cell contains no push command)


In [None]:
!pip install catboost

## 5.0 Импорты библиотек

In [1]:
import os
import yaml
import logging
import pickle
import numpy as np
import scipy.stats as stats
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import levene
from scipy.stats import ttest_ind
from typing import List, Any, Optional, Tuple, Dict, Union
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time
import psutil
from tqdm import tqdm
import gc

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Widen the notebook area and relax the output-scroll height limit for readability
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell

for css_rule in (
    '<style>.container {width:87% !important;}</style>',
    "<style>.output_scroll {height:auto !important; max-height:10000px !important;}</style>",
):
    display(HTML(css_rule))

# Show the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# pandas display setting: show up to 100 columns
pd.options.display.max_columns = 100

In [5]:
# Plot style applied to every pyplot figure in this notebook
PLOT_STYLE = 'ggplot'
plt.style.use(PLOT_STYLE)

In [6]:
# In Colab the project is cloned into /content/, so the root could also be set directly:
# project_root = Path('/content/ml-regression_concrete-strength')

# Resolve the directory the kernel is currently running in
cwd = Path().resolve()

# Locate the project root: the nearest directory (the current one or any ancestor)
# that contains config/config.yaml. Searching instead of blindly taking `cwd.parent`
# keeps this cell idempotent: because of the os.chdir() below, re-running the cell
# would otherwise climb one directory higher on every execution. It also works when
# the kernel already starts in the project root.
# Falls back to the parent directory (the previous behaviour) if no config is found.
project_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "config" / "config.yaml").is_file()
    ),
    cwd.parent,
)

# Make the project root the working directory
os.chdir(project_root)

# Make the project importable; skip the append when the entry is already present
# so repeated runs do not pile up duplicates in sys.path
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Path of the config file that is loaded below
config_path = project_root / "config" / "config.yaml"
print(f"Looking for config at: {config_path}")

# Project modules (importable only after the sys.path update above)
from src.data import downloader, loader, preprocessor, saving
from src.features import feat_preprocessing
from src.modeling import modeling

# Load the settings from config.yaml
config = loader.load_config(config_path)
print("✅ Config loaded successfully!")

Looking for config at: D:\Skills\Kaggle\ml-regression_concrete-strength\config\config.yaml
✅ Config loaded successfully!


## 5.1. Загрузка данных

In [7]:
# Load the train split
df_train = loader.data_load_preprocessed(config=config, data_type='train')

[⧗] Загружаю данные из: D:\Skills\Kaggle\ml-regression_concrete-strength\data\processed\eda_data_train.pkl
[✓] Данные успешно загружены. Форма: (781, 11)


In [8]:
# Show the first 5 rows of the training dataset
df_train.head(5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength,W/C,Sp/C_pct
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,16.28,0.570745,0.0
1,491.0,26.0,123.0,210.0,3.9,882.0,699.0,56,59.59,0.427699,0.007943
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,13.82,0.7496,0.022
3,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,90,35.76,0.619355,0.0
4,252.1,97.1,75.6,193.8,8.3,835.5,821.4,28,33.4,0.768743,0.032923


In [9]:
# Load the test split
df_test = loader.data_load_preprocessed(config=config, data_type='test')

[⧗] Загружаю данные из: D:\Skills\Kaggle\ml-regression_concrete-strength\data\processed\eda_data_test.pkl
[✓] Данные успешно загружены. Форма: (228, 10)


In [10]:
# Show the first 5 rows of the test dataset
df_test.head(5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,W/C,Sp/C_pct
0,167.4,129.9,128.6,175.5,7.8,1006.3,746.6,28,1.048387,0.046595
1,475.0,118.8,0.0,181.1,8.9,852.1,781.5,7,0.381263,0.018737
2,251.4,0.0,118.3,188.5,6.4,1028.4,757.7,100,0.749801,0.025457
3,307.0,0.0,0.0,193.0,0.0,968.0,812.0,365,0.628664,0.0
4,143.6,0.0,174.9,158.4,17.9,942.7,844.5,28,1.103064,0.124652


## 5.2. Предобработка данных

In [11]:
# Separate the target column from the feature columns
y = df_train['Strength']
y_name = y.name
X = df_train.drop(columns=['Strength'])

In [12]:
# All available outlier-handling strategies (each entry is one combination to evaluate)
all_strategies = [
    ['abnormal'],
    ['combine'],
    ['gost_binar'],
    ['iqr_remove'],
    ['gost_binar', 'combine']
]

results = []

# Types of algorithms used
model_types = ['linear_models',
               'trees_models'
               ]

# Priority list encoding domain knowledge about concrete.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python (previously 'Fine Aggregate' and 'Has_Slag' were merged
# into the single bogus name 'Fine AggregateHas_Slag').
DOMAIN_PRIORITY_LIST = [
    'Water',                 # Critically affects strength
    'W/C',
    'Cement',                # Main binder component
    'Age',                   # Important technological parameter
    'Superplasticizer',      # Chemical admixture
    'Fly Ash',               # Mineral admixture
    'Blast Furnace Slag',    # Mineral admixture
    'Coarse Aggregate',      # Coarse aggregate
    'Fine Aggregate',        # Fine aggregate (marked "least important" in the original note)
    'Has_Slag',              # Slag present in the mix (0/1)
    'Has_FlyAsh',            # Fly ash present in the mix (0/1)
    'Has_Superplasticizer',  # Superplasticizer present in the mix (0/1)
    'Low_WC_ratio',          # Low W/C ratio (adaptive threshold)
    'High_WC_ratio',         # High W/C ratio (adaptive threshold)
    'Low_WC_tech',           # Low W/C ratio (< 0.4)
    'High_WC_tech',          # High W/C ratio (> 0.6)
]

# Configuration for the feature transformations
feature_config = {
    'trend_settings': {
        'names': ['Linear', 'Log', 'Sqrt']  # Only the safe ones
    }
}

In [13]:
# Models to evaluate, keyed by display name.
# Uncomment an entry to include that estimator in the experiments.
models = {}

# Trees
# models["RandomForestRegressor"] = RandomForestRegressor(n_estimators=100, random_state=42)
# models["DecisionTreeRegressor"] = DecisionTreeRegressor(random_state=42)

# Boosting
# models["CatBoostRegressor"] = CatBoostRegressor(iterations=100, random_seed=42, verbose=False)
# models["XGBRegressor"] = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
# models["HistGradientBoosting"] = HistGradientBoostingRegressor(max_iter=100, random_state=42)
# models["GradientBoosting"] = GradientBoostingRegressor(n_estimators=100, random_state=42)
# models["LGBMRegressor"] = LGBMRegressor(n_estimators=100, random_state=42, verbosity=-1)

# Linear
models["LinearRegression"] = LinearRegression()
# models["Ridge"] = Ridge()
# models["Lasso"] = Lasso(alpha=1.0, random_state=42)
# models["ElasticNet"] = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)

## 5.3. Baseline

In [14]:
# Train the models with the different strategies and display the resulting report
experiment_kwargs = dict(
    X=X,
    y=y,
    all_strategies=all_strategies,
    model_types=model_types,
    feature_config=feature_config,
    models=models,
    config=config,
    y_name=y_name,
)
modeling_result = modeling.run_experiments(**experiment_kwargs)
modeling_result

Running experiments: 100%|███████████████████████████████████████████████████████████████████| 10/10 [00:56<00:00,  5.67s/model, model=LinearRegression, time=8.54s, memory=0.17MB]


Unnamed: 0,outlier_strategy,model_name,model_type,dataset_size,mean_rmse,std_rmse,training_time_sec,memory_used_mb
0,['abnormal'],LinearRegression,linear_models,770,6.8253,0.3264,3.78,2.02
1,['abnormal'],LinearRegression,trees_models,770,6.8281,0.3032,4.56,0.5
2,['combine'],LinearRegression,linear_models,770,6.8368,0.3335,3.69,0.02
3,['combine'],LinearRegression,trees_models,770,6.8385,0.3127,5.63,0.04
4,['gost_binar'],LinearRegression,linear_models,781,6.5861,0.3865,3.89,0.08
5,['gost_binar'],LinearRegression,trees_models,781,6.585,0.3709,8.3,0.2
6,['iqr_remove'],LinearRegression,linear_models,691,6.396,0.6418,4.9,0.03
7,['iqr_remove'],LinearRegression,trees_models,691,6.3816,0.6313,6.66,0.0
8,"['gost_binar', 'combine']",LinearRegression,linear_models,770,6.4902,0.384,4.62,0.26
9,"['gost_binar', 'combine']",LinearRegression,trees_models,770,6.504,0.377,8.54,0.17


## 5.4. Сохранение результатов

In [15]:
import ast

# Determine the best strategy from the experiment results
best_strategy = modeling.get_best_model_strategy(modeling_result)

best_model = best_strategy['model']
best_model_type = best_strategy['model_type']
best_rmse = best_strategy['rmse']
# 'outlier_strategy' comes back as the string form of a list (e.g. "['iqr_remove']").
# ast.literal_eval parses Python literals only, so unlike eval() it cannot execute
# arbitrary code that might end up in the results table.
best_outlier_strategy = ast.literal_eval(best_strategy['outlier_strategy'])

In [16]:
# Display the selected best-strategy record
best_strategy

{'model': 'LinearRegression',
 'model_type': 'trees_models',
 'outlier_strategy': "['iqr_remove']",
 'rmse': np.float64(6.3816)}

In [17]:
# Save the report on the training experiments
modeling.save_sorted_modeling_report(config, modeling_result)

Отчет сохранен: D:\Skills\Kaggle\ml-regression_concrete-strength\models\modeling_report\modeling_experiments_20251115_142436.csv


In [18]:
# Save the pipeline built with the best data-preprocessing strategies
modeling.save_best_pipeline(best_model,
                            best_model_type,
                            best_outlier_strategy,
                            models,
                            modeling_result,
                            feature_config,
                            config,
                            y_name)

Пайплайн сохранен: D:\Skills\Kaggle\ml-regression_concrete-strength\models\pipelines\best_pipeline_LinearRegression.pkl
Метаданные сохранены: D:\Skills\Kaggle\ml-regression_concrete-strength\models\pipelines\pipeline_metadata_LinearRegression.pkl
Лучшая модель: LinearRegression, RMSE: 6.3816


In [19]:
# Display the best outlier-handling strategy
best_outlier_strategy

['iqr_remove']

In [20]:
# Build the training dataset after outlier handling with the best strategy
outlier_handler = feat_preprocessing.OutlierHandler(
    target_col=y_name,
    config=config,
    strategies=best_outlier_strategy,
)

# fit_transform returns the processed features together with the matching target
processed = outlier_handler.fit_transform(X, y)
df_train_outliers, y_train_outliers = processed

In [21]:
# Display the training features after outlier handling
df_train_outliers

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,W/C,Sp/C_pct
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,0.570745,0.000000
1,491.0,26.0,123.0,210.0,3.9,882.0,699.0,56,0.427699,0.007943
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,0.749600,0.022000
3,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,90,0.619355,0.000000
4,252.1,97.1,75.6,193.8,8.3,835.5,821.4,28,0.768743,0.032923
...,...,...,...,...,...,...,...,...,...,...
793,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,3,0.619355,0.000000
794,213.5,0.0,174.2,159.2,11.7,1043.6,771.9,100,0.745667,0.054801
795,304.8,0.0,99.6,196.0,9.8,959.4,705.2,28,0.643045,0.032152
797,288.0,192.0,0.0,192.0,0.0,932.0,717.8,28,0.666667,0.000000


In [22]:
# Save the outlier-handled training features
# (original note: stored in several formats, with the file paths written to config —
#  TODO confirm against saving.save_preprocessed_data)
saving.save_preprocessed_data(df_train_outliers, config)

In [23]:
# Save the matching training target (not the feature frame)
# (presumably also stored in several formats with paths written to config —
#  TODO confirm against saving.save_preprocessed_target)
saving.save_preprocessed_target(y_train_outliers, config)