## Настройки Colab

In [1]:
# Git user identity setup (needs to be done once per environment);
# --global writes it to the user-level git config
!git config --global user.email "nabludatellip@gmail.com"
!git config --global user.name "ProninPV"

In [2]:
# Clone the project repository and make the clone the working directory
!git clone https://github.com/ProninPV/ml-regression_concrete-strength.git
%cd ml-regression_concrete-strength


Cloning into 'ml-regression_concrete-strength'...
remote: Enumerating objects: 107, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 107 (delta 21), reused 104 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (107/107), 1.26 MiB | 7.52 MiB/s, done.
Resolving deltas: 100% (21/21), done.
/content/ml-regression_concrete-strength


In [None]:
# Update the local clone from the remote 'modeling' branch;
# pull.rebase=false makes `git pull` merge instead of rebase when histories diverge
%cd /content/ml-regression_concrete-strength
!git config pull.rebase false
!git pull origin modeling

In [None]:
# Create the merge commit (stages every change in the working tree)
!git add .
!git commit -m "Merge remote changes"

# Then push the 'modeling' branch to origin
!git push origin modeling

In [None]:
%cd /content/ml-regression_concrete-strength

# Stage the modeling notebook only
!git add notebooks/05_Modeling_colab.ipynb

# Commit it
!git commit -m "Update modeling notebook"

# Then push (note: this cell contains no push command)


In [None]:
%cd /content/ml-regression_concrete-strength

# If there are changes, stage everything and commit
!git add .
!git commit -m "Your commit message"

# Then push (note: this cell contains no push command)


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


## 6.0 Импорты библиотек

In [1]:
import os
import yaml
import logging
import pickle
import numpy as np
import scipy.stats as stats
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import levene
from scipy.stats import ttest_ind
from typing import List, Any, Optional, Tuple, Dict, Union
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time
import psutil
from tqdm import tqdm
import gc

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Widen the notebook area and lift the output-scroll height limit for convenience
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML, display

for css_rule in (
    '<style>.container {width:87% !important;}</style>',
    "<style>.output_scroll {height:auto !important; max-height:10000px !important;}</style>",
):
    display(HTML(css_rule))

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# pandas display setting: show up to 100 columns when rendering a DataFrame
pd.options.display.max_columns = 100

In [5]:
# Use the 'ggplot' style sheet for all matplotlib figures in this notebook
plt.style.use(style='ggplot')

In [11]:
# In Colab the project is cloned into /content/; to pin the root explicitly use:
# project_root = Path('/content/ml-regression_concrete-strength')

# Locate the project root: the nearest directory, starting at the current working
# directory and walking upwards, that contains config/config.yaml.
# This cell changes the working directory below, so taking `cwd.parent`
# unconditionally would climb one level further on every re-run; searching for the
# config file keeps the cell idempotent and also works when the notebook is already
# started from the project root.
cwd = Path().resolve()
project_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "config" / "config.yaml").is_file()
    ),
    cwd.parent,  # fallback: assume the notebook lives one level below the root
)

# Make the project root the working directory
os.chdir(project_root)

# Make the project's packages importable (skip if a previous run already added it)
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Path of the config file that is loaded below
config_path = project_root / "config" / "config.yaml"
print(f"Looking for config at: {config_path}")

# Project modules; they become importable only after the sys.path change above,
# which is why these imports are not in the notebook's top import cell
from src.data import downloader, loader, preprocessor, saving
from src.features import feat_preprocessing
from src.modeling import modeling

# Load the settings from config.yaml
config = loader.load_config(config_path)
print("✅ Config loaded successfully!")

Looking for config at: D:\Skills\Kaggle\ml-regression_concrete-strength\config\config.yaml
✅ Config loaded successfully!


## 6.1. Загрузка данных

In [12]:
# Load the training split through the project loader, using the loaded config
df_train = loader.data_load_preprocessed(config=config, data_type='train')

[⧗] Загружаю данные из: D:\Skills\Kaggle\ml-regression_concrete-strength\data\processed\eda_data_train.pkl
[✓] Данные успешно загружены. Форма: (781, 11)


In [13]:
# Preview the first 5 rows of the training dataset
df_train.head(n=5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength,W/C,Sp/C_pct
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,16.28,0.570745,0.0
1,491.0,26.0,123.0,210.0,3.9,882.0,699.0,56,59.59,0.427699,0.007943
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,13.82,0.7496,0.022
3,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,90,35.76,0.619355,0.0
4,252.1,97.1,75.6,193.8,8.3,835.5,821.4,28,33.4,0.768743,0.032923


In [14]:
# Load the test split through the project loader, using the loaded config
df_test = loader.data_load_preprocessed(
    config=config,
    data_type='test',
)

[⧗] Загружаю данные из: D:\Skills\Kaggle\ml-regression_concrete-strength\data\processed\eda_data_test.pkl
[✓] Данные успешно загружены. Форма: (228, 10)


In [15]:
# Preview the first 5 rows of the test dataset
df_test.head(n=5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,W/C,Sp/C_pct
0,167.4,129.9,128.6,175.5,7.8,1006.3,746.6,28,1.048387,0.046595
1,475.0,118.8,0.0,181.1,8.9,852.1,781.5,7,0.381263,0.018737
2,251.4,0.0,118.3,188.5,6.4,1028.4,757.7,100,0.749801,0.025457
3,307.0,0.0,0.0,193.0,0.0,968.0,812.0,365,0.628664,0.0
4,143.6,0.0,174.9,158.4,17.9,942.7,844.5,28,1.103064,0.124652


## 6.2. Предобработка данных

In [14]:
# Separate the target column from the feature matrix
y = df_train['Strength']
y_name = y.name  # target column label, kept for later use
X = df_train.drop(columns=y_name)

## 6.3. Подбор гиперпараметров для CatBoost

#### Объявление функций

#### Тюнинг CatBoost

## 6.4. Подбор гиперпараметров для

## Отправка на Github

In [32]:
from getpass import getpass

# 1. Read the token through getpass, so the value is not hardcoded in the notebook
#    (the recorded cell output shows the input masked)
GITHUB_TOKEN = getpass('Введите ваш GitHub Personal Access Token: ')

Введите ваш GitHub Personal Access Token: ··········


In [33]:
# Show the current branch and the state of the working tree
!git status

On branch modeling
Your branch is up to date with 'origin/modeling'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   catboost_info/catboost_training.json[m
	[31mmodified:   catboost_info/learn/events.out.tfevents[m
	[31mmodified:   catboost_info/time_left.tsv[m
	[31mmodified:   data/processed/data_train_outliers.csv[m
	[31mmodified:   data/processed/data_train_outliers.parquet[m
	[31mmodified:   data/processed/data_train_outliers.pkl[m
	[31mmodified:   data/processed/y_train_outliers.pkl[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mmodels/modeling_report/modeling_experiments_20251115_121552.csv[m
	[31mmodels/pipelines/best_pipeline_CatBoostRegressor.pkl[m
	[31mmodels/pipelines/pipeline_metadata_CatBoostRegressor.pkl[m

no changes added to commit (use "git add" and/or "git commit -a")


In [34]:
# 1. Stage all changes in the working tree
!git add .

# 2. Commit them
!git commit -m "feat: Colab experiment with data preprocessing strategies - Train multiple models using different data preprocessing pipelines - Save the best performing pipeline and experiment report"

[modeling a511430] feat: Colab experiment with data preprocessing strategies - Train multiple models using different data preprocessing pipelines - Save the best performing pipeline and experiment report
 10 files changed, 395 insertions(+), 205 deletions(-)
 rewrite catboost_info/catboost_training.json (97%)
 rewrite catboost_info/learn/events.out.tfevents (95%)
 rewrite catboost_info/time_left.tsv (97%)
 rewrite data/processed/data_train_outliers.parquet (78%)
 rewrite data/processed/data_train_outliers.pkl (90%)
 rewrite data/processed/y_train_outliers.pkl (76%)
 create mode 100644 models/modeling_report/modeling_experiments_20251115_121552.csv
 create mode 100644 models/pipelines/best_pipeline_CatBoostRegressor.pkl
 create mode 100644 models/pipelines/pipeline_metadata_CatBoostRegressor.pkl


In [35]:
# Push the 'modeling' branch over HTTPS; {GITHUB_TOKEN} is interpolated by IPython
# from the Python variable of that name.
# NOTE(review): the token becomes part of the shell command line / remote URL, so it
# may show up in git error output — check this cell's output before sharing or
# committing the notebook.
!git push https://{GITHUB_TOKEN}@github.com/ProninPV/ml-regression_concrete-strength.git modeling

Enumerating objects: 33, done.
Counting objects:   3% (1/33)Counting objects:   6% (2/33)Counting objects:   9% (3/33)Counting objects:  12% (4/33)Counting objects:  15% (5/33)Counting objects:  18% (6/33)Counting objects:  21% (7/33)Counting objects:  24% (8/33)Counting objects:  27% (9/33)Counting objects:  30% (10/33)Counting objects:  33% (11/33)Counting objects:  36% (12/33)Counting objects:  39% (13/33)Counting objects:  42% (14/33)Counting objects:  45% (15/33)Counting objects:  48% (16/33)Counting objects:  51% (17/33)Counting objects:  54% (18/33)Counting objects:  57% (19/33)Counting objects:  60% (20/33)Counting objects:  63% (21/33)Counting objects:  66% (22/33)Counting objects:  69% (23/33)Counting objects:  72% (24/33)Counting objects:  75% (25/33)Counting objects:  78% (26/33)Counting objects:  81% (27/33)Counting objects:  84% (28/33)Counting objects:  87% (29/33)Counting objects:  90% (30/33)Counting objects:  93% (31/33)Counting objects: