## 3.0. Импорты библиотек

In [11]:
import os
import yaml
import logging
import numpy as np
import scipy.stats as stats
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Any, Optional, Tuple, Dict
from datetime import datetime
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr, kurtosis, skew
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Silence every Python warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Notebook cosmetics: inject CSS overrides for the page container and the
# output scroll area, then make IPython echo every expression statement
# of a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML

for css_snippet in (
    '<style>.container {width:87% !important;}</style>',
    "<style>.output_scroll {height:auto !important; max-height:10000px !important;}</style>",
):
    display(HTML(css_snippet))

InteractiveShell.ast_node_interactivity = "all"

In [4]:
# pandas display setting: show up to 100 columns when printing a DataFrame
pd.set_option('display.max_columns', 100)

In [5]:
# Use the 'ggplot' style sheet for all matplotlib figures
plt.style.use('ggplot')

In [6]:
# Absolute path of the current working directory
cwd = Path().resolve()

# The project root is taken to be one level above the working directory
project_root = cwd.parent

# Make the project root importable so that the `src` package can be found.
# The membership check keeps sys.path free of duplicates when this cell is
# executed more than once in the same kernel session.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Project-local data helpers (resolvable only after the sys.path tweak above)
from src.data import downloader, loader, preprocessor, saving

# Path to the config.yaml file
config_path = project_root / "config" / "config.yaml"

# Load the configuration through the project loader
config = loader.load_config(config_path)

## 3.1. Загрузка данных

In [7]:
# Load the preprocessed training split through the project loader
df_train = loader.data_load_preprocessed(data_type='train', config=config)

[⧗] Загружаю данные из: ..\data/processed\eda_data_train.pkl
[✓] Данные успешно загружены. Форма: (781, 11)


In [8]:
# Show the first 5 rows of the training dataset
df_train.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength,W/C,Sp/C_pct
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,16.28,0.570745,0.0
1,491.0,26.0,123.0,210.0,3.9,882.0,699.0,56,59.59,0.427699,0.007943
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,13.82,0.7496,0.022
3,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,90,35.76,0.619355,0.0
4,252.1,97.1,75.6,193.8,8.3,835.5,821.4,28,33.4,0.768743,0.032923


In [9]:
# Load the preprocessed test split through the project loader
df_test = loader.data_load_preprocessed(data_type='test', config=config)

[⧗] Загружаю данные из: ..\data/processed\eda_data_test.pkl
[✓] Данные успешно загружены. Форма: (228, 10)


In [10]:
# Show the first 5 rows of the test dataset
df_test.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,W/C,Sp/C_pct
0,167.4,129.9,128.6,175.5,7.8,1006.3,746.6,28,1.048387,0.046595
1,475.0,118.8,0.0,181.1,8.9,852.1,781.5,7,0.381263,0.018737
2,251.4,0.0,118.3,188.5,6.4,1028.4,757.7,100,0.749801,0.025457
3,307.0,0.0,0.0,193.0,0.0,968.0,812.0,365,0.628664,0.0
4,143.6,0.0,174.9,158.4,17.9,942.7,844.5,28,1.103064,0.124652


## 3.2. Обработка выбросов

### Объявление класса

In [26]:
class OutlierHandler(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that handles outliers with a selectable strategy.

    `fit` learns per-feature IQR bounds and/or builds the configuration of
    binary indicator features (depending on the strategy); `transform`
    dispatches to the strategy-specific processing helper.

    NOTE(review): `transform` relies on the helpers `_apply_gost_processing`,
    `_apply_iqr_processing`, `_create_binary_features`,
    `_create_engineering_features`, `_apply_remove_strategy`,
    `_apply_mark_strategy` and `_apply_combined_strategy`, none of which is
    defined in this class body — confirm they are provided before calling
    `transform`.
    """
    def __init__(self,
                 strategy: str='combined',
                 config: dict=config,
                 binary_thresholds: Optional[Dict] = None):
        """
        Store the hyper-parameters and initialise empty fitted state.

        Args:
            strategy: Processing strategy ('gost', 'iqr', 'binary', 'combined', 'remove', 'mark')
            config: Parsed configuration holding the allowed value ranges.
                The default is the module-level `config` object as it exists
                when the class is defined.
            binary_thresholds: Optional custom thresholds for the binary
                features, keyed by binary feature name. They are applied in
                `fit`, after the default configuration has been built.
        """
        self.strategy = strategy
        self.config = config
        self.binary_thresholds = binary_thresholds

        # State filled in later; initialised here so the attributes always exist
        self.gost_ranges = {}
        self.iqr_bounds = {}
        self.binary_features_config = {}
        self.fitted = False

        # Custom thresholds are intentionally NOT applied here: at this point
        # `binary_features_config` is still empty and `fit` rebuilds it from
        # `config`, so overrides can only take effect inside `fit`.

    def _set_custom_thresholds(self, binary_thresholds: Dict):
        """
        Override default thresholds of already configured binary features.

        Keys that do not match a configured binary feature are ignored.

        Args:
            binary_thresholds: Mapping of binary feature name to new threshold
        """
        for feature, threshold in binary_thresholds.items():
            if feature in self.binary_features_config:
                self.binary_features_config[feature]['threshold'] = threshold

    def fit(self, X, y=None):
        """
        Learn the parameters required by the selected strategy.

        Args:
            X: DataFrame or array-like with the features
            y: Ignored; present for Pipeline compatibility

        Returns:
            self: The fitted transformer
        """
        # Convert to a DataFrame if necessary
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        features = X.select_dtypes(include=[np.number]).columns.tolist()

        # Start from a clean state so that refitting (on other data or after
        # the strategy was changed) never leaves stale entries behind
        self.iqr_bounds = {}
        self.binary_features_config = {}

        # IQR bounds per numeric feature (only for strategies that use them)
        if self.strategy in ['iqr', 'combined', 'remove']:
            for feature in features:
                q1 = X[feature].quantile(0.25)
                q3 = X[feature].quantile(0.75)
                iqr = q3 - q1

                self.iqr_bounds[feature] = {
                    'lower': q1 - 1.5 * iqr,
                    'upper': q3 + 1.5 * iqr
                }

        # Binary indicator features; thresholds come from config['standard_value'].
        # NOTE(review): the `source_feature` names are snake_case, while the
        # loaded DataFrame shown in this notebook uses names such as 'W/C' and
        # 'Fine Aggregate' — confirm the columns are renamed before transform.
        if self.strategy in ['binary', 'combined', 'mark']:
            self.binary_features_config = {
                # NOTE(review): the last character of this key looks like a
                # Cyrillic 'с' rather than a Latin 'c' — confirm it is intended.
                'high_sp_с': {
                    'source_feature': 'sp_c_pct',
                    'threshold': self.config['standard_value']['sp_c_pct']['max'],
                    'condition': 'greater'
                },
                'high_fa': {
                    'source_feature': 'fine_aggregate',
                    'threshold': self.config['standard_value']['fine_aggregate']['max'],
                    'condition': 'greater'
                },
                'low_wc': {
                    'source_feature': 'w_c',
                    'threshold': self.config['standard_value']['w_c']['min'],
                    'condition': 'less'
                },
                'high_wc': {
                    'source_feature': 'w_c',
                    'threshold': self.config['standard_value']['w_c']['max'],
                    'condition': 'greater'
                },
                'has_blast_furnace_slag': {
                    'source_feature': 'blast_furnace_slag',
                    'threshold': 0,
                    'condition': 'greater'
                }
            }

            # Apply user-supplied overrides on top of the defaults
            if self.binary_thresholds:
                self._set_custom_thresholds(self.binary_thresholds)

        self.fitted = True
        self.feature_names_in_ = features  # For compatibility with newer sklearn versions
        print(self.binary_features_config)
        return self

    def transform(self, X):
        """
        Apply the selected outlier-handling strategy to the data.

        Args:
            X: DataFrame or array-like with the features

        Returns:
            The processed DataFrame, or its `.values` array when the input
            was not a DataFrame

        Raises:
            ValueError: If `fit` has not been called yet
        """
        if not self.fitted:
            raise ValueError("Сначала необходимо вызвать fit()")

        # Remember the input type so the result is returned in the same form
        return_array = not isinstance(X, pd.DataFrame)
        if return_array:
            X = pd.DataFrame(X)

        # Work on a copy so the caller's data is not modified through this reference
        df_processed = X.copy()

        print(f"Применение стратегии: {self.strategy}")

        # Dispatch on the strategy; any unrecognised value falls through to 'combined'
        if self.strategy == 'gost':
            df_processed = self._apply_gost_processing(df_processed)
        elif self.strategy == 'iqr':
            df_processed = self._apply_iqr_processing(df_processed)
        elif self.strategy == 'binary':
            df_processed = self._create_binary_features(df_processed)
            df_processed = self._create_engineering_features(df_processed)
        elif self.strategy == 'remove':
            df_processed = self._apply_remove_strategy(df_processed)
        elif self.strategy == 'mark':
            df_processed = self._apply_mark_strategy(df_processed)
        else:  # 'combined' strategy
            df_processed = self._apply_combined_strategy(df_processed)

        # Return in the original format
        if return_array:
            return df_processed.values
        return df_processed

### Проверка

In [12]:
# Names of the strategies exercised by the check below
strategies = ['gost', 'iqr', 'binary', 'combined', 'remove', 'mark']

# Feature matrix: the training data without the "Strength" column
X = df_train.drop(columns=["Strength"])


In [13]:
# Display the feature matrix
X

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,W/C,Sp/C_pct
0,376.0,0.0,0.0,214.6,0.0,1003.5,762.4,3,0.570745,0.000000
1,491.0,26.0,123.0,210.0,3.9,882.0,699.0,56,0.427699,0.007943
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,3,0.749600,0.022000
3,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,90,0.619355,0.000000
4,252.1,97.1,75.6,193.8,8.3,835.5,821.4,28,0.768743,0.032923
...,...,...,...,...,...,...,...,...,...,...
793,310.0,0.0,0.0,192.0,0.0,1012.0,830.0,3,0.619355,0.000000
794,213.5,0.0,174.2,159.2,11.7,1043.6,771.9,100,0.745667,0.054801
795,304.8,0.0,99.6,196.0,9.8,959.4,705.2,28,0.643045,0.032152
797,288.0,192.0,0.0,192.0,0.0,932.0,717.8,28,0.666667,0.000000


In [27]:
import traceback

# Banner line reused above and below each strategy title
separator = '=' * 50

# Smoke test: build and fit one handler per strategy, reporting any failure
# together with its traceback instead of aborting the loop.
for strategy in strategies:
    print(f"\n{separator}")
    print(f"ТЕСТИРУЕМ СТРАТЕГИЮ: {strategy.upper()}")
    print(f"{separator}")

    try:
        # Instantiate the handler and fit it on the feature matrix
        handler = OutlierHandler(strategy=strategy, config=config)
        handler.fit(X)
    except Exception as e:
        print(f"❌ Ошибка при выполнении стратегии {strategy}: {e}")
        traceback.print_exc()


ТЕСТИРУЕМ СТРАТЕГИЮ: GOST
{}


0,1,2
,strategy,'gost'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,



ТЕСТИРУЕМ СТРАТЕГИЮ: IQR
{}


0,1,2
,strategy,'iqr'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,



ТЕСТИРУЕМ СТРАТЕГИЮ: BINARY
{'high_sp_с': {'source_feature': 'sp_c_pct', 'threshold': 0.025, 'condition': 'greater'}, 'high_fa': {'source_feature': 'fine_aggregate', 'threshold': 800, 'condition': 'greater'}, 'low_wc': {'source_feature': 'w_c', 'threshold': 0.3, 'condition': 'less'}, 'high_wc': {'source_feature': 'w_c', 'threshold': 0.7, 'condition': 'greater'}, 'has_blast_furnace_slag': {'source_feature': 'blast_furnace_slag', 'threshold': 0, 'condition': 'greater'}}


0,1,2
,strategy,'binary'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,



ТЕСТИРУЕМ СТРАТЕГИЮ: COMBINED
{'high_sp_с': {'source_feature': 'sp_c_pct', 'threshold': 0.025, 'condition': 'greater'}, 'high_fa': {'source_feature': 'fine_aggregate', 'threshold': 800, 'condition': 'greater'}, 'low_wc': {'source_feature': 'w_c', 'threshold': 0.3, 'condition': 'less'}, 'high_wc': {'source_feature': 'w_c', 'threshold': 0.7, 'condition': 'greater'}, 'has_blast_furnace_slag': {'source_feature': 'blast_furnace_slag', 'threshold': 0, 'condition': 'greater'}}


0,1,2
,strategy,'combined'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,



ТЕСТИРУЕМ СТРАТЕГИЮ: REMOVE
{}


0,1,2
,strategy,'remove'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,



ТЕСТИРУЕМ СТРАТЕГИЮ: MARK
{'high_sp_с': {'source_feature': 'sp_c_pct', 'threshold': 0.025, 'condition': 'greater'}, 'high_fa': {'source_feature': 'fine_aggregate', 'threshold': 800, 'condition': 'greater'}, 'low_wc': {'source_feature': 'w_c', 'threshold': 0.3, 'condition': 'less'}, 'high_wc': {'source_feature': 'w_c', 'threshold': 0.7, 'condition': 'greater'}, 'has_blast_furnace_slag': {'source_feature': 'blast_furnace_slag', 'threshold': 0, 'condition': 'greater'}}


0,1,2
,strategy,'mark'
,config,"{'binary_features': {'from_zeros': ['Blast Furnace Slag', 'Superplasticizer']}, 'competition': {'name': 'skillbox-ml-junior-regression-10'}, 'data': {'logs': 'logs', 'processed_dir': 'data/processed', 'raw_dir': 'data/raw', 'test_file': 'test.csv', ...}, 'output': {'eda_report_dir': 'outputs/eda_report'}, ...}"
,binary_thresholds,
