In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Math, Latex
from matplotlib import pyplot as plt
import re
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import random

In [3]:
# ухудшило почему-то

In [2]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRegressor

# Global seeding: fixes Python's `random` module and NumPy's legacy global generator.
# CatBoost does not read these; it receives the same SEED through `random_seed`
# in its parameter dict.
SEED = 322
random.seed(SEED)
np.random.seed(SEED)

def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    """Mean intersection-over-union of true vs. predicted intervals.

    Args:
        lower_true, upper_true: bounds of the reference intervals (array-like or scalar).
        lower_pred, upper_pred: bounds of the predicted intervals (array-like or scalar).
        epsilon: small constant added to each interval width so that the
            denominator stays positive for zero-width intervals.

    Returns:
        The mean over all elements of ``intersection / union``.
    """
    # Overlap is the gap between the larger lower bound and the smaller upper
    # bound, floored at zero for disjoint intervals.
    overlap_start = np.maximum(lower_true, lower_pred)
    overlap_end = np.minimum(upper_true, upper_pred)
    overlap = np.maximum(0, overlap_end - overlap_start)

    true_width = upper_true - lower_true + epsilon
    pred_width = upper_pred - lower_pred + epsilon
    combined = true_width + pred_width - overlap

    return np.mean(overlap / combined)

# 1. LOADING
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Keep only rows with a strictly positive lower price. The explicit .copy()
# detaches the filtered frame from the raw one, so the column assignments that
# follow write to `train` itself and do not raise SettingWithCopyWarning.
train = train[train['price_p05'] > 0].copy()
train['dt'] = pd.to_datetime(train['dt'])
test['dt'] = pd.to_datetime(test['dt'])

# 2. FEATURE ENGINEERING
def create_smart_features(df, train_ref=None):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified; callers must use the returned frame.

    Args:
        df: frame with columns ``product_id``, ``third_category_id``,
            ``n_stores``, ``avg_temperature``, ``avg_humidity`` and a
            datetime column ``dt``.
        train_ref: optional frame with ``product_id``, ``third_category_id``
            and ``price_p05``. When given, per-product and per-category mean
            ``price_p05`` are mapped onto ``df`` as ``global_prod_avg`` and
            ``global_cat_avg``; ids absent from ``train_ref`` map to NaN.
            When ``None`` these two columns are not created.

    Returns:
        A new DataFrame with the added columns ``store_density_ratio``,
        ``temp_hum_index``, ``category_breadth``, ``days_to_side`` and,
        if ``train_ref`` was given, ``global_prod_avg`` / ``global_cat_avg``.
    """
    # Work on a copy so the caller's frame (possibly a filtered slice of
    # another frame) is never mutated as a side effect.
    df = df.copy()

    if train_ref is not None:
        # Target-derived averages taken from the reference frame.
        prod_price_map = train_ref.groupby('product_id')['price_p05'].mean().to_dict()
        df['global_prod_avg'] = df['product_id'].map(prod_price_map)
        cat_price_map = train_ref.groupby('third_category_id')['price_p05'].mean().to_dict()
        df['global_cat_avg'] = df['third_category_id'].map(cat_price_map)

    # Store count relative to the mean store count of the row's category
    # (computed within ``df`` itself); 1e-6 guards against division by zero.
    cat_stores_mean = df.groupby('third_category_id')['n_stores'].transform('mean')
    df['store_density_ratio'] = df['n_stores'] / (cat_stores_mean + 1e-6)
    df['temp_hum_index'] = df['avg_temperature'] * (df['avg_humidity'] / 100)
    # Number of distinct products seen in the same category on the same date.
    df['category_breadth'] = df.groupby(['dt', 'third_category_id'])['product_id'].transform('nunique')

    # Extra feature from error analysis: closeness to a month boundary.
    # Vectorised form of min(day, 31 - day).
    # NOTE(review): 31 is used for every month, so in shorter months the last
    # day does not reach 0 (e.g. day 30 -> 1); confirm this is intended.
    day_of_month = df['dt'].dt.day
    df['days_to_side'] = np.minimum(day_of_month, 31 - day_of_month)
    return df

# Build the engineered features for both frames; the training frame supplies
# the per-product / per-category price averages in both calls.
# NOTE(review): in the first call `df` and `train_ref` are the same frame, so
# for training rows global_prod_avg / global_cat_avg are means that include the
# row's own price_p05 target. The validation split further down is carved out
# of this already-encoded frame, so validation rows contribute to the averages
# seen by the model — consider an out-of-fold encoding. TODO confirm impact.
train = create_smart_features(train, train_ref=train)
test = create_smart_features(test, train_ref=train)

# 3. ЦИКЛИЧЕСКИЕ ПРИЗНАКИ
def add_cyclical_features(df):
    """Add sine/cosine encodings of ``dow`` and ``month`` to ``df`` in place.

    Each source column is mapped onto the unit circle with its period
    (7 for ``dow``, 12 for ``month``), producing ``<col>_sin`` and
    ``<col>_cos``. The same frame object is returned for convenience.
    """
    for source_col, period in (('dow', 7), ('month', 12)):
        angle = 2 * np.pi * df[source_col] / period
        df[f'{source_col}_sin'] = np.sin(angle)
        df[f'{source_col}_cos'] = np.cos(angle)
    return df

# Add the sin/cos encodings of `dow` and `month` to both frames.
# NOTE(review): only month_sin / month_cos appear in the `features` list below;
# dow_sin / dow_cos are computed but not passed to the models.
train = add_cyclical_features(train)
test = add_cyclical_features(test)

# 4. FEATURE LIST
cat_features = ['management_group_id', 'first_category_id', 'activity_flag']
features = [
    'n_stores', 'precpt', 'avg_temperature', 'avg_humidity',
    'avg_wind_level', 'week_of_year', 'month_sin', 'month_cos',
    'global_prod_avg', 'global_cat_avg', 'store_density_ratio',
    'temp_hum_index', 'category_breadth', 'days_to_side'
] + cat_features

# Mean-impute missing values with statistics of the training frame. The means
# are computed once, before any imputation, and reused for both frames so that
# train and test are filled with exactly the same values.
# NOTE(review): numeric_only=True only skips non-numeric dtypes, so numerically
# encoded categorical id columns would also be mean-filled if they contained
# NaN — verify that cat_features have no missing values.
fill_values = train[features].mean(numeric_only=True)
train[features] = train[features].fillna(fill_values)
test[features] = test[features].fillna(fill_values)

# 5. VALIDATION / CALIBRATION
# Within every date, shuffle the rows and send the first 80% to the fitting
# part and the remaining 20% to the validation / calibration part.
train_parts, val_parts = [], []
for _, group in train.groupby('dt'):
    group = group.sample(frac=1, random_state=SEED)
    split_idx = int(len(group) * 0.8)
    train_parts.append(group.iloc[:split_idx])
    val_parts.append(group.iloc[split_idx:])

# ignore_index=True gives each concatenated frame a fresh unique RangeIndex.
# Previously every group restarted its index at 0, so the combined frames
# carried heavily duplicated labels, which makes any later index-aligned
# operation on them ambiguous.
train_part = pd.concat(train_parts, ignore_index=True)
val_part = pd.concat(val_parts, ignore_index=True)

# CatBoost hyper-parameters (Optuna)
best_optuna_params = dict(
    learning_rate=0.09981859399773757,
    depth=7,
    l2_leaf_reg=8.533654459953857,
    random_strength=1.3535330397123693,
    bagging_temperature=0.757231412091197,
)

# Training controls shared by every model, followed by the tuned values.
cb_params = dict(
    iterations=3000,
    random_seed=SEED,
    verbose=100,
    early_stopping_rounds=200,
)
cb_params.update(best_optuna_params)

print("Обучение моделей для расчета калибровки...")
# One MAE regressor per interval bound, both built from the same parameters.
model_low, model_high = (
    CatBoostRegressor(loss_function='MAE', **cb_params) for _ in range(2)
)

# Fit the lower-bound model first, then the upper-bound one; each uses the
# held-out part as eval set and is kept at its best iteration.
for estimator, target_col in ((model_low, 'price_p05'), (model_high, 'price_p95')):
    estimator.fit(
        train_part[features],
        train_part[target_col],
        eval_set=(val_part[features], val_part[target_col]),
        cat_features=cat_features,
        use_best_model=True,
    )

# --- BLOCK 1: SEARCH FOR THE GLOBAL ALPHA ---
print("\nПоиск оптимального глобального alpha...")
calib_preds_low = model_low.predict(val_part[features])
calib_preds_high = model_high.predict(val_part[features])

# Signed residuals on the held-out part: positive when the predicted lower
# bound lies above the true one / the predicted upper bound below the true one.
nonconf_low = calib_preds_low - val_part['price_p05']
nonconf_high = val_part['price_p95'] - calib_preds_high

best_alpha, best_iou, global_q = 0.1, -1, (0, 0)
true_low = val_part['price_p05'].values
true_high = val_part['price_p95'].values

# Grid over alpha in [0.1, 0.9]; for each value shift both bounds outwards by
# the (1 - alpha/2) residual quantile and keep the alpha with the best IoU.
for alpha in np.linspace(0.1, 0.9, 81):
    level = 1 - alpha / 2
    shift_low = np.quantile(nonconf_low, level)
    shift_high = np.quantile(nonconf_high, level)

    lower = calib_preds_low - shift_low
    # Force the upper bound to stay at least 0.001 above the lower one.
    upper = np.maximum(calib_preds_high + shift_high, lower + 0.001)

    candidate_iou = calculate_iou(true_low, true_high, lower, upper)
    if candidate_iou > best_iou:
        best_alpha, best_iou, global_q = alpha, candidate_iou, (shift_low, shift_high)

q_low_global, q_high_global = global_q

# --- BLOCK 2: GROUP-WISE CALIBRATION (first category) ---
print(f"Расчет локальных поправок для категорий (Best Alpha: {best_alpha:.3f})...")

val_results = val_part.copy()
val_results['err_low'] = nonconf_low
val_results['err_high'] = nonconf_high

group_q_low, group_q_high = {}, {}
upper_level = 1 - best_alpha / 2

for category_id in val_results['first_category_id'].unique():
    members = val_results.loc[val_results['first_category_id'] == category_id]

    if len(members) < 50:
        # Not enough rows for a quantile estimate: use the global shifts as-is.
        group_q_low[category_id] = q_low_global
        group_q_high[category_id] = q_high_global
        continue

    local_low = np.quantile(members['err_low'], upper_level)
    local_high = np.quantile(members['err_high'], upper_level)

    # Equal-weight blend of the category-level and the global shift.
    group_q_low[category_id] = 0.5 * local_low + 0.5 * q_low_global
    group_q_high[category_id] = 0.5 * local_high + 0.5 * q_high_global

# 6. FINAL TRAINING
print("\nОбучение финальных моделей на полном датасете...")

# Reuse the shared parameters without early stopping (there is no eval set
# here) and fix the tree count to the best iteration found during calibration.
final_params_low = {k: v for k, v in cb_params.items() if k != 'early_stopping_rounds'}
final_params_low['iterations'] = model_low.get_best_iteration() + 1

final_params_high = {k: v for k, v in cb_params.items() if k != 'early_stopping_rounds'}
final_params_high['iterations'] = model_high.get_best_iteration() + 1

final_low = CatBoostRegressor(loss_function='MAE', allow_writing_files=False, **final_params_low)
final_high = CatBoostRegressor(loss_function='MAE', allow_writing_files=False, **final_params_high)

# Refit on the complete training frame: lower bound first, then upper bound.
for final_model, target_col in ((final_low, 'price_p05'), (final_high, 'price_p95')):
    final_model.fit(train[features], train[target_col], cat_features=cat_features, verbose=100)

# 7. PREDICTION WITH GROUP-WISE CORRECTION
print("Создание предсказаний...")
test_preds_low = final_low.predict(test[features])
test_preds_high = final_high.predict(test[features])

# Look up the per-category shifts; categories without an entry fall back to
# the global shifts.
test_category = test['first_category_id']
test['q_l'] = test_category.map(group_q_low).fillna(q_low_global)
test['q_h'] = test_category.map(group_q_high).fillna(q_high_global)

# Widen the raw interval outwards, then keep the upper bound at least 0.001
# above the lower one.
# NOTE(review): the lower bound is not floored, so it can become negative
# after the shift — confirm whether that is acceptable for the target.
test['price_p05'] = test_preds_low - test['q_l']
widened_high = test_preds_high + test['q_h']
test['price_p95'] = np.maximum(widened_high, test['price_p05'] + 0.001)

submission = test.loc[:, ['row_id', 'price_p05', 'price_p95']].sort_values(by='row_id')
submission.to_csv('submission_group_conformal.csv', index=False)
print("Готово! Использована групповая калибровка по first_category_id.")

Обучение моделей для расчета калибровки...
0:	learn: 0.1430434	test: 0.1348889	best: 0.1348889 (0)	total: 28.3ms	remaining: 1m 24s
100:	learn: 0.0865184	test: 0.0922428	best: 0.0921764 (79)	total: 2.5s	remaining: 1m 11s
200:	learn: 0.0820301	test: 0.0928133	best: 0.0921764 (79)	total: 4.89s	remaining: 1m 8s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.09217644948
bestIteration = 79

Shrink model to first 80 iterations.
0:	learn: 0.0986727	test: 0.0928803	best: 0.0928803 (0)	total: 28.1ms	remaining: 1m 24s
100:	learn: 0.0810298	test: 0.0887490	best: 0.0882764 (51)	total: 2.43s	remaining: 1m 9s
200:	learn: 0.0767242	test: 0.0892307	best: 0.0882764 (51)	total: 4.84s	remaining: 1m 7s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.08827639493
bestIteration = 51

Shrink model to first 52 iterations.

Поиск оптимального глобального alpha...
Расчет локальных поправок для категорий (Best Alpha: 0.570)...

Обучение финальных моделей на полном датасет

In [13]:
# идеи - признаки через стандартные отклонения и т д в разных разрезах?
# анализ текущих ошибок модели на валидации
# замена валидации на OOT (out-of-time: валидация на самых поздних датах)?
# обучение нейросети + conformal + кастомный лосс Pinball + coverage penalty для нейросети 


# Quantile Loss (Обязательно): Твой анализ показал, что MAE просто не «дотягивается» до краев. Использование loss_function='Quantile:alpha=0.05' в CatBoost само по себе уменьшит эти err_low/high в несколько раз.

# Признак "Дней до конца месяца": Раз 31-е число и начало месяца такие шумные, добавь фичу: df['days_to_side'] = df['dt'].dt.day.apply(lambda x: min(x, 31-x)) — это покажет модели близость к «границе», где цена может скакать.

# Взаимодействие Категория + Акция: Добавь признак train['cat_promo'] = train['first_category_id'].astype(str) + "_" + train['activity_flag'].astype(str). Это поможет модели понять, что акция в категории "Алкоголь" и в категории "Хлеб" — это разные ценовые стратегии.