In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Math, Latex
from matplotlib import pyplot as plt
import re
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import random

In [2]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostRegressor

# Global seeding of the Python and NumPy random generators.
# CatBoost does not read these; it receives the same SEED via cb_params['random_seed'].
SEED = 322
random.seed(SEED)
np.random.seed(SEED)

def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    """Mean intersection-over-union of true and predicted intervals.

    Args:
        lower_true, upper_true: Bounds of the reference interval(s).
        lower_pred, upper_pred: Bounds of the predicted interval(s).
        epsilon: Padding added to the width of *each* interval when the union
            is formed (the intersection is not padded), which keeps the
            denominator positive for zero-width intervals.

    Returns:
        The mean of the per-interval IoU values (a single value for scalar
        inputs).
    """
    # Overlap is the distance between the innermost bounds, floored at zero
    # for disjoint intervals.
    overlap_start = np.maximum(lower_true, lower_pred)
    overlap_end = np.minimum(upper_true, upper_pred)
    overlap = np.maximum(0, overlap_end - overlap_start)

    # Union = padded width of each interval minus the part counted twice.
    true_width = upper_true - lower_true + epsilon
    pred_width = upper_pred - lower_pred + epsilon
    combined = true_width + pred_width - overlap

    return np.mean(overlap / combined)

# 1. LOAD DATA
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Keep only rows with a positive lower price bound. The explicit .copy() makes
# `train` an independent frame, so the column assignments that follow do not
# write into a filtered slice of the raw frame (SettingWithCopyWarning).
train = train[train['price_p05'] > 0].copy()
train['dt'] = pd.to_datetime(train['dt'])
test['dt'] = pd.to_datetime(test['dt'])

# 2. FEATURE ENGINEERING
def create_smart_features(df, train_ref=None):
    """Return a copy of ``df`` enriched with price-level, store-density, weather and breadth features.

    Args:
        df: Frame to enrich. The code reads the columns ``product_id``,
            ``third_category_id``, ``n_stores``, ``avg_temperature``,
            ``avg_humidity`` and ``dt``.
        train_ref: Optional frame with ``product_id``, ``third_category_id``
            and ``price_p05``. When given, its mean ``price_p05`` per product
            and per third-level category is mapped onto ``df`` as
            ``global_prod_avg`` and ``global_cat_avg``; ids missing from
            ``train_ref`` become NaN. When ``None`` these two columns are not
            created.

    Returns:
        A new DataFrame with the added columns. The caller's ``df`` is not
        modified, so re-running a cell cannot write into a slice of another
        frame or silently alter a frame that was displayed earlier.
    """
    # Work on a private copy instead of mutating the argument in place.
    df = df.copy()

    if train_ref is not None:
        # NOTE(review): when train_ref holds the same rows as df, every row's
        # own price_p05 contributes to its averages (target leakage) —
        # consider out-of-fold means for the training frame.
        prod_price_map = train_ref.groupby('product_id')['price_p05'].mean().to_dict()
        df['global_prod_avg'] = df['product_id'].map(prod_price_map)
        cat_price_map = train_ref.groupby('third_category_id')['price_p05'].mean().to_dict()
        df['global_cat_avg'] = df['third_category_id'].map(cat_price_map)

    # Store count relative to the category mean computed within df itself
    # (not train_ref); the small constant only guards against a denominator
    # of exactly zero.
    cat_mean_stores = df.groupby('third_category_id')['n_stores'].transform('mean')
    df['store_density_ratio'] = df['n_stores'] / (cat_mean_stores + 1e-6)
    # Temperature scaled by humidity expressed as a fraction of 100.
    df['temp_hum_index'] = df['avg_temperature'] * (df['avg_humidity'] / 100)
    # Number of distinct products sharing the row's (date, third-level category).
    df['category_breadth'] = df.groupby(['dt', 'third_category_id'])['product_id'].transform('nunique')
    return df

# Build the features for both frames; the price averages always come from the
# filtered training frame.
# NOTE(review): with train_ref=train, every training row's own price_p05 is part
# of its global_prod_avg / global_cat_avg, and the validation rows split off from
# `train` further down are included as well — a target-leakage risk that can
# make validation scores optimistic. Consider out-of-fold means for `train`.
train = create_smart_features(train, train_ref=train)
test = create_smart_features(test, train_ref=train)

# 3. ЦИКЛИЧЕСКИЕ ПРИЗНАКИ
def add_cyclical_features(df):
    """Return a copy of ``df`` with sine/cosine encodings of day-of-week and month.

    Args:
        df: Frame containing the numeric columns ``dow`` (encoded with period 7)
            and ``month`` (encoded with period 12).

    Returns:
        A new DataFrame with ``dow_sin``, ``dow_cos``, ``month_sin`` and
        ``month_cos`` appended. The caller's ``df`` is not modified.
    """
    # Work on a private copy instead of mutating the argument in place.
    df = df.copy()

    # Map each periodic value onto the unit circle so that the last and first
    # value of a cycle end up next to each other.
    df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    return df

train = add_cyclical_features(train)
test = add_cyclical_features(test)

# 4. FEATURE LIST
cat_features = ['management_group_id', 'first_category_id', 'activity_flag']
# NOTE(review): dow_sin / dow_cos are created above but are not part of this
# list — confirm that leaving them out is intentional.
features = [
    'n_stores', 'precpt', 'avg_temperature', 'avg_humidity',
    'avg_wind_level', 'week_of_year', 'month_sin', 'month_cos',
    'global_prod_avg', 'global_cat_avg', 'store_density_ratio',
    'temp_hum_index', 'category_breadth'
] + cat_features

# Impute missing values with training-set means, computed once and reused for
# both frames so the test frame is filled with exactly the training statistics.
# Only the numeric (non-categorical) features are imputed: the mean of a
# category id is meaningless and would turn the column into floats, which
# CatBoost does not accept for categorical features.
num_features = [col for col in features if col not in cat_features]
train_feature_means = train[num_features].mean(numeric_only=True)
train[num_features] = train[num_features].fillna(train_feature_means)
test[num_features] = test[num_features].fillna(train_feature_means)

# 5. VALIDATION / CALIBRATION SPLIT
# Within every date the rows are shuffled; the first 80% go to the training
# part and the remainder to the validation part, so both parts cover all dates.
# NOTE(review): this is a random in-time split, not an out-of-time one — the
# validation rows share dates with the training rows.
train_parts, val_parts = [], []
for _, group in train.groupby('dt'):
    # Same random_state for every date; reset_index gives each group a fresh 0..n-1 index.
    group = group.sample(frac=1, random_state=SEED).reset_index(drop=True)
    split_idx = int(len(group) * 0.8)  # int() truncates towards zero
    train_parts.append(group.iloc[:split_idx])
    val_parts.append(group.iloc[split_idx:])

# The concatenated parts keep the per-date 0..n-1 labels, so index labels repeat;
# downstream code must align positionally, not by index.
train_part = pd.concat(train_parts)
val_part = pd.concat(val_parts)

# CatBoost PARAMETERS (Optuna)
best_optuna_params = {
    'learning_rate': 0.09981859399773757, 
    'depth': 7, 
    'l2_leaf_reg': 8.533654459953857, 
    'random_strength': 1.3535330397123693, 
    'bagging_temperature': 0.757231412091197
}

# Shared constructor arguments for both bound models; the tuned values above
# are merged in last.
cb_params = {
    'iterations': 3000,
    'random_seed': SEED,
    'verbose': 100,
    'early_stopping_rounds': 200,
    **best_optuna_params
}

# Train on the 80% part; these models are used for calibration.
print("Обучение моделей для расчета калибровки...")
# One MAE regressor per interval bound: model_low targets price_p05,
# model_high targets price_p95.
model_low = CatBoostRegressor(loss_function='MAE', **cb_params)
model_high = CatBoostRegressor(loss_function='MAE', **cb_params)

# val_part serves as the eval set for early stopping and best-iteration selection.
# NOTE(review): val_part is also used further down to compute the q_low / q_high
# offsets and to pick alpha, so those estimates are not from untouched data.
model_low.fit(train_part[features], train_part['price_p05'], 
              eval_set=(val_part[features], val_part['price_p05']), 
              cat_features=cat_features, use_best_model=True)

model_high.fit(train_part[features], train_part['price_p95'], 
               eval_set=(val_part[features], val_part['price_p95']), 
               cat_features=cat_features, use_best_model=True)

# --- NEW BLOCK: SEARCH FOR THE BEST ALPHA VIA IoU ---
print("\nПоиск оптимального alpha на валидации...")
calib_preds_low = model_low.predict(val_part[features])
calib_preds_high = model_high.predict(val_part[features])

# Signed non-conformity scores on the validation part:
#   nonconf_low  > 0 when the predicted lower bound is above the true price_p05,
#   nonconf_high > 0 when the predicted upper bound is below the true price_p95.
nonconf_low = calib_preds_low - val_part['price_p05']
nonconf_high = val_part['price_p95'] - calib_preds_high

# Running best; -1 is a sentinel (IoU values are expected to be non-negative).
best_alpha = 0.1
best_iou = -1
best_q = (0, 0)

# Try different confidence levels: 81 evenly spaced alphas from 0.1 to 0.9.
# NOTE(review): the quantile level 1 - a/2 therefore only covers [0.55, 0.95];
# offsets at or below the median of the scores are never evaluated.
for a in np.linspace(0.1, 0.9, 81): 
    ql = np.quantile(nonconf_low, 1 - a/2)
    qh = np.quantile(nonconf_high, 1 - a/2)
    
    # Shift the lower bound down by ql and the upper bound up by qh, then keep
    # the upper bound at least 0.001 above the lower one.
    p05_corr = calib_preds_low - ql
    p95_corr = calib_preds_high + qh
    p95_corr = np.maximum(p95_corr, p05_corr + 0.001)
    
    current_iou = calculate_iou(
        val_part['price_p05'].values, 
        val_part['price_p95'].values, 
        p05_corr, 
        p95_corr
    )
    
    # Strict '>' keeps the first (smallest) alpha among ties.
    if current_iou > best_iou:
        best_iou = current_iou
        best_alpha = a
        best_q = (ql, qh)

q_low, q_high = best_q
print(f"Лучший Alpha: {best_alpha:.3f}")
print(f"Лучший IoU на валидации: {best_iou:.4f}")
print(f"Поправки: q_low = {q_low:.4f}, q_high = {q_high:.4f}\n")
# ------------------------------------------------

# 6. FINAL TRAINING
print("Обучение финальных моделей на полном датасете...")


def _full_refit_params(calibration_model):
    """Constructor params for refitting one bound model on the full data.

    Starts from cb_params, drops early stopping (the final fit gets no eval
    set) and fixes the iteration count at the calibration model's best
    iteration + 1.
    """
    refit_params = {
        key: value for key, value in cb_params.items()
        if key != 'early_stopping_rounds'
    }
    refit_params['iterations'] = calibration_model.get_best_iteration() + 1
    return refit_params


final_params_low = _full_refit_params(model_low)
final_params_high = _full_refit_params(model_high)

final_low = CatBoostRegressor(loss_function='MAE', allow_writing_files=False, **final_params_low)
final_high = CatBoostRegressor(loss_function='MAE', allow_writing_files=False, **final_params_high)

# Fit the lower-bound model first, then the upper-bound model, on all of `train`.
for final_model, target_col in ((final_low, 'price_p05'), (final_high, 'price_p95')):
    final_model.fit(train[features], train[target_col], cat_features=cat_features, verbose=100)

# 7. PREDICTION WITH CORRECTION
print("Создание предсказаний с применением лучших Q...")
# Apply the offsets selected on validation: lower bound minus q_low,
# upper bound plus q_high.
test['price_p05'] = final_low.predict(test[features]) - q_low
test['price_p95'] = final_high.predict(test[features]) + q_high

# Guard against crossed bounds: the upper bound is forced to be at least 0.001
# above the lower one (same margin as in the alpha search).
test['price_p95'] = np.maximum(test['price_p95'], test['price_p05'] + 0.001)

# Write the submission ordered by row_id.
submission = test[['row_id', 'price_p05', 'price_p95']].sort_values('row_id')
submission.to_csv('submission_conformal_optimized.csv', index=False)
print(f"Готово! Использован подобранный alpha={best_alpha:.3f}.")

Обучение моделей для расчета калибровки...
0:	learn: 0.1428280	test: 0.1346232	best: 0.1346232 (0)	total: 178ms	remaining: 8m 52s
100:	learn: 0.0860270	test: 0.0921969	best: 0.0921843 (94)	total: 2.72s	remaining: 1m 18s
200:	learn: 0.0812801	test: 0.0923276	best: 0.0919774 (140)	total: 5.22s	remaining: 1m 12s
300:	learn: 0.0785447	test: 0.0929835	best: 0.0919774 (140)	total: 7.79s	remaining: 1m 9s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.09197735328
bestIteration = 140

Shrink model to first 141 iterations.
0:	learn: 0.0986558	test: 0.0928609	best: 0.0928609 (0)	total: 25.6ms	remaining: 1m 16s
100:	learn: 0.0811582	test: 0.0884697	best: 0.0878753 (37)	total: 2.59s	remaining: 1m 14s
200:	learn: 0.0765841	test: 0.0896853	best: 0.0878753 (37)	total: 5.28s	remaining: 1m 13s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.08787530913
bestIteration = 37

Shrink model to first 38 iterations.

Поиск оптимального alpha на валидации...
Лучший Alph

In [3]:
# Validation rows plus the calibration-model predictions and signed errors:
#   err_low  = predicted lower bound - true price_p05
#   err_high = true price_p95 - predicted upper bound
val_results = val_part.assign(
    pred_low=calib_preds_low,
    pred_high=calib_preds_high,
    err_low=lambda frame: frame['pred_low'] - frame['price_p05'],
    err_high=lambda frame: frame['price_p95'] - frame['pred_high'],
)

# Look at the errors broken down by first-level category
error_by_cat = (
    val_results
    .groupby('first_category_id')[['err_low', 'err_high']]
    .mean()
)
print(error_by_cat)

                    err_low  err_high
first_category_id                    
0                  0.002596  0.013387
1                  0.014239 -0.037431
2                 -0.001163  0.005923
3                 -0.032504 -0.003936
4                  0.002961 -0.015616
5                  0.018577 -0.019776
7                 -0.021205  0.066528
8                 -0.002271  0.005479
9                  0.020033  0.002462
10                 0.000421 -0.067648
11                -0.010998 -0.042149
15                 0.001022 -0.135139
16                 0.022370 -0.141594
17                -0.005285 -0.021200
18                -0.014276  0.003708
19                -0.018426  0.003612
20                 0.015326 -0.004077
21                -0.014312 -0.006857
22                -0.010140  0.026143
23                -0.015527  0.021044
24                -0.025562  0.046676
25                -0.013748  0.052584
26                -0.001667 -0.020364
27                -0.013828  0.001445
28          

In [8]:
# Mean signed bound errors per calendar date on the validation part.
# In the recorded output the largest magnitudes appear around
# 2024-03-31 .. 2024-04-01 and 2024-04-24 .. 2024-04-26.
print(val_results.groupby('dt')[['err_low', 'err_high']].mean())
# Reminder of the categorical features used by the model:
# cat_features = ['management_group_id', 'first_category_id', 'activity_flag']

             err_low  err_high
dt                            
2024-03-28  0.005613 -0.008775
2024-03-29 -0.001414  0.002565
2024-03-30 -0.036733  0.035139
2024-03-31 -0.074616  0.071816
2024-04-01 -0.049253  0.064038
2024-04-02 -0.031228  0.033350
2024-04-03 -0.029195  0.040379
2024-04-04 -0.020483  0.032297
2024-04-05 -0.001688  0.002532
2024-04-06  0.013436 -0.009422
2024-04-07  0.009761 -0.001489
2024-04-08 -0.017430  0.028399
2024-04-09 -0.027965  0.031732
2024-04-10 -0.028738  0.028872
2024-04-11 -0.025209  0.017010
2024-04-12  0.025939 -0.009345
2024-04-13  0.009695 -0.008421
2024-04-14  0.030720 -0.026144
2024-04-15 -0.005061  0.006779
2024-04-16  0.001772  0.003963
2024-04-17 -0.017438  0.025382
2024-04-18 -0.007084  0.017335
2024-04-19 -0.011994  0.012356
2024-04-20 -0.001638 -0.018512
2024-04-21  0.007578 -0.027295
2024-04-22 -0.020449  0.012368
2024-04-23 -0.001189 -0.011282
2024-04-24 -0.063827  0.059576
2024-04-25 -0.059441  0.058850
2024-04-26 -0.066944  0.058695
2024-04-

In [9]:
# Mean signed bound errors per management group on the validation part
# (in the recorded output group 3 has the largest magnitudes on both bounds).
print(val_results.groupby('management_group_id')[['err_low', 'err_high']].mean())

                      err_low  err_high
management_group_id                    
0                    0.000291 -0.001535
1                   -0.006621 -0.002706
2                   -0.015632 -0.006745
3                   -0.022146 -0.030059
4                    0.013296  0.000681
5                   -0.003855 -0.001507
6                   -0.008921  0.017740


In [10]:

# Mean signed bound errors split by activity_flag on the validation part.
print(val_results.groupby('activity_flag')[['err_low', 'err_high']].mean())

                err_low  err_high
activity_flag                    
0             -0.005809  0.012297
1             -0.010093 -0.001697


In [17]:
# Mean signed bound errors per week of year on the validation part
# (in the recorded output weeks 13 and 17 show the largest magnitudes).
print(val_results.groupby('week_of_year')[['err_low', 'err_high']].mean())

               err_low  err_high
week_of_year                    
13           -0.026787  0.025186
14           -0.015522  0.023098
15           -0.004713  0.008872
16           -0.004838  0.002858
17           -0.033921  0.028490
18           -0.010352  0.000083
19           -0.000798  0.000108
20            0.009465 -0.019316
21            0.007394 -0.014601


In [19]:
# Row counts per first-level category in `train`; the recorded output shows
# several categories with fewer than 100 rows (6, 28, 14, 12).
train['first_category_id'].value_counts()

first_category_id
24    3426
23    3185
8     2955
18    2403
17    2217
0     1934
2     1450
26    1438
20    1385
9     1181
7     1032
1      907
10     801
21     640
19     594
3      591
5      530
27     477
25     463
4      327
22     263
16     174
13     174
11     149
15     119
6       81
28      79
14      64
12      61
Name: count, dtype: int64

In [12]:
# Inspect the raw CSV under its own name. Re-binding `train` here would replace
# the filtered, feature-engineered frame that the modelling and diagnostic
# cells rely on, so any cell executed afterwards would silently see raw data.
train_raw = pd.read_csv('train.csv')
train_raw

Unnamed: 0,dt,price_p05,price_p95,n_stores,precpt,avg_temperature,avg_humidity,avg_wind_level,holiday_flag,activity_flag,management_group_id,first_category_id,second_category_id,third_category_id,dow,day_of_month,week_of_year,month,product_id
0,2024-03-28,1.136102,1.167625,-0.588881,-0.298690,-1.936029,0.621756,-0.439349,0,0,6,8,2,76,3,28,13,3,0
1,2024-03-29,1.133129,1.133590,-0.646402,0.074779,-2.090906,0.756995,-0.759561,0,0,6,8,33,77,4,29,13,3,0
2,2024-03-30,1.134522,1.165849,-0.636272,-0.644614,-1.825121,0.577839,0.122226,1,0,6,8,72,76,5,30,13,3,0
3,2024-03-31,1.152592,1.153091,-0.579004,-0.614834,-1.817634,0.589653,-0.520220,1,0,6,8,21,77,6,31,13,3,0
4,2024-04-01,1.166641,1.167096,-0.623265,-0.116090,-1.892973,0.689895,-1.036820,0,0,6,8,50,76,0,1,14,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29095,2024-05-22,0.919188,1.111109,-0.111936,1.566629,0.636145,1.050948,-1.497790,0,1,1,23,21,72,2,22,21,5,634
29096,2024-05-23,0.985685,1.183500,-0.159316,1.675944,0.925121,0.997947,-0.901034,0,1,1,23,70,73,3,23,21,5,634
29097,2024-05-24,0.956122,1.146569,-0.138205,1.778198,0.597174,1.140967,0.056938,0,1,1,23,12,73,4,24,21,5,634
29098,2024-05-25,1.001419,1.189856,-0.088175,2.138214,1.108497,0.991764,-1.271978,1,0,1,23,7,72,5,25,21,5,634


In [4]:
# Display the training frame after feature engineering (the recorded output
# includes the engineered columns, e.g. global_prod_avg and dow_sin).
train

Unnamed: 0,dt,price_p05,price_p95,n_stores,precpt,avg_temperature,avg_humidity,avg_wind_level,holiday_flag,activity_flag,...,product_id,global_prod_avg,global_cat_avg,store_density_ratio,temp_hum_index,category_breadth,dow_sin,dow_cos,month_sin,month_cos
0,2024-03-28,1.136102,1.167625,-0.588881,-0.298690,-1.936029,0.621756,-0.439349,0,0,...,0,1.112252,1.011509,2.143810,-0.012037,7,0.433884,-0.900969,1.000000,6.123234e-17
1,2024-03-29,1.133129,1.133590,-0.646402,0.074779,-2.090906,0.756995,-0.759561,0,0,...,0,1.112252,1.049900,1.884214,-0.015828,7,-0.433884,-0.900969,1.000000,6.123234e-17
2,2024-03-30,1.134522,1.165849,-0.636272,-0.644614,-1.825121,0.577839,0.122226,1,0,...,0,1.112252,1.011509,2.316336,-0.010546,6,-0.974928,-0.222521,1.000000,6.123234e-17
3,2024-03-31,1.152592,1.153091,-0.579004,-0.614834,-1.817634,0.589653,-0.520220,1,0,...,0,1.112252,1.049900,1.687751,-0.010718,8,-0.781831,0.623490,1.000000,6.123234e-17
4,2024-04-01,1.166641,1.167096,-0.623265,-0.116090,-1.892973,0.689895,-1.036820,0,0,...,0,1.112252,1.011509,2.268984,-0.013060,7,0.000000,1.000000,0.866025,-5.000000e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29095,2024-05-22,0.919188,1.111109,-0.111936,1.566629,0.636145,1.050948,-1.497790,0,1,...,634,1.041945,1.069625,-0.101285,0.006686,9,0.974928,-0.222521,0.500000,-8.660254e-01
29096,2024-05-23,0.985685,1.183500,-0.159316,1.675944,0.925121,0.997947,-0.901034,0,1,...,634,1.041945,1.038891,-0.240966,0.009232,5,0.433884,-0.900969,0.500000,-8.660254e-01
29097,2024-05-24,0.956122,1.146569,-0.138205,1.778198,0.597174,1.140967,0.056938,0,1,...,634,1.041945,1.038891,-0.209036,0.006814,5,-0.433884,-0.900969,0.500000,-8.660254e-01
29098,2024-05-25,1.001419,1.189856,-0.088175,2.138214,1.108497,0.991764,-1.271978,1,0,...,634,1.041945,1.069625,-0.079785,0.010994,8,-0.974928,-0.222521,0.500000,-8.660254e-01


In [13]:
# Ideas: features based on standard deviations etc. across different groupings?
# Analyse the current model's errors on validation.
# Replace the validation scheme with an out-of-time (OOT) split?
# Train a neural network + conformal + custom pinball loss + coverage penalty for the network.


# Quantile loss (a must): your analysis showed that MAE simply does not "reach" the edges. Using loss_function='Quantile:alpha=0.05' in CatBoost will by itself reduce these err_low/high several-fold.
# NOTE(review): the size of that improvement is a hypothesis — verify it on validation.

# "Days to the end of the month" feature: since the 31st and the start of the month are so noisy, add the feature: df['days_to_side'] = df['dt'].dt.day.apply(lambda x: min(x, 31-x)) — it shows the model how close a day is to the "boundary" where the price can jump.
# NOTE(review): 31 - x assumes every month has 31 days; df['dt'].dt.days_in_month would give the real month length.

# Category + promo interaction: add the feature train['cat_promo'] = train['first_category_id'].astype(str) + "_" + train['activity_flag'].astype(str). This helps the model understand that a promo in the "Alcohol" category and one in the "Bread" category are different pricing strategies.