In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Math, Latex
from matplotlib import pyplot as plt
import re
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

# 0. METRIC DEFINITION (declared first so later sections can call it without a NameError)
def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    """Return the mean intersection-over-union of true vs. predicted intervals.

    Each of the four bound arguments is an array-like (list, tuple, ndarray,
    pandas Series) holding one value per interval. Bounds are compared
    position by position: inputs are converted to float ndarrays first, so
    plain Python sequences are accepted and pandas Series with different
    index labels are not re-aligned (which would otherwise produce NaN).

    Parameters
    ----------
    lower_true, upper_true : array-like
        Lower and upper bounds of the reference intervals.
    lower_pred, upper_pred : array-like
        Lower and upper bounds of the predicted intervals.
    epsilon : float, default 1e-6
        Added to the width of each interval before the union is formed.

    Returns
    -------
    float
        Mean over all intervals of intersection / union. NaN inputs
        propagate to the result.
    """
    # Positional semantics: strip any pandas index and allow list/tuple input.
    lower_true, upper_true, lower_pred, upper_pred = (
        np.asarray(bounds, dtype=float)
        for bounds in (lower_true, upper_true, lower_pred, upper_pred)
    )
    # Overlap length, floored at zero for disjoint intervals.
    intersection = np.maximum(0, np.minimum(upper_true, upper_pred) - np.maximum(lower_true, lower_pred))
    # Sum of both (epsilon-padded) widths minus the overlap.
    union = (upper_true - lower_true + epsilon) + (upper_pred - lower_pred + epsilon) - intersection
    return np.mean(intersection / union)

# 1. DATA LOADING
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Convert the `dt` column of both frames to pandas datetimes.
for frame in (train, test):
    frame['dt'] = pd.to_datetime(frame['dt'])

# 2. CYCLICAL FEATURE ENGINEERING
def add_cyclical_features(df):
    """Add sine/cosine encodings of the calendar columns to ``df``.

    Reads the ``dow``, ``month`` and ``day_of_month`` columns and writes
    ``dow_sin``/``dow_cos`` (period 7), ``month_sin``/``month_cos``
    (period 12) and ``day_sin``/``day_cos`` (period 31).

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    # (output prefix, source column, period used for the angle)
    calendar_periods = (
        ('dow', 'dow', 7),
        ('month', 'month', 12),
        ('day', 'day_of_month', 31),
    )
    for prefix, source_col, period in calendar_periods:
        angle = 2 * np.pi * df[source_col] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)
    return df

# Extend both frames with the sine/cosine calendar encodings.
train, test = add_cyclical_features(train), add_cyclical_features(test)

# 3. FEATURE AND CATEGORY DEFINITIONS
# Columns passed to CatBoost as `cat_features` when the models are fitted.
cat_features = [
    'management_group_id',
    'first_category_id',
    'second_category_id',
    'third_category_id',
    'product_id',
    'holiday_flag',
    'activity_flag',
]

# Full model input: raw and cyclical columns followed by the categorical ones.
features = [
    'n_stores',
    'precpt',
    'avg_temperature',
    'avg_humidity',
    'avg_wind_level',
    'week_of_year',
    'dow_sin',
    'dow_cos',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    *cat_features,
]

# Cast every categorical column to int in both frames.
for col in cat_features:
    for frame in (train, test):
        frame[col] = frame[col].astype(int)

# 4. VALIDATION WITH BEST-MODEL RETENTION
# Time-based hold-out: rows dated on or after (latest date - 4 days) form the
# validation part, everything earlier is used for fitting.
max_train_date = train['dt'].max()
val_start_date = max_train_date - pd.Timedelta(days=4)

train_part = train.loc[train['dt'].lt(val_start_date)]
val_part = train.loc[train['dt'].ge(val_start_date)]

# Settings shared by every CatBoost model in this notebook.
common_params = dict(
    iterations=2000,  # increased iteration budget
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    thread_count=-1,
    early_stopping_rounds=100,
)

print("Обучение проверочной модели (Quantile Loss + Early Stopping)...")

# One quantile regressor per interval bound (5th and 95th percentile).
model_low_val = CatBoostRegressor(**common_params, loss_function='Quantile:alpha=0.05')
model_high_val = CatBoostRegressor(**common_params, loss_function='Quantile:alpha=0.95')

# Fit on the training part and pass the hold-out part as eval_set, so that
# early stopping and use_best_model have a validation signal to work with.
model_low_val.fit(
    train_part[features], train_part['price_p05'],
    eval_set=(val_part[features], val_part['price_p05']),
    cat_features=cat_features,
    use_best_model=True
)

model_high_val.fit(
    train_part[features], train_part['price_p95'],
    eval_set=(val_part[features], val_part['price_p95']),
    cat_features=cat_features,
    use_best_model=True
)

# Predictions on the hold-out part.
p_low_val = model_low_val.predict(val_part[features])
p_high_val = model_high_val.predict(val_part[features])

# The two quantile models are fitted independently, so the upper prediction
# can fall below the lower one. Apply the same adjustment that is applied to
# the test predictions before the submission is written (p95 >= p05 + 0.001),
# so the validation IoU is measured on intervals shaped like the submitted
# ones and the metric never sees a negative predicted width.
p_high_val = np.maximum(p_high_val, p_low_val + 0.001)

val_score = calculate_iou(val_part['price_p05'], val_part['price_p95'], p_low_val, p_high_val)
print(f"\nВалидационный IoU: {val_score:.4f}")
print(f"Лучшая итерация Low: {model_low_val.get_best_iteration()}")
print(f"Лучшая итерация High: {model_high_val.get_best_iteration()}")

# 5. FINAL TRAINING ON THE FULL TRAIN SET
print("\nОбучение финальной модели на полном датасете...")

# Same shared settings, but the iteration count is fixed to what the
# validation run selected: best iteration index + 1.
final_model_low = CatBoostRegressor(
    loss_function='Quantile:alpha=0.05',
    **dict(common_params, iterations=model_low_val.get_best_iteration() + 1),
)
final_model_high = CatBoostRegressor(
    loss_function='Quantile:alpha=0.95',
    **dict(common_params, iterations=model_high_val.get_best_iteration() + 1),
)

# No eval_set here: every training row is used for fitting.
final_model_low.fit(
    train[features],
    train['price_p05'],
    cat_features=cat_features,
)
final_model_high.fit(
    train[features],
    train['price_p95'],
    cat_features=cat_features,
)

# 6. PREDICTIONS FOR TEST.CSV
print("\nСоздание предсказаний для submission...")
test['price_p05'] = final_model_low.predict(test[features])
test['price_p95'] = final_model_high.predict(test[features])

# Post-processing: keep the upper bound at least 0.001 above the lower bound.
test['price_p95'] = np.maximum(test['price_p05'] + 0.001, test['price_p95'])

# 7. SAVING THE SUBMISSION
# Keep only the id and the two predicted bounds, ordered by row_id.
submission = test.loc[:, ['row_id', 'price_p05', 'price_p95']].sort_values(by='row_id')
submission.to_csv('submission_catboost_final.csv', index=False)
print("Файл submission_catboost_final.csv успешно сохранен!")

Обучение проверочной модели (Quantile Loss + Early Stopping)...
0:	learn: 0.0258848	test: 0.0236834	best: 0.0236834 (0)	total: 169ms	remaining: 5m 38s
100:	learn: 0.0172046	test: 0.0163838	best: 0.0163838 (100)	total: 2.72s	remaining: 51.2s
200:	learn: 0.0163312	test: 0.0161870	best: 0.0161745 (198)	total: 5.44s	remaining: 48.7s
300:	learn: 0.0156863	test: 0.0161877	best: 0.0161346 (285)	total: 8.04s	remaining: 45.4s
400:	learn: 0.0153015	test: 0.0161221	best: 0.0161221 (400)	total: 10.5s	remaining: 42s
500:	learn: 0.0151424	test: 0.0161217	best: 0.0161052 (417)	total: 13s	remaining: 38.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.01610515468
bestIteration = 417

Shrink model to first 418 iterations.
0:	learn: 0.0264713	test: 0.0243080	best: 0.0243080 (0)	total: 26.9ms	remaining: 53.8s
100:	learn: 0.0257906	test: 0.0243701	best: 0.0241992 (28)	total: 2.36s	remaining: 44.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.02419917854
bestIt