In [25]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, Math, Latex
from matplotlib import pyplot as plt
import re
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# 1. ЗАГРУЗКА ДАННЫХ
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Преобразование даты
train['dt'] = pd.to_datetime(train['dt'])
test['dt'] = pd.to_datetime(test['dt'])

# 2. ДОБАВЛЕНИЕ ЦИКЛИЧЕСКИХ ПРИЗНАКОВ
def add_cyclical_features(df):
    # День недели (0-6)
    df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
    
    # Месяц (1-12)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    # День месяца (1-31)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)
    
    return df

train = add_cyclical_features(train)
test = add_cyclical_features(test)

# 3. ОПРЕДЕЛЕНИЕ ПРИЗНАКОВ
# Мы убираем исходные dow, month, day_of_month и заменяем их на sin/cos
features = [
    'n_stores', 'precpt', 'avg_temperature', 'avg_humidity', 
    'avg_wind_level', 'holiday_flag', 'activity_flag', 
    'management_group_id', 'first_category_id', 'second_category_id', 
    'third_category_id', 'product_id', 'week_of_year',
    'dow_sin', 'dow_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos'
]

# 4. ВАЛИДАЦИЯ (Локальная проверка)
max_train_date = train['dt'].max()
val_start_date = max_train_date - pd.Timedelta(days=4)

train_part = train[train['dt'] < val_start_date]
val_part = train[train['dt'] >= val_start_date]

def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    intersection = np.maximum(0, np.minimum(upper_true, upper_pred) - np.maximum(lower_true, lower_pred))
    union = (upper_true - lower_true + epsilon) + (upper_pred - lower_pred + epsilon) - intersection
    return np.mean(intersection / union)

print("Обучение проверочной модели...")
model_low_val = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model_high_val = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

model_low_val.fit(train_part[features], train_part['price_p05'])
model_high_val.fit(train_part[features], train_part['price_p95'])

p_low_val = model_low_val.predict(val_part[features])
p_high_val = model_high_val.predict(val_part[features])
val_score = calculate_iou(val_part['price_p05'], val_part['price_p95'], p_low_val, p_high_val)
print(f"Валидационный IoU: {val_score:.4f}")

# 5. ФИНАЛЬНОЕ ОБУЧЕНИЕ
print("Обучение финальной модели на полном датасете...")
final_model_low = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
final_model_high = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

final_model_low.fit(train[features], train['price_p05'])
final_model_high.fit(train[features], train['price_p95'])

# 6. ПРЕДСКАЗАНИЕ ДЛЯ TEST.CSV
print("Создание предсказаний для теста...")
test['price_p05'] = final_model_low.predict(test[features])
test['price_p95'] = final_model_high.predict(test[features])
test['price_p95'] = np.maximum(test['price_p95'], test['price_p05'] + 0.001)

# 7. СОХРАНЕНИЕ SUBMISSION
submission = test[['row_id', 'price_p05', 'price_p95']].sort_values('row_id')
submission.to_csv('submission.csv', index=False)

print("Файл submission.csv успешно сохранен!")

Обучение проверочной модели...
Валидационный IoU: 0.1932
Обучение финальной модели на полном датасете...
Создание предсказаний для теста...
Файл submission.csv успешно сохранен!


In [32]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

# 1. ЗАГРУЗКА ДАННЫХ
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train['dt'] = pd.to_datetime(train['dt'])
test['dt'] = pd.to_datetime(test['dt'])

# 2. ДОБАВЛЕНИЕ ЦИКЛИЧЕСКИХ ПРИЗНАКОВ
def add_cyclical_features(df):
    df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_month'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_month'] / 31)
    return df

train = add_cyclical_features(train)
test = add_cyclical_features(test)

# 3. ОПРЕДЕЛЕНИЕ ПРИЗНАКОВ И КАТЕГОРИЙ
# Важно: CatBoost любит, когда категории передаются как int или string
cat_features = [
    'management_group_id', 'first_category_id', 'second_category_id', 
    'third_category_id', 'product_id', 'holiday_flag', 'activity_flag'
]

features = [
    'n_stores', 'precpt', 'avg_temperature', 'avg_humidity', 
    'avg_wind_level', 'week_of_year',
    'dow_sin', 'dow_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos'
] + cat_features

# Приводим категориальные признаки к типу int, чтобы CatBoost не ругался
for col in cat_features:
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)

# 4. ВАЛИДАЦИЯ
max_train_date = train['dt'].max()
val_start_date = max_train_date - pd.Timedelta(days=4)

train_part = train[train['dt'] < val_start_date]
val_part = train[train['dt'] >= val_start_date]

def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    intersection = np.maximum(0, np.minimum(upper_true, upper_pred) - np.maximum(lower_true, lower_pred))
    union = (upper_true - lower_true + epsilon) + (upper_pred - lower_pred + epsilon) - intersection
    return np.mean(intersection / union)

# Параметры CatBoost
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'MAE', # MAE более устойчив к выбросам в ценах
    'random_seed': 42,
    'verbose': 100,
    'thread_count': -1
}

print("Обучение проверочной модели CatBoost...")
model_low_val = CatBoostRegressor(**cb_params)
model_high_val = CatBoostRegressor(**cb_params)

model_low_val.fit(train_part[features], train_part['price_p05'], cat_features=cat_features)
model_high_val.fit(train_part[features], train_part['price_p95'], cat_features=cat_features)

p_low_val = model_low_val.predict(val_part[features])
p_high_val = model_high_val.predict(val_part[features])
val_score = calculate_iou(val_part['price_p05'], val_part['price_p95'], p_low_val, p_high_val)
print(f"Валидационный IoU (CatBoost): {val_score:.4f}")

# 5. ФИНАЛЬНОЕ ОБУЧЕНИЕ
print("Обучение финальной модели на полном датасете...")
final_model_low = CatBoostRegressor(**cb_params)
final_model_high = CatBoostRegressor(**cb_params)

final_model_low.fit(train[features], train['price_p05'], cat_features=cat_features)
final_model_high.fit(train[features], train['price_p95'], cat_features=cat_features)

# 6. ПРЕДСКАЗАНИЕ ДЛЯ TEST.CSV
print("Создание предсказаний...")
test['price_p05'] = final_model_low.predict(test[features])
test['price_p95'] = final_model_high.predict(test[features])
test['price_p95'] = np.maximum(test['price_p95'], test['price_p05'] + 0.001)

# 7. СОХРАНЕНИЕ SUBMISSION
submission = test[['row_id', 'price_p05', 'price_p95']].sort_values('row_id')
submission.to_csv('submission.csv', index=False)
print("Файл submission.csv успешно сохранен!")

Обучение проверочной модели CatBoost...
0:	learn: 0.1444630	total: 184ms	remaining: 3m 3s
100:	learn: 0.0966062	total: 2.78s	remaining: 24.7s
200:	learn: 0.0914754	total: 5.54s	remaining: 22s
300:	learn: 0.0886789	total: 8.26s	remaining: 19.2s
400:	learn: 0.0866891	total: 11s	remaining: 16.4s
500:	learn: 0.0854507	total: 13.8s	remaining: 13.7s
600:	learn: 0.0841701	total: 16.6s	remaining: 11s
700:	learn: 0.0831797	total: 19.4s	remaining: 8.26s
800:	learn: 0.0824058	total: 22.2s	remaining: 5.52s
900:	learn: 0.0816133	total: 25s	remaining: 2.75s
999:	learn: 0.0808562	total: 27.7s	remaining: 0us
0:	learn: 0.1002666	total: 26ms	remaining: 25.9s
100:	learn: 0.0836107	total: 2.69s	remaining: 23.9s
200:	learn: 0.0814301	total: 5.33s	remaining: 21.2s
300:	learn: 0.0801077	total: 8.04s	remaining: 18.7s
400:	learn: 0.0789124	total: 10.8s	remaining: 16.1s
500:	learn: 0.0779033	total: 13.5s	remaining: 13.4s
600:	learn: 0.0772246	total: 16.1s	remaining: 10.7s
700:	learn: 0.0765257	total: 18.8s	rema

In [28]:
pd.read_csv('test.csv')['dt'].min()

'2024-03-28'

In [30]:
train['dow']

0        3
1        4
2        5
3        6
4        0
        ..
29095    2
29096    3
29097    4
29098    5
29099    6
Name: dow, Length: 29100, dtype: int64