In [8]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ==========================================
# 0. НАСТРОЙКИ И КОНФИГУРАЦИЯ
# ==========================================
SEED = 993

NN_CONFIG = {
    'learning_rate': 0.005,
    'batch_size': 4096,
    'epochs': 50,           
    'hidden_layers': [512, 256, 128, 64, 32], 
    'dropout': 0.2,         
    'loss_type': 'MAE', 
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

# ==========================================
# 1. ЗАГРУЗКА И ПОДГОТОВКА ДАННЫХ
# ==========================================
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# train = train[train['price_p05'] > 0]
train['dt'] = pd.to_datetime(train['dt'])
test['dt'] = pd.to_datetime(test['dt'])

# 2. FEATURE ENGINEERING
def create_smart_features(df, train_ref=None):
    if train_ref is not None:
        prod_price_map = train_ref.groupby('product_id')['price_p05'].mean().to_dict()
        df['global_prod_avg'] = df['product_id'].map(prod_price_map)
        cat_price_map = train_ref.groupby('third_category_id')['price_p05'].mean().to_dict()
        df['global_cat_avg'] = df['third_category_id'].map(cat_price_map)
    
    cat_stores_map = df.groupby('third_category_id')['n_stores'].transform('mean')
    df['store_density_ratio'] = df['n_stores'] / (cat_stores_map + 1e-6)
    df['temp_hum_index'] = df['avg_temperature'] * (df['avg_humidity'] / 100)
    df['category_breadth'] = df.groupby(['dt', 'third_category_id'])['product_id'].transform('nunique')
    return df

train = create_smart_features(train, train_ref=train)
test = create_smart_features(test, train_ref=train)

def add_cyclical_features(df):
    df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    return df

train = add_cyclical_features(train)
test = add_cyclical_features(test)

# ==========================================
# 3. ПОДГОТОВКА К НЕЙРОСЕТИ
# ==========================================
cat_cols = ['management_group_id', 'first_category_id', 'activity_flag', 'product_id', 'third_category_id']
num_cols = [
    'n_stores', 'precpt', 'avg_temperature', 'avg_humidity', 
    'avg_wind_level', 'week_of_year', 'month_sin', 'month_cos',
    'global_prod_avg', 'global_cat_avg', 'store_density_ratio',
    'temp_hum_index', 'category_breadth'
]

# 3.1 Заполнение пропусков
train[num_cols] = train[num_cols].fillna(train[num_cols].mean())
test[num_cols] = test[num_cols].fillna(train[num_cols].mean())
train[cat_cols] = train[cat_cols].fillna(-1)
test[cat_cols] = test[cat_cols].fillna(-1)

# 3.2 Label Encoding 
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    full_col = pd.concat([train[col].astype(str), test[col].astype(str)], axis=0)
    le.fit(full_col)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    label_encoders[col] = le

# 3.3 Scaling 
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

# 3.4 Правило размера эмбеддингов
embedding_sizes = []
for col in cat_cols:
    num_unique = len(label_encoders[col].classes_)
    emb_dim = min(50, (num_unique + 1) // 2)
    embedding_sizes.append((num_unique, emb_dim))
    print(f"Feature '{col}': {num_unique} unique -> embedding size {emb_dim}")

# ==========================================
# 4. МОДЕЛЬ НЕЙРОСЕТИ (PyTorch)
# ==========================================

class TabularDataset(Dataset):
    def __init__(self, df, cat_cols, num_cols, target=None):
        self.cats = df[cat_cols].values.astype(np.int64)
        self.nums = df[num_cols].values.astype(np.float32)
        self.target = df[target].values.astype(np.float32) if target is not None else None

    def __len__(self):
        return len(self.cats)

    def __getitem__(self, idx):
        if self.target is not None:
            return self.cats[idx], self.nums[idx], self.target[idx]
        return self.cats[idx], self.nums[idx]

class TabularNN(nn.Module):
    def __init__(self, embedding_sizes, n_cont, hidden_layers, dropout=0.2):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_sizes])
        self.n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_cont = n_cont
        
        layers = []
        in_size = self.n_emb + self.n_cont
        
        for h_size in hidden_layers:
            layers.append(nn.Linear(in_size, h_size))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(h_size))
            layers.append(nn.Dropout(dropout))
            in_size = h_size
            
        layers.append(nn.Linear(in_size, 1)) 
        self.layers = nn.Sequential(*layers)

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x).squeeze(1)

class QuantileLoss(nn.Module):
    def __init__(self, quantile):
        super().__init__()
        self.quantile = quantile

    def forward(self, preds, target):
        errors = target - preds
        loss = torch.max((self.quantile - 1) * errors, self.quantile * errors)
        return torch.abs(loss).mean()

def get_loss_fn(name, quantile=None):
    if name == 'MSE': return nn.MSELoss()
    if name == 'MAE': return nn.L1Loss()
    if name == 'Quantile': return QuantileLoss(quantile)
    raise ValueError("Unknown Loss Type")

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for cats, nums, targets in loader:
        cats, nums, targets = cats.to(device), nums.to(device), targets.to(device)
        optimizer.zero_grad()
        preds = model(cats, nums)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# --- ИСПРАВЛЕННАЯ ФУНКЦИЯ PREDICT ---
def predict(model, loader, device):
    model.eval()
    preds_list = []
    with torch.no_grad():
        for batch in loader:
            # Универсальная распаковка:
            if len(batch) == 3:
                cats, nums, _ = batch # Если есть таргет (валидация), игнорируем его
            else:
                cats, nums = batch    # Если нет таргета (тест)
            
            cats, nums = cats.to(device), nums.to(device)
            preds = model(cats, nums)
            preds_list.append(preds.cpu().numpy())
    return np.concatenate(preds_list)

# ==========================================
# 5. ОБУЧЕНИЕ И ВАЛИДАЦИЯ
# ==========================================
train_parts, val_parts = [], []
for _, group in train.groupby('dt'):
    group = group.sample(frac=1, random_state=SEED).reset_index(drop=True)
    split_idx = int(len(group) * 0.8)
    train_parts.append(group.iloc[:split_idx])
    val_parts.append(group.iloc[split_idx:])

train_part = pd.concat(train_parts)
val_part = pd.concat(val_parts)

# Датасеты
train_ds_low = TabularDataset(train_part, cat_cols, num_cols, 'price_p05')
val_ds_low = TabularDataset(val_part, cat_cols, num_cols, 'price_p05')
train_ds_high = TabularDataset(train_part, cat_cols, num_cols, 'price_p95')
val_ds_high = TabularDataset(val_part, cat_cols, num_cols, 'price_p95')

loaders = {
    'train_low': DataLoader(train_ds_low, batch_size=NN_CONFIG['batch_size'], shuffle=True),
    'val_low': DataLoader(val_ds_low, batch_size=NN_CONFIG['batch_size']*2),
    'train_high': DataLoader(train_ds_high, batch_size=NN_CONFIG['batch_size'], shuffle=True),
    'val_high': DataLoader(val_ds_high, batch_size=NN_CONFIG['batch_size']*2)
}

device = NN_CONFIG['device']
print(f"Using device: {device}")

def train_and_validate(target_name, quantile, train_loader, val_loader):
    print(f"\n=== Training Model for {target_name} (Loss: {NN_CONFIG['loss_type']}) ===")
    model = TabularNN(embedding_sizes, len(num_cols), NN_CONFIG['hidden_layers'], NN_CONFIG['dropout']).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=NN_CONFIG['learning_rate'])
    criterion = get_loss_fn(NN_CONFIG['loss_type'], quantile)
    
    for epoch in range(NN_CONFIG['epochs']):
        loss = train_epoch(model, train_loader, optimizer, criterion, device)
        if (epoch+1) % 5 == 0:
            print(f"Epoch {epoch+1}/{NN_CONFIG['epochs']} | Loss: {loss:.5f}")
            
    # Валидация
    preds = predict(model, val_loader, device)
    return model, preds

# Обучаем валидационные модели
model_low_val, preds_low_val = train_and_validate('price_p05', 0.05, loaders['train_low'], loaders['val_low'])
model_high_val, preds_high_val = train_and_validate('price_p95', 0.95, loaders['train_high'], loaders['val_high'])

# РАСЧЕТ МЕТРИКИ (IoU)
def calculate_iou(lower_true, upper_true, lower_pred, upper_pred, epsilon=1e-6):
    intersection = np.maximum(0, np.minimum(upper_true, upper_pred) - np.maximum(lower_true, lower_pred))
    union = (upper_true - lower_true + epsilon) + (upper_pred - lower_pred + epsilon) - intersection
    return np.mean(intersection / union)

val_part['pred_p05'] = preds_low_val
val_part['pred_p95'] = np.maximum(preds_high_val, preds_low_val + 0.001)

iou_score = calculate_iou(
    val_part['price_p05'], val_part['price_p95'],
    val_part['pred_p05'], val_part['pred_p95']
)
print(f"\n>>> VALIDATION IoU SCORE: {iou_score:.5f} <<<")
print("-" * 40)

# ==========================================
# 6. ФИНАЛЬНОЕ ОБУЧЕНИЕ (FULL TRAIN)
# ==========================================
print("Retraining on FULL dataset...")
full_ds_low = TabularDataset(train, cat_cols, num_cols, 'price_p05')
full_ds_high = TabularDataset(train, cat_cols, num_cols, 'price_p95')

full_loader_low = DataLoader(full_ds_low, batch_size=NN_CONFIG['batch_size'], shuffle=True)
full_loader_high = DataLoader(full_ds_high, batch_size=NN_CONFIG['batch_size'], shuffle=True)

# Финальные модели
final_model_low, _ = train_and_validate('Final Low', 0.05, full_loader_low, full_loader_low)
final_model_high, _ = train_and_validate('Final High', 0.95, full_loader_high, full_loader_high)

# ==========================================
# 7. ПРЕДСКАЗАНИЕ
# ==========================================
print("Generating submission...")
test_ds = TabularDataset(test, cat_cols, num_cols, None)
test_loader = DataLoader(test_ds, batch_size=NN_CONFIG['batch_size']*2, shuffle=False)

test['price_p05'] = predict(final_model_low, test_loader, device)
test['price_p95'] = predict(final_model_high, test_loader, device)

test['price_p95'] = np.maximum(test['price_p95'], test['price_p05'] + 0.001)

submission = test[['row_id', 'price_p05', 'price_p95']].sort_values('row_id')
submission.to_csv('submission.csv', index=False)
print(f"Готово! Результаты зафиксированы с seed {SEED}.")

Feature 'management_group_id': 7 unique -> embedding size 4
Feature 'first_category_id': 29 unique -> embedding size 15
Feature 'activity_flag': 2 unique -> embedding size 1
Feature 'product_id': 635 unique -> embedding size 50
Feature 'third_category_id': 197 unique -> embedding size 50
Using device: cpu

=== Training Model for price_p05 (Loss: MAE) ===
Epoch 5/50 | Loss: 0.15383
Epoch 10/50 | Loss: 0.12024
Epoch 15/50 | Loss: 0.10964
Epoch 20/50 | Loss: 0.10170
Epoch 25/50 | Loss: 0.09597
Epoch 30/50 | Loss: 0.09295
Epoch 35/50 | Loss: 0.09159
Epoch 40/50 | Loss: 0.08800
Epoch 45/50 | Loss: 0.08678
Epoch 50/50 | Loss: 0.08324

=== Training Model for price_p95 (Loss: MAE) ===
Epoch 5/50 | Loss: 0.19286
Epoch 10/50 | Loss: 0.13254
Epoch 15/50 | Loss: 0.12011
Epoch 20/50 | Loss: 0.11000
Epoch 25/50 | Loss: 0.10561
Epoch 30/50 | Loss: 0.09797
Epoch 35/50 | Loss: 0.09751
Epoch 40/50 | Loss: 0.09264
Epoch 45/50 | Loss: 0.08961
Epoch 50/50 | Loss: 0.08760

>>> VALIDATION IoU SCORE: 0.24820 

In [5]:
submission

Unnamed: 0,row_id,price_p05,price_p95
0,0,0.781956,1.916166
1,1,0.732659,1.893208
2,2,0.793193,1.589085
3,3,0.736494,1.899157
4,4,0.737328,1.897033
...,...,...,...
28045,28045,0.838997,1.398469
28046,28046,0.937909,1.399663
28047,28047,0.915474,1.424258
28048,28048,0.793047,1.320388


In [6]:
test

Unnamed: 0,dt,n_stores,precpt,avg_temperature,avg_humidity,avg_wind_level,holiday_flag,activity_flag,management_group_id,first_category_id,...,global_cat_avg,store_density_ratio,temp_hum_index,category_breadth,dow_sin,dow_cos,month_sin,month_cos,price_p05,price_p95
0,2024-05-27,-0.336939,1.205993,1.296874,0.497419,-0.044858,0,0,0,28,...,0.702459,1.458132,0.178026,0.556403,0.000000,1.000000,-1.127961,-0.991565,0.781956,1.916166
1,2024-05-28,-0.341843,1.209575,1.374423,0.277848,-0.014438,0,1,0,28,...,0.702459,1.479999,0.062209,-0.480385,0.781831,0.623490,-1.127961,-0.991565,0.732659,1.893208
2,2024-05-29,-0.468778,0.760393,1.357904,0.286863,-0.109428,0,1,0,28,...,0.075046,0.101452,0.064954,0.297206,0.974928,-0.222521,-1.127961,-0.991565,0.793193,1.589085
3,2024-05-30,-0.402597,0.867831,1.490480,0.360188,0.610077,0,1,0,28,...,0.702459,1.750914,0.145259,-0.221188,0.433884,-0.900969,-1.127961,-0.991565,0.736494,1.899157
4,2024-05-31,-0.340944,2.078208,1.476580,0.582333,0.395694,0,1,0,28,...,0.702459,1.475992,0.305691,-0.221188,-0.433884,-0.900969,-1.127961,-0.991565,0.737328,1.897033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28045,2024-06-21,0.428134,1.436231,2.831725,0.916202,0.372983,0,1,6,27,...,-0.182850,0.083494,1.501418,-0.480385,-0.433884,-0.900969,-3.732829,-1.542904,0.838997,1.398469
28046,2024-06-22,0.521439,0.649323,2.739616,0.794731,0.694538,1,0,6,27,...,-0.182850,0.113841,1.223131,0.038009,-0.974928,-0.222521,-3.732829,-1.542904,0.937909,1.399663
28047,2024-06-23,0.408661,0.541450,3.046013,0.751787,0.147509,1,1,6,27,...,0.993446,0.820347,1.321454,0.297206,-0.781831,0.623490,-3.732829,-1.542904,0.915474,1.424258
28048,2024-06-24,0.504282,0.265846,2.988317,0.748656,0.327897,0,1,6,28,...,-0.182850,0.108261,1.282570,0.038009,0.000000,1.000000,-3.732829,-1.542904,0.793047,1.320388
