In [1]:
import glob
import os
import pandas as pd
import pyarrow


# 1. Пути к данным (относительно корня проекта)

DATA_DIR = r'data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train_data')
TEST_DIR  = os.path.join(DATA_DIR, 'test_data')

TRAIN_PATTERN = os.path.join(TRAIN_DIR, 'train_data_*.pq')
TEST_PATTERN  = os.path.join(TEST_DIR,  'test_data_*.pq')

# 2. Загрузка train-части

train_parts = []
for p in sorted(glob.glob(TRAIN_PATTERN)):
    df = pd.read_parquet(p)
    train_parts.append(df)
    print(f'{os.path.basename(p):<18} shape={df.shape}')

train = pd.concat(train_parts, ignore_index=True)
print(f'\nFULL TRAIN shape = {train.shape}')

target = pd.read_csv(os.path.join(DATA_DIR, 'train_target.csv'))
train = train.merge(target, on='id', how='left')
print('После merge с target:', train.shape)

# 3. Загрузка test-части

test_parts = []
for p in sorted(glob.glob(TEST_PATTERN)):
    df = pd.read_parquet(p)
    test_parts.append(df)
    print(f'{os.path.basename(p):<18} shape={df.shape}')

test = pd.concat(test_parts, ignore_index=True)
print(f'\nFULL TEST shape  = {test.shape}')

test_id = test['id'].values

# 4. Быстрый sanity-check

print('\nTrain columns:', len(train.columns))
print('Test  columns:', len(test.columns))
print('\nПропуски (train):')
print(train.isna().mean().sort_values(ascending=False).head(10))

train_data_0.pq    shape=(1974724, 59)
train_data_1.pq    shape=(2107305, 59)
train_data_10.pq   shape=(2296372, 59)
train_data_11.pq   shape=(2450630, 59)
train_data_2.pq    shape=(2080508, 59)
train_data_3.pq    shape=(2112592, 59)
train_data_4.pq    shape=(2064110, 59)
train_data_5.pq    shape=(2150908, 59)
train_data_6.pq    shape=(2176452, 59)
train_data_7.pq    shape=(2222245, 59)
train_data_8.pq    shape=(2242615, 59)
train_data_9.pq    shape=(2284256, 59)

FULL TRAIN shape = (26162717, 59)
После merge с target: (26162717, 60)
test_data_0.pq     shape=(2389773, 59)
test_data_1.pq     shape=(2334828, 59)

FULL TEST shape  = (4724601, 59)

Train columns: 60
Test  columns: 59

Пропуски (train):
id            0.0
rn            0.0
enc_col_30    0.0
enc_col_31    0.0
enc_col_32    0.0
enc_col_33    0.0
enc_col_34    0.0
enc_col_35    0.0
enc_col_36    0.0
enc_col_37    0.0
dtype: float64


In [2]:
# 5.  Подготовка последовательностей

import numpy as np, torch, gc
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

ENC_COLS = [c for c in train.columns if c.startswith('enc_col_')]
N_FEATS  = len(ENC_COLS)
DEVICE   = 'cuda' if torch.cuda.is_available() else 'cpu'
print('DEVICE:', DEVICE)

train.sort_values(['id', 'rn'], inplace=True)
test.sort_values(['id', 'rn'],  inplace=True)

def split_sequences(df):
    X   = df[ENC_COLS].to_numpy(np.int16) + 1
    ids = df['id'].to_numpy(np.int64)
    cuts = np.flatnonzero(np.diff(ids)) + 1
    seqs = np.split(X, cuts)
    ids_unique = ids[np.concatenate(([0], cuts))]
    return ids_unique, seqs

ids_train, seqs_train = split_sequences(train)
ids_test,  seqs_test  = split_sequences(test)

y_full = (
    train[['id','target']].drop_duplicates()
         .set_index('id').loc[ids_train, 'target']
         .to_numpy('int8')
)

n_uniques = (pd.concat([train[ENC_COLS], test[ENC_COLS]]).max() + 2) \
              .astype(int).tolist()

print('train seqs:', len(seqs_train), '| test seqs:', len(seqs_test))

DEVICE: cuda
train seqs: 3000000 | test seqs: 500000


In [3]:
# 6.  Dataset

class SeqDs(Dataset):
    def __init__(self, seqs, labels=None):
        self.seqs = [torch.from_numpy(s) for s in seqs]
        self.labels = torch.tensor(labels, dtype=torch.float32) if labels is not None else None
    def __len__(self):  return len(self.seqs)
    def __getitem__(self, i):
        return (self.seqs[i], self.labels[i]) if self.labels is not None else self.seqs[i]

max_idx = torch.tensor([v-1 for v in n_uniques], dtype=torch.long)

def collate(batch):
    if isinstance(batch[0], tuple):
        seqs, labs = zip(*batch)
    else:
        seqs, labs = batch, None

    seqs = [s.unsqueeze(1) if s.ndim == 1 else s for s in seqs]

    pad = pad_sequence(seqs, batch_first=True).long()   # [B,T,F]
    pad.clamp_max_(max_idx)
    return (pad, torch.stack(labs)) if labs is not None else pad

In [None]:
# 7.  Stratified K-fold  +  test DataLoader (num_workers = 0)

from sklearn.model_selection import StratifiedKFold
from torch.optim.lr_scheduler import ReduceLROnPlateau

K_FOLDS      = 5
BATCH        = 256
EPOCHS       = 12
PATIENCE     = 2
INIT_LR      = 1e-3

test_dl = DataLoader(
    SeqDs(seqs_test),
    batch_size   = BATCH,
    shuffle      = False,
    collate_fn   = collate,
    num_workers  = 0,
    pin_memory   = True
)

oof_pred   = np.zeros(len(ids_train), dtype=np.float32)
test_blend = np.zeros(len(ids_test),  dtype=np.float32)

skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
fold_indices = [val_idx for _, val_idx in skf.split(ids_train, y_full)]
np.save("fold_indices.npy", np.array(fold_indices, dtype=object))
print("fold_indices.npy сохранён")

In [15]:
# 8.  Модель  (Bi-GRU  +  Max- & Avg-pooling)

EMB_DIM, HIDDEN = 8, 128
class BiGRUPool(nn.Module):
    def __init__(self, n_uniques, emb_dim=8, hidden=128):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(n, emb_dim, padding_idx=0) for n in n_uniques])
        self.drop = nn.Dropout2d(0.15)
        self.gru  = nn.GRU(input_size=emb_dim*N_FEATS, hidden_size=hidden,
                           batch_first=True, bidirectional=True)
        self.head = nn.Sequential(
            nn.Linear(hidden*4, 128), nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        ein  = torch.cat([e(x[:,:,i]) for i,e in enumerate(self.embs)], dim=-1)
        ein  = ein.permute(0,2,1).unsqueeze(3)
        ein  = self.drop(ein).squeeze(3).permute(0,2,1)
        out, _ = self.gru(ein)
        max_p = out.max(dim=1)[0]
        avg_p = out.mean(dim=1)
        cat   = torch.cat([max_p, avg_p], dim=-1)
        return self.head(cat).squeeze(1)

model = BiGRUPool(n_uniques, EMB_DIM, HIDDEN).to(DEVICE)
opt   = torch.optim.AdamW(model.parameters(), lr=1e-3)
lossf = nn.BCEWithLogitsLoss()

In [16]:
def fast_auc(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    order = np.argsort(y_prob)
    y_true = y_true[order]
    n_pos  = y_true.sum()
    n_neg  = len(y_true) - n_pos
    rank   = np.cumsum(y_true[::-1])[::-1].sum()
    return (rank - n_pos*(n_pos+1)/2) / (n_pos*n_neg + 1e-8)

def run_epoch(dl, train=True, epoch=0, fold=0):
    model.train(train)
    mode = 'TRN' if train else 'VAL'
    bar  = tqdm(dl, leave=False, desc=f'F{fold} {mode} E{epoch:02d}')

    tot, preds, gts = 0.0, [], []
    for step, (x, y) in enumerate(bar, 1):
        x, y = x.to(DEVICE), y.to(DEVICE)

        with torch.set_grad_enabled(train):
            logit = model(x)
            loss  = lossf(logit, y)

        if train:
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        tot   += loss.item() * len(x)
        preds.append(torch.sigmoid(logit.detach()).cpu())
        gts.append(y.cpu())

        if step % 200 == 0:
            g_np = torch.cat(gts).numpy()
            p_np = torch.cat(preds).numpy()
            if g_np.min() != g_np.max():
                bar.set_postfix({'auc': f'{fast_auc(g_np, p_np):.4f}'})

    auc = roc_auc_score(torch.cat(gts).numpy(),
                        torch.cat(preds).numpy())
    return tot / len(dl.dataset), auc

In [None]:
# 8.  Цикл по K фолдам

for fold, (tr_idx, val_idx) in enumerate(skf.split(ids_train, y_full), 1):
    print(f"\n─ FOLD {fold}/{K_FOLDS} | train {len(tr_idx)} | val {len(val_idx)} ─")

    train_dl = DataLoader(
        SeqDs([seqs_train[i] for i in tr_idx], y_full[tr_idx]),
        batch_size=BATCH, shuffle=True, collate_fn=collate,
        num_workers=0, pin_memory=True)

    val_dl = DataLoader(
        SeqDs([seqs_train[i] for i in val_idx], y_full[val_idx]),
        batch_size=BATCH, shuffle=False, collate_fn=collate,
        num_workers=0, pin_memory=True)

    torch.cuda.empty_cache()
    model = BiGRUPool(n_uniques, EMB_DIM, HIDDEN).to(DEVICE)
    opt   = torch.optim.AdamW(model.parameters(), lr=INIT_LR, weight_decay=1e-2)
    scheduler = ReduceLROnPlateau(opt, mode='max',
                                  factor=0.5, patience=1,
                                  verbose=True, min_lr=1e-5)

    BEST, WAIT = 0.0, 0
    for ep in range(1, EPOCHS + 1):
        run_epoch(train_dl, True,  ep, fold)
        _, val_auc = run_epoch(val_dl, False, ep, fold)
        scheduler.step(val_auc)
        print(f"F{fold} E{ep:02d} | val_auc={val_auc:.4f} | lr={opt.param_groups[0]['lr']:.2e}")

        if val_auc > BEST + 1e-4:
            BEST, WAIT = val_auc, 0
            torch.save(model.state_dict(), f"best_fold{fold}.pt")
            print("checkpoint saved")
        else:
            WAIT += 1
            if WAIT >= PATIENCE:
                print("   early-stop"); break

    model.load_state_dict(torch.load(f"best_fold{fold}.pt", map_location=DEVICE))
    model.eval()
    with torch.no_grad():
        val_pred = torch.cat([
            torch.sigmoid(model(xb.to(DEVICE))).cpu()
            for xb, _ in val_dl]).numpy()
    oof_pred[val_idx] = val_pred

    with torch.no_grad():
        test_pred = torch.cat([
            torch.sigmoid(model(xb.to(DEVICE))).cpu()
            for xb in test_dl]).numpy()
    test_blend += test_pred / K_FOLDS

    torch.save({
        "state_dict": torch.load(f"best_fold{fold}.pt", map_location="cpu"),
        "val_idx"  : val_idx,
        "val_pred" : val_pred.astype('float32'),
        "test_pred": test_pred.astype('float32'),
        "config"   : {"emb_dim": EMB_DIM,
                      "hidden" : HIDDEN,
                      "n_uniques": n_uniques,
                      "fold": fold}
    }, f"gru_fold{fold}.pth")
    print(f"gru_fold{fold}.pth сохранён  (best val_auc {BEST:.4f})")



─ FOLD 1/5 | train 2400000 | val 600000 ─




F1 TRN E01:   0%|          | 0/9375 [00:00<?, ?it/s]

In [18]:
# 9.  Итоговый OOF-AUC  +  submission

full_auc = roc_auc_score(y_full, oof_pred)
print(f"\nFULL OOF ROC-AUC = {full_auc:.5f}")

sample = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
sample["target"] = pd.Series(test_blend, index=ids_test).reindex(sample["id"]).values.astype('float32')
sample.to_csv("submission_gru_kfold.csv", index=False)
print("submission_gru_kfold.csv готов:", sample.shape)


FULL OOF ROC-AUC = 0.78134
submission_gru_kfold.csv готов: (500000, 2)
