In [1]:
import glob
import os
import pandas as pd
import pyarrow

# 1. Data paths (relative to the project root)

DATA_DIR = r'../../data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train_data')
TEST_DIR  = os.path.join(DATA_DIR, 'test_data')

TRAIN_PATTERN = os.path.join(TRAIN_DIR, 'train_data_*.pq')
TEST_PATTERN  = os.path.join(TEST_DIR,  'test_data_*.pq')


def _part_order(path):
    """Sort key ordering part files by numeric suffix (``_2`` before ``_10``).

    Files whose name does not end in ``_<digits>`` sort after the numbered
    ones, alphabetically.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), stem)
    return (1, 0, stem)


def load_parts(pattern):
    """Read every parquet file matching ``pattern`` into one DataFrame.

    Prints the shape of each part as it is read. The individual parts are
    local to this function, so they can be freed once the concatenated frame
    has been built.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    paths = sorted(glob.glob(pattern), key=_part_order)
    if not paths:
        raise FileNotFoundError(f'No parquet files match {pattern!r}')

    parts = []
    for path in paths:
        part = pd.read_parquet(path)
        parts.append(part)
        print(f'{os.path.basename(path):<18} shape={part.shape}')
    return pd.concat(parts, ignore_index=True)


# 2. Load the train part

train = load_parts(TRAIN_PATTERN)
print(f'\nFULL TRAIN shape = {train.shape}')

target = pd.read_csv(os.path.join(DATA_DIR, 'train_target.csv'))
# validate= makes the merge fail loudly if an id occurs more than once in
# the target file, instead of silently duplicating train rows.
train = train.merge(target, on='id', how='left', validate='many_to_one')
print('После merge с target:', train.shape)

# 3. Load the test part

test = load_parts(TEST_PATTERN)
print(f'\nFULL TEST shape  = {test.shape}')
# Keep the row-level test ids around in case the DataFrame is dropped later.
test_id = test['id'].values

# 4. Quick sanity check

print('\nTrain columns:', len(train.columns))
print('Test  columns:', len(test.columns))
print('\nПропуски (train):')
# Ten columns with the highest share of missing values.
print(train.isna().mean().sort_values(ascending=False).head(10))


train_data_0.pq    shape=(1974724, 59)
train_data_1.pq    shape=(2107305, 59)
train_data_10.pq   shape=(2296372, 59)
train_data_11.pq   shape=(2450630, 59)
train_data_2.pq    shape=(2080508, 59)
train_data_3.pq    shape=(2112592, 59)
train_data_4.pq    shape=(2064110, 59)
train_data_5.pq    shape=(2150908, 59)
train_data_6.pq    shape=(2176452, 59)
train_data_7.pq    shape=(2222245, 59)
train_data_8.pq    shape=(2242615, 59)
train_data_9.pq    shape=(2284256, 59)

FULL TRAIN shape = (26162717, 59)
После merge с target: (26162717, 60)
test_data_0.pq     shape=(2389773, 59)
test_data_1.pq     shape=(2334828, 59)

FULL TEST shape  = (4724601, 59)

Train columns: 60
Test  columns: 59

Пропуски (train):
id            0.0
rn            0.0
enc_col_30    0.0
enc_col_31    0.0
enc_col_32    0.0
enc_col_33    0.0
enc_col_34    0.0
enc_col_35    0.0
enc_col_36    0.0
enc_col_37    0.0
dtype: float64


In [2]:
# 5.  Агрегация последовательностей → одна строка на id

import numpy as np, gc
from pathlib import Path

# Every encoded feature column shares the 'enc_col_' prefix.
enc_cols = [col for col in train.columns if col.startswith('enc_col_')]

# Order rows by (id, rn) so that 'last' picks the highest-rn row of each id.
train = train.sort_values(by=['id', 'rn'])
test  = test.sort_values(by=['id', 'rn'])

# Statistics computed per column in the groupby; 'rn' only contributes the
# number of rows per id.
agg = {
    **{col: ['mean', 'std', 'min', 'max', 'last'] for col in enc_cols},
    'rn': ['count'],
}

def collapse(df, name='train', agg_spec=None):
    """Collapse a per-row sequence frame into one feature row per ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and every column named in the aggregation spec.
        ``last`` takes the final row of each group in the frame's current
        order, so sort the frame beforehand if that order matters.
    name : str
        Label used in the printed progress message.
    agg_spec : dict, optional
        Mapping ``column -> list of aggregation names`` handed to
        ``groupby(...).agg``. When omitted, the notebook-level ``agg``
        dictionary is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` in order of first appearance; aggregated columns
        are named ``<column>_<statistic>``.
    """
    spec = agg if agg_spec is None else agg_spec
    out = df.groupby('id', sort=False).agg(spec)
    # Flatten the (column, statistic) MultiIndex into plain string names.
    out.columns = ['_'.join(x) for x in out.columns]
    out = out.reset_index()
    print(f'{name} collapsed:', out.shape)
    return out

# One aggregated row per id for both splits.
train_c = collapse(train, name='train')
test_c  = collapse(test, name='test')

# Attach the label; the default inner join keeps only ids present in both frames.
train_c = pd.merge(train_c, target[['id', 'target']], on='id')
print('train_c with target:', train_c.shape)

train collapsed: (3000000, 287)
test collapsed: (500000, 287)
train_c with target: (3000000, 288)


In [3]:
# 6.  Подготовка матриц признаков

# Model inputs: every aggregated column except the key and the label.
FEATS = [name for name in train_c.columns if name != 'id' and name != 'target']

# Per-feature medians computed on train; used to impute both train and test.
med = train_c.loc[:, FEATS].median().astype('float32')

def prepare(df):
    """Return the ``FEATS`` columns of ``df`` as a float32 NumPy matrix.

    Missing values are replaced column-wise with the train medians in ``med``
    before the cast.
    """
    features = df[FEATS]
    imputed = features.fillna(med)
    return imputed.astype('float32').to_numpy()

# Ids of the collapsed test rows, in the same order as the rows of Xte.
test_ids = test_c['id'].values

# Dense float32 design matrices and the int8 label vector.
X   = prepare(train_c)
Xte = prepare(test_c)
y   = train_c['target'].astype('int8').to_numpy()

# The collapsed frames are no longer needed once the matrices exist.
del train_c
del test_c
gc.collect()

5

In [None]:
# 7.  K-fold split

from sklearn.model_selection import StratifiedKFold

FOLD_FILE = Path('../fold_indices.npy')
N_FOLDS   = 5


def _load_folds(path, n_folds, n_rows):
    """Load cached validation indices as a list of int64 arrays.

    Each fold is cast to a real integer array: an object-dtype array (which
    is what comes back from a pickled object array) cannot be used to index
    a NumPy array.
    """
    # NOTE(security): allow_pickle=True unpickles the file; only load a fold
    # file produced by this project, never one from an untrusted source.
    loaded = np.load(path, allow_pickle=True)
    assert len(loaded) == n_folds, "n_folds mismatch"
    folds = [np.asarray(idx, dtype=np.int64) for idx in loaded]
    # A cache built for a different dataset would otherwise be used silently.
    assert sum(len(idx) for idx in folds) == n_rows, \
        "fold file does not match the current data"
    return folds


def _save_folds(path, folds):
    """Store the folds as a 1-D object array holding one integer array each.

    ``np.array(folds, dtype=object)`` would instead build a 2-D object array
    of Python ints whenever all folds have the same length, so the container
    is filled element by element.
    """
    packed = np.empty(len(folds), dtype=object)
    for pos, idx in enumerate(folds):
        packed[pos] = idx
    np.save(path, packed)


if FOLD_FILE.exists():
    fold_indices = _load_folds(FOLD_FILE, N_FOLDS, len(y))
    print('fold_indices.npy загружен')
else:
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    # Keep only the validation part of every split; the train part is its complement.
    fold_indices = [val_idx for _, val_idx in skf.split(X, y)]
    _save_folds(FOLD_FILE, fold_indices)
    print('fold_indices.npy сохранён')

In [5]:
# 8.  LightGBM  +  K-fold OOF / test-blend

import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
import joblib

# Negatives-to-positives ratio, handed to LightGBM as scale_pos_weight.
pos_weight = (y == 0).sum() / (y == 1).sum()

# Booster configuration shared by all folds.
PARAMS = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.03,
    num_leaves=256,
    max_depth=-1,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    scale_pos_weight=pos_weight,
    verbose=-1,
    seed=42,
)

# Upper bound on boosting rounds and the early-stopping patience.
NUM_ROUND  = 6000
EARLY_STOP = 400

# Directory that receives the per-fold artefacts.
MODEL_DIR = Path('')
MODEL_DIR.mkdir(exist_ok=True)

# Out-of-fold predictions (one per train row) and one test-prediction column per fold.
oof_pred   = np.zeros_like(y, dtype='float32')
test_blend = np.zeros(shape=(len(test_ids), N_FOLDS), dtype='float32')

In [6]:
def _progress_callback(pbar):
    """Build a LightGBM callback that mirrors training progress onto ``pbar``."""
    def _cb(env):
        # env.iteration is 0-based, so iteration + 1 rounds are finished.
        pbar.update(env.iteration + 1 - pbar.n)
        if env.evaluation_result_list:
            # Third field of the first evaluation entry is the metric value.
            _, _, auc, _ = env.evaluation_result_list[0]
            pbar.set_postfix({'auc': f'{auc:.4f}'}, refresh=False)
    return _cb


for fold, val_idx in enumerate(fold_indices, 1):
    # Folds restored from disk may be object-dtype arrays, which NumPy refuses
    # to use as an index; normalise to a plain integer array.
    val_idx = np.asarray(val_idx, dtype=np.int64)
    # Training rows are all rows that are not in this fold's validation set.
    tr_idx = np.setdiff1d(np.arange(len(y)), val_idx)
    print(f'\n─── FOLD {fold}/{N_FOLDS} | train {len(tr_idx):,} | val {len(val_idx):,} ───')

    dtrain = lgb.Dataset(X[tr_idx], y[tr_idx])
    dval   = lgb.Dataset(X[val_idx], y[val_idx])

    pbar = tqdm(total=NUM_ROUND, desc=f'Fold{fold}', leave=False)
    try:
        model = lgb.train(
            PARAMS,
            dtrain,
            num_boost_round=NUM_ROUND,
            valid_sets=[dval],
            valid_names=['val'],
            callbacks=[
                lgb.early_stopping(EARLY_STOP, first_metric_only=True, verbose=False),
                _progress_callback(pbar)
            ]
        )
    finally:
        # Early stopping ends training before NUM_ROUND is reached, so the bar
        # is closed here rather than from inside the callback.
        pbar.close()

    # Out-of-fold predictions for the rows this model never saw.
    oof_pred[val_idx] = model.predict(X[val_idx], num_iteration=model.best_iteration)

    # One column of test predictions per fold; averaged later.
    test_blend[:, fold-1] = model.predict(Xte, num_iteration=model.best_iteration)

    # Persist the fold's predictions and settings.
    joblib.dump({
        'val_idx'  : val_idx,
        'val_pred' : oof_pred[val_idx].astype('float32'),
        'test_pred': test_blend[:, fold-1].astype('float32'),
        'feat_list': FEATS,
        'best_iter': model.best_iteration,
        'params'   : PARAMS
    }, MODEL_DIR / f'lgb_fold{fold}.pkl', compress=3)
    print(f'lgb_fold{fold}.pkl сохранён | best_iter={model.best_iteration}')


─── FOLD 1/5 | train 2,400,000 | val 600,000 ───


Fold1:   0%|          | 0/6000 [00:00<?, ?it/s]

✓ lgb_fold1.pkl сохранён | best_iter=394

─── FOLD 2/5 | train 2,400,000 | val 600,000 ───


Fold2:   0%|          | 0/6000 [00:00<?, ?it/s]

✓ lgb_fold2.pkl сохранён | best_iter=319

─── FOLD 3/5 | train 2,400,000 | val 600,000 ───


Fold3:   0%|          | 0/6000 [00:00<?, ?it/s]

✓ lgb_fold3.pkl сохранён | best_iter=338

─── FOLD 4/5 | train 2,400,000 | val 600,000 ───


Fold4:   0%|          | 0/6000 [00:00<?, ?it/s]

✓ lgb_fold4.pkl сохранён | best_iter=367

─── FOLD 5/5 | train 2,400,000 | val 600,000 ───


Fold5:   0%|          | 0/6000 [00:00<?, ?it/s]

✓ lgb_fold5.pkl сохранён | best_iter=388


In [None]:
# 9.  Final OOF metric

full_auc = roc_auc_score(y, oof_pred)
print(f'\nFULL OOF ROC-AUC = {full_auc:.5f}')

# 10.  Submission

# Average the per-fold test predictions.
final_test_pred = test_blend.mean(axis=1)

sample = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
# Align predictions to the id order of the sample file.
pred_by_id = pd.Series(final_test_pred, index=test_ids)
sample['target'] = pred_by_id.reindex(sample['id']).values.astype('float32')

# reindex yields NaN for ids that have no prediction; refuse to write such a file.
n_missing = int(sample['target'].isna().sum())
if n_missing:
    raise ValueError(
        f'{n_missing} ids from sample_submission.csv have no prediction'
    )

SUB_NAME = f'submission_lgb_{full_auc:.5f}.csv'
sample.to_csv(SUB_NAME, index=False)
print('submission сохранён', SUB_NAME)