In [1]:
import os, glob, joblib, torch, warnings, gc
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from tqdm.auto import tqdm

# Project layout and run-wide settings shared by every cell below.
ROOT = Path()                              # directory the notebook is run from
DATA_DIR = ROOT.joinpath('data')           # input CSV / parquet files
SUB_NAME = 'submission_meta_blend_3.csv'   # file name of the submission that gets written
N_FOLDS = 5                                # number of CV folds (checked against fold_indices.npy)

In [2]:
# 1. Read the training target and the saved fold split

train_target = pd.read_csv(DATA_DIR.joinpath('train_target.csv'))
# Target as a compact int8 vector, kept in the row order of train_target.csv.
y = (
    train_target
    .set_index('id')['target']
    .astype('int8')
    .to_numpy()
)

fold_file = ROOT.joinpath('fold_indices.npy')
assert fold_file.exists(), 'fold_indices.npy не найден'
# NOTE: allow_pickle=True unpickles arbitrary objects; only load a split file you trust.
fold_indices = np.load(fold_file, allow_pickle=True)
assert len(fold_indices) == N_FOLDS, 'Количество фолдов в split-файле ≠ N_FOLDS'

In [3]:
import pickle

# 2. Собираем все *.pth и *.pkl

def load_ckpt(path):
    """Load one fold checkpoint from disk.

    Parameters
    ----------
    path : str or os.PathLike
        Checkpoint location. A ``.pkl`` suffix is read with ``joblib.load``;
        every other suffix is handed to ``torch.load`` with tensors mapped to CPU.

    Returns
    -------
    object
        Whatever object was serialized into the file.

    Notes
    -----
    SECURITY: ``joblib.load`` and ``torch.load(..., weights_only=False)`` run the
    full pickle machinery and can execute arbitrary code. Only point this helper
    at checkpoints you produced yourself.
    """
    # glob.glob returns plain strings; coerce so `.suffix` works for them too.
    path = Path(path)
    if path.suffix == '.pkl':
        return joblib.load(path)

    try:
        # First attempt uses torch's default loader settings
        # (weights_only=True on recent torch releases).
        return torch.load(path, map_location='cpu')
    except pickle.UnpicklingError:
        # The restricted loader rejected the payload; fall back to full unpickling.
        return torch.load(path, map_location='cpu', weights_only=False)

# (sub-directory, file pattern) for every family of base-model checkpoints.
ckpt_globs = [
    ('models_lgb', 'lgb_fold*.pkl'),          # LightGBM
    ('RNN-base',   'gru_fold*.pth'),          # RNN-base
    ('BiGru4',     'bigru4pool_fold*.pth'),   # BiGRU4Pool
]

files = []
for subdir, pattern in ckpt_globs:
    files.extend(glob.glob(str(ROOT / subdir / pattern)))

# Sort as Path objects so the fold files come out in a deterministic order.
files = sorted(Path(f) for f in files)
assert files, 'Не нашёл ни одного файла с предсказаниями!'

print(f'Найдено моделей для бленда: {len(files)}')

Найдено моделей для бленда: 15


In [4]:
# 3. Aggregate the folds: one column per model

import re, numpy as np
from collections import defaultdict

# Dropping the "_fold<N>.<ext>" tail leaves the model name, which is used
# to bucket the checkpoint files of one model together.
pat_fold = re.compile(r'_fold\d+\.(pth|pkl)$')
models = defaultdict(list)
for p in files:
    models[pat_fold.sub('', p.name)].append(p)

print('Моделей:', list(models))

# Peek at one checkpoint to learn the length of the test prediction vector.
first_key = next(iter(models))
first = load_ckpt(models[first_key][0])
n_test = len(first['test_pred'])

# One column per model: out-of-fold predictions (train rows) and test predictions.
n_models = len(models)
oof_mat = np.zeros((len(y), n_models), dtype='float32')
test_mat = np.zeros((n_test, n_models), dtype='float32')

def load(path: Path):
    """Load one fold checkpoint (``.pkl`` or ``.pth``) and return its payload.

    Thin wrapper around ``load_ckpt`` so the notebook has a single loading
    path: ``.pkl`` goes through joblib, anything else through ``torch.load``,
    which only falls back to ``weights_only=False`` when the default loader
    rejects the file.
    """
    return load_ckpt(path)

# Fill one column per model:
#   * OOF predictions are scattered into their rows via each fold's val_idx;
#   * test predictions are averaged over that model's folds.
for col, (m_name, paths) in enumerate(models.items()):
    print(f'{m_name}: {len(paths)} фолдов')
    acc_test = np.zeros(n_test, dtype='float32')
    covered = np.zeros(len(y), dtype=bool)   # rows that received an OOF prediction
    for ckpt_path in paths:
        ckpt = load(ckpt_path)
        oof_mat[ckpt["val_idx"], col] = ckpt["val_pred"]
        covered[ckpt["val_idx"]] = True
        acc_test += ckpt["test_pred"] / len(paths)

    # oof_mat starts as zeros, so a row missed by every fold would silently be
    # scored as a 0.0 prediction and distort both the AUC check and the meta-model.
    n_missing = int((~covered).sum())
    assert n_missing == 0, f'{m_name}: {n_missing} train rows have no OOF prediction'

    test_mat[:, col] = acc_test

print("oof_mat:", oof_mat.shape, "| test_mat:", test_mat.shape)

Моделей: ['bigru4pool', 'lgb', 'gru']
bigru4pool: 5 фолдов
lgb: 5 фолдов
gru: 5 фолдов
oof_mat: (3000000, 3) | test_mat: (500000, 3)


In [5]:
# Display the stacked OOF matrix (rows = train samples, one column per model).
oof_mat

array([[0.00767943, 0.24500681, 0.00810673],
       [0.08218422, 0.39394644, 0.0787522 ],
       [0.11407842, 0.57732713, 0.11190356],
       ...,
       [0.00672244, 0.4125769 , 0.00569409],
       [0.13254805, 0.72198755, 0.09740859],
       [0.10145935, 0.29503438, 0.12420425]], dtype=float32)

In [6]:
# 4.  Quick sanity check: OOF ROC-AUC of every base model on its own

# Columns of oof_mat follow the insertion order of `models`, so pair them up directly.
for name, oof_col in zip(models, oof_mat.T):
    auc = roc_auc_score(y, oof_col)
    print(f'{name:12s}  OOF ROC-AUC = {auc:.5f}')

bigru4pool    OOF ROC-AUC = 0.78323
lgb           OOF ROC-AUC = 0.75598
gru           OOF ROC-AUC = 0.78134


In [7]:
# 5.  META-модель: логистическая регрессия

from sklearn.linear_model import LogisticRegression

# Buffers for the meta-model's out-of-fold and fold-averaged test predictions.
meta_oof = np.zeros(len(y), dtype='float32')
meta_test = np.zeros(test_mat.shape[0], dtype='float32')

# NOTE(review): the recorded OOF AUC of this blend (0.77380) is below the best
# single model (0.78323). The inputs are raw probabilities on visibly different
# scales; a logit or rank transform may help. Verify before relying on this blend.
skf_meta = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=101)
for tr, vl in skf_meta.split(oof_mat, y):
    clf = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=-1)
    clf.fit(oof_mat[tr], y[tr])

    # Column 1 of predict_proba is the positive-class probability.
    vl_proba = clf.predict_proba(oof_mat[vl])
    meta_oof[vl] = vl_proba[:, 1]
    test_proba = clf.predict_proba(test_mat)
    meta_test += test_proba[:, 1] / N_FOLDS

meta_auc = roc_auc_score(y, meta_oof)
gini = 2 * (meta_auc - 0.5)
print(f'\nMETA-model OOF ROC-AUC = {meta_auc:.5f}  |  Gini ≈ {gini*100:.2f}')


META-model OOF ROC-AUC = 0.77380  |  Gini ≈ 54.76


In [None]:
# 6.  Resolve the test ids and write the final submission

if 'ids_test' in first:
    ids_test = first['ids_test']
else:
    # Fall back to the ids stored in the raw test parquet parts (sorted by file name).
    # NOTE(review): assumes test_pred rows follow the first-appearance order of
    # these ids; confirm against the code that produced the checkpoints.
    test_parts = [
        pd.read_parquet(p, columns=['id'])
        for p in sorted((DATA_DIR / 'test_data').glob('test_data_*.pq'))
    ]
    ids_test = pd.concat(test_parts, ignore_index=True)['id'].unique()

meta_test_ser = pd.Series(meta_test, index=ids_test)

sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')
sample['target'] = meta_test_ser.reindex(sample['id']).values.astype('float32')

# reindex() silently yields NaN for ids that have no prediction;
# refuse to write a submission with holes in it.
n_missing = int(sample['target'].isna().sum())
assert n_missing == 0, f'{n_missing} submission ids have no prediction'

sample.to_csv(SUB_NAME, index=False)
print('submission saved', SUB_NAME, sample.shape)

In [9]:
# 7.  Meta-LightGBM with a fresh StratifiedKFold

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from tqdm.auto import tqdm
import gc, numpy as np
from sklearn.metrics import roc_auc_score

# Class-imbalance weight: number of negatives per positive in the target.
n_neg = (y == 0).sum()
n_pos = (y == 1).sum()
scale_pos_weight = n_neg / n_pos
print(f'scale_pos_weight = {scale_pos_weight:.1f}')

# Parameters of the LightGBM meta-model trained on the stacked OOF columns.
LGB_META = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.03,
    num_leaves=64,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    max_depth=-1,
    device_type='gpu',
    scale_pos_weight=scale_pos_weight,
    seed=42,
    verbose=-1,
)

N_FOLDS = 5            # same value as in the config cell
NUM_ROUNDS = 5_000     # upper bound on boosting rounds
EARLY_STOP = 300       # patience passed to lgb.early_stopping

# Fresh buffers: these rebind the names used by the logistic-regression blend.
meta_oof = np.zeros(y.shape[0], dtype='float32')
meta_test = np.zeros(len(test_mat), dtype='float32')

skf_meta = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)


# Train one LightGBM meta-model per fold, collect its OOF predictions and
# average its test predictions over the folds.
for fold, (tr_idx, val_idx) in enumerate(skf_meta.split(oof_mat, y), 1):

    dtrain = lgb.Dataset(oof_mat[tr_idx], y[tr_idx])
    dvalid = lgb.Dataset(oof_mat[val_idx], y[val_idx])

    pbar = tqdm(total=NUM_ROUNDS, desc=f'Fold{fold}', leave=False)

    def _cb(env):
        """LightGBM callback: mirror boosting progress and validation AUC on the bar."""
        # env.iteration is 0-based, so the number of finished rounds is iteration + 1.
        pbar.update(env.iteration + 1 - pbar.n)
        if env.evaluation_result_list:
            _, _, auc, _ = env.evaluation_result_list[0]
            pbar.set_postfix({'auc': f'{auc:.4f}'}, refresh=False)

    try:
        booster = lgb.train(
            LGB_META,
            dtrain,
            num_boost_round=NUM_ROUNDS,
            valid_sets=[dvalid],
            valid_names=['val'],
            callbacks=[
                lgb.early_stopping(EARLY_STOP, first_metric_only=True, verbose=False),
                _cb
            ]
        )
    finally:
        # Early stopping ends training before the last scheduled round, so the bar
        # cannot rely on seeing the final iteration; always close it here.
        pbar.close()

    meta_oof[val_idx] = booster.predict(oof_mat[val_idx],
                                        num_iteration=booster.best_iteration)

    meta_test += booster.predict(test_mat,
                                 num_iteration=booster.best_iteration) / N_FOLDS
    # Release the per-fold objects before the next fold is built.
    del booster, dtrain, dvalid
    gc.collect()

meta_auc  = roc_auc_score(y, meta_oof)
meta_gini = (meta_auc - 0.5) * 2
print(f'\nMETA-LightGBM OOF ROC-AUC = {meta_auc:.5f}  |  Gini ≈ {meta_gini*100:.2f}')

scale_pos_weight = 27.2


Fold1:   0%|          | 0/5000 [00:00<?, ?it/s]

Fold2:   0%|          | 0/5000 [00:00<?, ?it/s]

Fold3:   0%|          | 0/5000 [00:00<?, ?it/s]

Fold4:   0%|          | 0/5000 [00:00<?, ?it/s]

Fold5:   0%|          | 0/5000 [00:00<?, ?it/s]


META-LightGBM OOF ROC-AUC = 0.76198  |  Gini ≈ 52.40


In [None]:
# 8.  Submission
# NOTE: this writes to the same SUB_NAME as the earlier submission cell, so
# whichever of the two cells runs last determines the file's contents.
# `meta_test` here is whatever the most recently executed meta-model cell produced.

sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')

if 'ids_test' in first:
    ids_test = first['ids_test']
else:
    # Fall back to the ids stored in the raw test parquet parts (sorted by file name).
    # NOTE(review): assumes test_pred rows follow the first-appearance order of
    # these ids; confirm against the code that produced the checkpoints.
    test_ids_df = pd.concat(
        [pd.read_parquet(f, columns=['id'])
         for f in sorted((DATA_DIR / 'test_data').glob('test_data_*.pq'))],
        ignore_index=True)
    ids_test = test_ids_df['id'].unique()

sample['target'] = (
    pd.Series(meta_test, index=ids_test)
      .reindex(sample['id'])
      .values.astype('float32')
)

# reindex() silently yields NaN for ids that have no prediction;
# refuse to write a submission with holes in it.
n_missing = int(sample['target'].isna().sum())
assert n_missing == 0, f'{n_missing} submission ids have no prediction'

sample.to_csv(SUB_NAME, index=False)
print('submission saved', SUB_NAME, sample.shape)