In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool, cv
import joblib

In [None]:
import pandas as pd

train_df = pd.read_csv('ptrain_v1.csv', encoding='utf-8').set_index('index')
test_df  = pd.read_csv('ptest_v1.csv',  encoding='utf-8').set_index('index')

doc_emb = pd.read_csv('doc_emb_norm_old.csv',    index_col=0)
lsa200  = pd.read_csv('text_data_lsa200.csv',   index_col='index')
lsa300  = pd.read_csv('text_data_lsa300.csv',   index_col='index')
lsa200_old  = pd.read_csv('text_data_lsa200_old.csv',   index_col='index')
lsa300_old  = pd.read_csv('text_data_lsa300_old.csv',   index_col='index')
doc_emb_old = pd.read_csv('doc_emb_norm.csv',    index_col=0)

In [None]:
# Объединим все признаки по индексу

lsa300.columns = [col.replace('lsa_', 'lsa300_') for col in lsa300.columns]
lsa200_old.columns = [col.replace('lsa_', 'lsa200_old') for col in lsa200_old.columns]
lsa300_old.columns = [col.replace('lsa_', 'lsa300_old') for col in lsa300_old.columns]
doc_emb_old.columns = [col.replace('emb_', 'emb_old') for col in doc_emb_old.columns]

train_full = train_df.join([doc_emb,doc_emb_old, lsa200, lsa300,lsa200_old,lsa300_old], how='left')
test_full  = test_df .join([doc_emb,doc_emb_old, lsa200, lsa300,lsa200_old,lsa300_old], how='left')

print("Train shapes:")
print("  исходный:", train_df.shape)
print("  + doc_emb:", doc_emb.shape)
print("  + lsa200: ", lsa200.shape)
print("  + lsa300: ", lsa300.shape)
#print("  → итоговый:", train_full.shape, "\n")

#print("Test shapes:")
#print("  исходный:", test_df.shape)
#print("  → итоговый:", test_full.shape, "\n")

# 5. Сохраним итоговые файлы
#joblib.dump(train_full, 'train_full_features.joblib')
#joblib.dump(test_full, 'test_full_features.joblib')

#train_full.reset_index().to_csv('train_full_features.csv', index=False)
#test_full .reset_index().to_csv('test_full_features.csv',  index=False)

print("Готово! Сохранены:")
print("  • train_full_features.csv")
print("  • test_full_features.csv")


Train shapes:
  исходный: (247972, 1851)
  + doc_emb: (274446, 300)
  + lsa200:  (274446, 200)
  + lsa300:  (274446, 300)
Готово! Сохранены:
  • train_full_features.csv
  • test_full_features.csv


In [None]:
del doc_emb
del lsa200
del lsa300
del lsa200_old
del lsa300_old
del doc_emb_old
del train_df
del test_df
gc.collect()

0

In [None]:

#joblib.dump((X, y, train_idx), r'F:\train_data.joblib')
#joblib.dump((X_test, test_idx), r'F:\test_data.joblib')
#print('Saved train_data.joblib and test_data.joblib in data/ directory')

In [None]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from catboost import CatBoostClassifier, Pool, cv
import joblib

In [None]:
from catboost import CatBoostClassifier, Pool

if 'index' in train_full.columns:
    train_full = train_full.set_index('index')
idx = train_full.index
y   = train_full['target'].values if 'target' in train_full.columns else None
train_full  = train_full.drop(columns=['target','Unnamed: 0'], errors='ignore')

X = (
        train_full
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .astype(np.float32)
    )


In [None]:
del train_full
gc.collect()

0

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import gc
from tqdm import tqdm

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=18)
del X
del y
gc.collect()

4

In [None]:
from sklearn import set_config
set_config(enable_metadata_routing=True)

catboost_model = CatBoostClassifier(iterations=10000, depth=8, learning_rate=0.03, loss_function='Logloss', verbose=100, task_type="GPU", devices='0')
lgb_model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.06, max_depth=10, min_data_in_leaf=10, metric="auc", verbosity=1, device='gpu', gpu_platform_id=0, gpu_device_id=0, num_threads=4)
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.03, max_depth=8, objective='binary:logistic', tree_method="hist", device="cuda", verbosity=2)

meta_model = LogisticRegression()

stacking_model = StackingClassifier(
    estimators=[('catboost', catboost_model), ('lgbm', lgb_model), ('xgb', xgb_model)],
    final_estimator=meta_model,
    cv=StratifiedKFold(n_splits=5)
)

In [None]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Матрицы для стекинга
S_train = np.zeros((X_train.shape[0], 3))
S_test_folds = np.zeros((X_test.shape[0], 3, n_splits))

for i, (name, model) in enumerate([
    ('xgb',      xgb_model),
    ('catboost', catboost_model),
    ('lgbm',     lgb_model)
]):
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, y_tr = X_train.iloc[tr_idx], y_train[tr_idx]
        X_val, y_val = X_train.iloc[val_idx],   y_train[val_idx]

        if name == 'xgb':
            # низкоуровневый train для XGBoost
            params    = model.get_xgb_params()
            num_round = model.get_params()['n_estimators']
            dtrain = xgb.DMatrix(X_tr, label=y_tr)
            deval  = xgb.DMatrix(X_val, label=y_val)
            bst = xgb.train(
                params,
                dtrain,
                num_boost_round=num_round,
                evals=[(deval, 'eval')],
                early_stopping_rounds=300,
                verbose_eval=False
            )
            # OOF-прогноз и тестовый прогноз
            S_train[val_idx, i]      = bst.predict(deval)
            S_test_folds[:, i, fold] = bst.predict(xgb.DMatrix(X_test))

        else:
            # CatBoost или LightGBM — клонируем модель
            m = clone(model)
            if name == 'catboost':
                fit_kwargs = dict(
                    eval_set=[(X_val, y_val)],
                    early_stopping_rounds=700,
                    use_best_model=True,
                    verbose=False
                )
            else:  # lgbm
                fit_kwargs = dict(
                    eval_set=[(X_val, y_val)],
                    eval_metric='auc',
                    callbacks=[lgb.early_stopping(stopping_rounds=300)]
                )
            m.fit(X_tr, y_tr, **fit_kwargs)
            S_train[val_idx, i]      = m.predict_proba(X_val)[:, 1]
            S_test_folds[:, i, fold] = m.predict_proba(X_test)[:, 1]
        gc.collect()


[23:14:45] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\simple_dmatrix.cc:139: Generating new Ellpack page.
[23:16:10] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\simple_dmatrix.cc:139: Generating new Ellpack page.
[23:17:36] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\simple_dmatrix.cc:139: Generating new Ellpack page.
[23:19:29] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\simple_dmatrix.cc:139: Generating new Ellpack page.
[23:21:49] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\simple_dmatrix.cc:139: Generating new Ellpack page.




[LightGBM] [Info] Number of positive: 2538, number of negative: 185920
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 567313
[LightGBM] [Info] Number of data points in the train set: 188458, number of used features: 3448
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 1870 dense feature groups (336.45 MB) transferred to GPU in 0.154133 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013467 -> initscore=-4.293940
[LightGBM] [Info] Start training from score -4.293940
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[134]	valid_0's auc: 0.664005
[LightGBM] [Info] Number of positive: 2538, number of negative: 185920


['xgb_model.joblib']

In [None]:
# усредняем прогнозы на тесте по всем фолдам
S_test = S_test_folds.mean(axis=2)
meta_model.set_params(max_iter=2000)
# обучаем мета-модель на OOF-фичах
meta_model.fit(S_train, y_train)

ValueError: Invalid parameter 'max_iter' for estimator RandomForestClassifier(max_depth=3, n_estimators=200, n_jobs=-1,
                       random_state=42). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

In [None]:
joblib.dump(meta_model,'meta_model.joblib')

['meta_model.joblib']

In [None]:
joblib.dump(S_train,'S_train.joblib')
joblib.dump(y_train,'S_train.joblib')

['S_train.joblib']

In [None]:
# CatBoost
catboost_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=700,
    use_best_model=True,
    verbose=False
)

# LightGBM
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=300)]
)

# XGBoost — переобучаем через xgb.train, чтобы был Booster с ранней остановкой
params    = xgb_model.get_xgb_params()
num_round = xgb_model.get_params()['n_estimators']
dtrain    = xgb.DMatrix(X_train, label=y_train)
dtest     = xgb.DMatrix(X_test,  label=y_test)

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=[(dtest, 'eval')],
    early_stopping_rounds=300,
    verbose_eval=False
)




[LightGBM] [Info] Number of positive: 3173, number of negative: 232400
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 568330
[LightGBM] [Info] Number of data points in the train set: 235573, number of used features: 3448
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 1868 dense feature groups (419.66 MB) transferred to GPU in 0.149103 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.013469 -> initscore=-4.293783
[LightGBM] [Info] Start training from score -4.293783
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[164]	valid_0's auc: 0.639464
[01:35:04] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autosc

In [None]:
joblib.dump(catboost_model,'catboost_model.joblib')
joblib.dump(lgb_model,'lgb_model.joblib')
joblib.dump(xgb_model,'xgb_model.joblib')

['xgb_model.joblib']

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

# Сохраняем список имен колонок сразу после загрузки train:
feature_names = X_train.columns.tolist() if hasattr(X_train, "columns") else None

def stacking_predict_proba(X):
    # 1) Приведём X к двум видам: numpy-массиву и DataFrame с нужными именами
    if isinstance(X, pd.DataFrame):
        X_df = X.copy()
        X_arr = X_df.values
    else:
        X_arr = X
        X_df = pd.DataFrame(X_arr, columns=feature_names) if feature_names else None

    # 2) Предсказания CatBoost и LightGBM всегда на DataFrame, чтобы сохранить имена
    p_cat = catboost_model.predict_proba(X_df if X_df is not None else X_arr)[:, 1]
    p_lgb = lgb_model.predict_proba(X_df if X_df is not None else X_arr)[:, 1]

    # 3) XGBoost — через DMatrix, отключаем жёсткую валидацию имён
    dmat = xgb.DMatrix(X_arr)
    p_xgb = xgb_model.predict(dmat, validate_features=False)

    # 4) Собираем фичи и предсказываем через мета-модель
    feats = np.column_stack([p_cat, p_lgb, p_xgb])
    return meta_model.predict_proba(feats)

In [None]:
y_pred_proba = stacking_predict_proba(X_test)



In [None]:
from sklearn.metrics import roc_auc_score

# y_pred_proba is shape (n_samples, 2)
# take only the positive-class probabilities:
pos_probs = y_pred_proba[:, 1]

print("Hold-out AUC:", roc_auc_score(y_test, pos_probs))

Hold-out AUC: 0.6671693226507708


In [None]:
print(S_test)

[[0.00269792 0.00770806 0.00894657]
 [0.00979176 0.01205127 0.01105827]
 [0.00244878 0.00193523 0.00695069]
 ...
 [0.02505243 0.0278228  0.02376606]
 [0.00353733 0.00477124 0.00578646]
 [0.00868134 0.01154252 0.01351558]]


In [None]:
# Обучение мета-модели
#stacking_model.fit(X_train, y_train)

# Predict probabilities for ROC-AUC evaluation
#y_pred_proba = stacking_model.predict_proba(X_test)[:, 1]

# Calculate ROC-AUC score
#roc_auc = roc_auc_score(y_test, y_pred_proba)
#print(f'ROC-AUC Score: {roc_auc:.4f}')

# Make final predictions
#y_pred = stacking_model.predict(X_test)

In [None]:
df = test_full

if 'index' in df.columns:
    df = df.set_index('index')
idx = df.index
y_test_s   = df['target'].values if 'target' in df.columns else None
df  = df.drop(columns=['target','Unnamed: 0'], errors='ignore')

X_test_s = (
        df
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .astype(np.float32)
    )

In [None]:
del test_full
del df
gc.collect()

0

In [None]:
y_test_pred_1 = stacking_predict_proba(X_test_s)



In [None]:
joblib.dump(y_test_pred_1,'y_test_pred_5.joblib')

['y_test_pred_5.joblib']

In [None]:
print(y_test_pred_1[:, 1])

[0.01039864 0.01593811 0.01290938 ... 0.00977661 0.01152984 0.00977661]


In [None]:
import pandas as pd

submission = pd.DataFrame({
    'index': X_test_s.index,
    'score': y_test_pred_1[:, 1]
})

submission.to_csv('submission5.csv', index=False)
print("Файл submission.csv с колонками [id, target] готов к загрузке на Kaggle.")


Файл submission.csv с колонками [id, target] готов к загрузке на Kaggle.


In [None]:
print(submission)