In [None]:
# Load the `cudf.pandas` IPython extension. It lives in its own first cell, ahead
# of the `import pandas` in the next cell.
# NOTE(review): presumably the extension has to be active before pandas is first
# imported for it to take effect — confirm before reordering cells.
%load_ext cudf.pandas

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings
import optuna
import gc
from sklearn.model_selection import StratifiedKFold
from pandas.errors import PerformanceWarning
from sklearn.metrics import roc_auc_score
from optuna.samplers import TPESampler
from itertools import combinations
from xgboost import XGBClassifier
from tqdm import tqdm
import lightgbm as lgb
from catboost import CatBoostClassifier


# Ignore every pandas PerformanceWarning for the rest of the session.
# NOTE(review): presumably this targets the DataFrame-fragmentation warnings
# raised while the interaction columns are inserted one at a time — confirm.
warnings.simplefilter(action="ignore", category=PerformanceWarning)

In [None]:
#  --- Feature processing ----
# Competition files are indexed by `id`; the original bank-marketing file is
# ';'-separated and is read with a default index.
train = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv', index_col='id')
orig = pd.read_csv('/kaggle/input/bank-marketing-dataset-full/bank-full.csv', delimiter=';')
# `map` produces a numeric column directly. `replace` depended on an implicit
# object -> int downcast that pandas has deprecated, which would leave the label
# as an object column on newer versions.
orig['y'] = orig['y'].map({'yes': 1, 'no': 0})
assert orig['y'].notna().all(), "unexpected label value in the original dataset"

TARGET = 'y'
NUMS = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
CATS = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

train[CATS] = train[CATS].astype('category')
test[CATS] = test[CATS].astype('category')
orig[CATS] = orig[CATS].astype('category')

# Names of the interaction columns created below; each one is later target- and
# count-encoded inside the CV loop.
TE_columns = []

columns = NUMS + CATS

# Creating new columns: one integer-coded interaction feature per combination
# of base columns (currently pairs only).
frames = (train, test, orig)
for r in [2]:
    for cols in tqdm(list(combinations(columns, r))):
        name = '-'.join(cols)

        # Build the "<v1>_<v2>..." string key for every frame.
        keys = []
        for frame in frames:
            key = frame[cols[0]].astype(str)
            for col in cols[1:]:
                key = key + '_' + frame[col].astype(str)
            keys.append(key)

        # Factorize over all frames at once so the same key gets the same
        # integer code in train, test and orig.
        codes, _ = pd.concat(keys, ignore_index=True).factorize()
        start = 0
        for frame in frames:
            stop = start + len(frame)
            frame[name] = codes[start:stop]
            start = stop

        TE_columns.append(name)

# Every column except the label is a model input (base + interaction columns).
FEATURES = train.columns.tolist()
FEATURES.remove(TARGET)

In [None]:
def _te_prior(target_values, agg):
    """Return the global fallback value used for smoothing and for unseen categories."""
    if agg == 'nunique':
        return 0
    if agg in ('mean', 'median', 'min', 'max'):
        return getattr(target_values, agg)()
    raise ValueError(
        f"Unsupported agg {agg!r}; expected one of 'mean', 'median', 'min', 'max', 'nunique'"
    )


def _te_table(frame, col, target, agg, prior, smooth):
    """Aggregate `target` per category of `col` and return a lookup frame with a 'TE_tmp' column."""
    stats = frame[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
    stats.columns = col + [agg, 'count']
    if agg == 'nunique':
        # Share of distinct target values within the category; no smoothing applied.
        stats['TE_tmp'] = stats[agg] / stats['count']
    else:
        # Shrink the per-category statistic towards the global prior with weight `smooth`.
        stats['TE_tmp'] = ((stats[agg] * stats['count']) + (prior * smooth)) / (stats['count'] + smooth)
    return stats


def target_encode(train, valid, test, col, target=TARGET, kfold=5, smooth=20, agg='mean'):
    """Add a smoothed target encoding of `col` to train, valid and test.

    Training rows are encoded out-of-fold: rows are assigned to `kfold` buckets
    by `index % kfold`, and each bucket is encoded with statistics computed from
    the other buckets. `valid` and `test` are encoded with statistics from the
    whole of `train`. Categories without statistics fall back to the global prior.

    Parameters
    ----------
    train : DataFrame containing `col` and `target`. A helper 'kfold' column and
        the encoded column are written to it in place; the returned train frame
        is a new object without 'kfold'.
    valid, test : DataFrames containing `col`; the encoded column is added in place.
    col : list of column names forming the category key (a single name is also accepted).
    target : name of the target column in `train`.
    kfold : number of index-modulo buckets used for the out-of-fold encoding.
    smooth : weight of the global prior in the smoothed statistic.
    agg : one of 'mean', 'median', 'min', 'max', 'nunique'.

    Returns
    -------
    (train, valid, test) with a float32 column named 'TE_<AGG>_<col joined by _>'.

    Raises
    ------
    ValueError
        If `agg` is not one of the supported aggregations.
    """
    if isinstance(col, str):
        col = [col]
    te_name = f'TE_{agg.upper()}_' + '_'.join(col)
    # The prior does not depend on the fold, so compute (and validate `agg`) once,
    # before `train` is modified.
    prior = _te_prior(train[target], agg)

    train['kfold'] = ((train.index) % kfold)
    train[te_name] = 0.
    for i in range(kfold):
        stats = _te_table(train[train['kfold'] != i], col, target, agg, prior, smooth)
        merged = train[col + ['kfold', te_name]].merge(stats, how='left', on=col)
        held_out = merged['kfold'] == i
        # Only the held-out bucket receives values from this pass.
        merged.loc[held_out, te_name] = merged.loc[held_out, 'TE_tmp']
        train[te_name] = merged[te_name].fillna(prior).values

    # Statistics over the full training frame for the unseen-data frames.
    stats = _te_table(train, col, target, agg, prior, smooth)
    for frame in (valid, test):
        merged = frame[col].merge(stats, how='left', on=col)
        frame[te_name] = merged['TE_tmp'].fillna(prior).values
        frame[te_name] = frame[te_name].astype('float32')

    train = train.drop('kfold', axis=1)
    train[te_name] = train[te_name].astype('float32')

    return (train, valid, test)

def count_encode(train, valid, test, col):
    """Add a frequency encoding of `col`, named 'CE_<col>', to all three frames.

    Frequencies are counted on `train` only. Values of `valid` / `test` that do
    not occur in `train` get 0. The frames are modified in place and returned
    as a (train, valid, test) tuple.
    """
    ce_name = f'CE_{col}'
    freq = train[col].value_counts()

    train[ce_name] = train[col].map(freq)
    for frame in (valid, test):
        # Unseen values map to NaN; treat them as a count of zero.
        frame[ce_name] = frame[col].map(freq).fillna(0)

    return (train, valid, test)


In [None]:
# Out-of-fold (OOF) predictions on train and fold-averaged predictions on test.
# The suffixed buffers belong to the individual models; `oof` / `pred` receive
# the final ensemble once all folds are done.
oof = np.zeros(len(train))
pred = np.zeros(len(test))

oof_xgb = np.zeros(len(train))
pred_xgb = np.zeros(len(test))

oof_lgb = np.zeros(len(train))
pred_lgb = np.zeros(len(test))

oof_cat = np.zeros(len(train))
pred_cat = np.zeros(len(test))

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# StratifiedKFold yields positional row numbers, so rows are selected with
# `iloc` (not by `id` label); column positions are resolved once up front.
feature_pos = train.columns.get_indexer(FEATURES)

for idx, (train_idx, val_idx) in enumerate(skf.split(train, train[TARGET])):
    print(f"========== FOLD {idx+1} ==========")
    X_train = train.iloc[train_idx, feature_pos]
    # Explicit copy: the encoders below add columns to X_val in place.
    X_val = train.iloc[val_idx, feature_pos].copy()
    y_train, y_val = train[TARGET].iloc[train_idx], train[TARGET].iloc[val_idx]
    X_test = test.copy()

    # Augmentation: append the original dataset to the training fold only.
    X_train = pd.concat([X_train, orig[FEATURES]])
    y_train = pd.concat([y_train, orig[TARGET]])

    # Feature engineering: replace every interaction column by its target
    # encoding and its count encoding, fitted on this fold's training data.
    for col in tqdm(TE_columns):
        X_train, X_val, X_test = target_encode(pd.concat([X_train, y_train], axis=1), X_val, X_test, [col], smooth=10, agg='mean')
        X_train = X_train.drop(TARGET, axis=1)
        X_train, X_val, X_test = count_encode(X_train, X_val, X_test, col)

        X_train = X_train.drop(col, axis=1)
        X_val = X_val.drop(col, axis=1)
        X_test = X_test.drop(col, axis=1)

    # All three models are given the 'category' dtype directly; re-cast after
    # the concatenations above to make sure the dtype is still in place.
    # NOTE(review): each frame is cast independently — presumably the category
    # levels coincide across train/val/test; confirm, since integer category
    # codes would otherwise not line up between frames.
    X_train[CATS] = X_train[CATS].astype('category')
    X_val[CATS] = X_val[CATS].astype('category')
    X_test[CATS] = X_test[CATS].astype('category')

    # --- 1. XGBoost ---
    print("\n--- Training XGBoost ---")
    param_grid_xgb = {'colsample_bytree': 0.34, 'subsample': 0.89, 'reg_lambda': 4.06, 'reg_alpha': 2.91, 'max_depth': 8}
    model_xgb = XGBClassifier(**param_grid_xgb,
                              n_estimators=10000,
                              objective='binary:logistic', eval_metric='auc', learning_rate=0.01,
                              early_stopping_rounds=200, random_state=42+idx,
                              enable_categorical=True, device='cuda', n_jobs=-1)

    model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    oof_xgb[val_idx] = model_xgb.predict_proba(X_val)[:, 1]
    pred_xgb += model_xgb.predict_proba(X_test)[:, 1]
    print(f'XGB Fold {idx + 1} AUC: {roc_auc_score(y_val, oof_xgb[val_idx])}')

    # --- 2. LightGBM ---
    print("\n--- Training LightGBM ---")
    params_lgb = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'n_estimators': 10000,
                  'lambda_l1': 0.1, 'lambda_l2': 0.1, 'num_leaves': 31, 'feature_fraction': 0.7,
                  'bagging_fraction': 0.7, 'bagging_freq': 1, 'verbose': -1, 'n_jobs': -1, 'seed': 42+idx}

    model_lgb = lgb.LGBMClassifier(**params_lgb)

    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(200, verbose=False)])

    oof_lgb[val_idx] = model_lgb.predict_proba(X_val)[:, 1]
    pred_lgb += model_lgb.predict_proba(X_test)[:, 1]
    print(f'LGB Fold {idx + 1} AUC: {roc_auc_score(y_val, oof_lgb[val_idx])}')

    # --- 3. CatBoost ---
    print("\n--- Training CatBoost ---")
    # The categorical features are passed to CatBoost by name.
    model_cat = CatBoostClassifier(iterations=10000, learning_rate=0.02, loss_function='Logloss', eval_metric='AUC',
                                   random_seed=42+idx, verbose=0, cat_features=CATS,
                                   early_stopping_rounds=200, task_type="GPU")  # train on GPU

    model_cat.fit(X_train, y_train, eval_set=(X_val, y_val))

    oof_cat[val_idx] = model_cat.predict_proba(X_val)[:, 1]
    pred_cat += model_cat.predict_proba(X_test)[:, 1]
    print(f'CAT Fold {idx + 1} AUC: {roc_auc_score(y_val, oof_cat[val_idx])}')

    # Free memory before the next fold.
    del model_xgb, model_lgb, model_cat, X_train, X_val, y_train, y_val, X_test
    gc.collect()

# Everything below runs once, after all folds. Inside the loop the OOF vectors
# are still partly zero, and dividing there would rescale the accumulated test
# predictions on every fold instead of averaging them.
print(f"CV AUC XGB: {roc_auc_score(train[TARGET], oof_xgb)}")
print(f"CV AUC LGB: {roc_auc_score(train[TARGET], oof_lgb)}")
print(f"CV AUC CAT: {roc_auc_score(train[TARGET], oof_cat)}")

# Turn the per-fold sums of test predictions into fold averages.
pred_xgb /= skf.n_splits
pred_lgb /= skf.n_splits
pred_cat /= skf.n_splits

# Simple unweighted average of the three models.
ensemble_pred = (pred_xgb + pred_lgb + pred_cat) / 3

# The same blend on the OOF predictions gives the ensemble's CV score.
ensemble_oof = (oof_xgb + oof_lgb + oof_cat) / 3
print(f"\nCV AUC Ensemble: {roc_auc_score(train[TARGET], ensemble_oof)}")

# Expose the final ensemble under the generic names as well, so they no longer
# hold the all-zero placeholders.
oof, pred = ensemble_oof, ensemble_pred

In [None]:
# Write the ensemble submission together with the matching OOF / test prediction
# files. The file and column names still say "xgb" and are kept unchanged so
# that anything reading these files keeps working.
submission = pd.read_csv('/kaggle/input/playground-series-s5e8/sample_submission.csv')
submission['y'] = ensemble_pred
submission.to_csv('xgb.csv', index=False)
# Save the ensemble arrays that belong to the submission above, not the
# zero-initialised `oof` / `pred` buffers.
pd.DataFrame({'xgb_oof': ensemble_oof}).to_csv('xgb_oof.csv', index=False)
pd.DataFrame({'xgb_pred': ensemble_pred}).to_csv('xgb_pred.csv', index=False)