In [1]:
pip install -U numpy pandas matplotlib seaborn scikit-learn lightgbm

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting numpy
  Downloading https://mirrors.aliyun.com/pypi/packages/10/a2/010b0e27ddeacab7839957d7a8f00e91206e0c2c47abbb5f35a2630e5387/numpy-2.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m744.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pandas
  Downloading https://mirrors.aliyun.com/pypi/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m769.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting matplotlib
  Downloading https://mirrors.aliyun.com/pypi/packages/10/b7/4aa196155b4d846bd749cf82aa5a4c300cf55a8b5e0dfa5b722a63c0f8a0/matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_

In [5]:
pip install -U numpy pandas matplotlib seaborn scikit-learn xgboost

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting xgboost
  Downloading https://mirrors.aliyun.com/pypi/packages/64/ad/61a86228e981b15361ff963e84648b1a29ab43debd95f7c2b3ef9d94dca1/xgboost-3.0.5-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

print("xgboost version:", xgb.__version__)
X, y = make_classification(n_samples=2000, n_features=30, random_state=0)
Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="gpu_hist",   # 关键：GPU
    predictor="gpu_predictor",
    n_estimators=200,
    learning_rate=0.1
)
clf.fit(Xtr, ytr, eval_set=[(Xva, yva)], verbose=False)
print("AUC OK, GPU hist works.")


xgboost version: 3.0.5



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC OK, GPU hist works.


In [2]:
# =========================================
# Home Credit Default Risk - Runnable Script (WOA-XGBoost, GPU, xgb.train 兼容)
# 数据目录：/hy-tmp/home-credit-default-risk/
# 输出文件：lgbm_feature_importance.png, roc_curve.png, submission.csv
# =========================================

import os
import gc
import time
import warnings
from contextlib import contextmanager

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

warnings.filterwarnings('ignore')

# ---------- 基础设置 ----------
DATA_DIR = "/hy-tmp"  # ← 你的数据目录
os.makedirs(".", exist_ok=True)                # 确保当前目录可写

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print(f"{title} - Completed in: {time.time() - t0:.0f} seconds")

def _try_paths(base_dir, name):
    candidates = [f"{name}.csv", f"{name}.CSV", name]
    for cand in candidates:
        p = os.path.join(base_dir, cand)
        if os.path.isfile(p):
            return p
    raise FileNotFoundError(
        f"Cannot find file for '{name}' in '{base_dir}'. Tried: {', '.join(candidates)}"
    )

def read_csv_smart(base_dir, name, **kwargs):
    path = _try_paths(base_dir, name)
    return pd.read_csv(path, **kwargs)

# ---------- 1. 读取数据 ----------
print("Reading datasets...")
with timer("Read data"):
    app_train = read_csv_smart(DATA_DIR, "application_train")
    app_test  = read_csv_smart(DATA_DIR, "application_test")
    print('Number of training samples:', len(app_train))
    print('Number of test samples:', len(app_test))

# ---------- 2. 数据预处理 ----------
with timer("Data preprocessing"):
    print('Target variable distribution:')
    print(app_train['TARGET'].value_counts())
    print('Positive sample ratio: {:.2%}'.format(app_train['TARGET'].mean()))
    
    for df in (app_train, app_test):
        if 'DAYS_EMPLOYED' in df.columns:
            df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    
    categorical_features = [col for col in app_train.columns if app_train[col].dtype == 'object']
    print('Number of categorical features:', len(categorical_features))
    for col in categorical_features:
        le = LabelEncoder()
        le.fit(list(app_train[col].astype(str).values) + list(app_test[col].astype(str).values))
        app_train[col] = le.transform(list(app_train[col].astype(str).values))
        app_test[col]  = le.transform(list(app_test[col].astype(str).values))

# ---------- 3. 基础衍生特征 ----------
with timer("Create basic features"):
    for df in (app_train, app_test):
        df['CREDIT_INCOME_RATIO']  = df['AMT_CREDIT']  / df['AMT_INCOME_TOTAL']
        df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
        df['CREDIT_TERM']          = df['AMT_CREDIT']  / df['AMT_ANNUITY']
        df['DAYS_EMPLOYED_RATIO']  = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']

# ---------- 4. Bureau ----------
with timer("Process Bureau data"):
    bureau = read_csv_smart(DATA_DIR, "bureau")
    bb     = read_csv_smart(DATA_DIR, "bureau_balance")  # 读取保持一致，不直接使用
    
    bureau_counts   = bureau.groupby('SK_ID_CURR')['SK_ID_BUREAU'].count().reset_index().rename(
        columns={'SK_ID_BUREAU': 'BUREAU_LOAN_COUNT'})
    bureau_avg_loan = bureau.groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].mean().reset_index().rename(
        columns={'AMT_CREDIT_SUM': 'BUREAU_AVG_LOAN'})
    bureau_overdue  = bureau.groupby('SK_ID_CURR')['CREDIT_DAY_OVERDUE'].max().reset_index().rename(
        columns={'CREDIT_DAY_OVERDUE': 'BUREAU_MAX_OVERDUE'})
    
    app_train = app_train.merge(bureau_counts,   on='SK_ID_CURR', how='left')
    app_train = app_train.merge(bureau_avg_loan, on='SK_ID_CURR', how='left')
    app_train = app_train.merge(bureau_overdue,  on='SK_ID_CURR', how='left')
    
    app_test  = app_test.merge(bureau_counts,   on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(bureau_avg_loan, on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(bureau_overdue,  on='SK_ID_CURR', how='left')

    del bureau, bb, bureau_counts, bureau_avg_loan, bureau_overdue
    gc.collect()

# ---------- 5. Previous Application ----------
with timer("Process Previous_application data"):
    prev = read_csv_smart(DATA_DIR, "previous_application")
    
    prev_app_counts = prev.groupby('SK_ID_CURR')['SK_ID_PREV'].count().reset_index().rename(
        columns={'SK_ID_PREV': 'PREV_APP_COUNT'})
    prev_app_amt = prev.groupby('SK_ID_CURR')['AMT_CREDIT'].mean().reset_index().rename(
        columns={'AMT_CREDIT': 'PREV_APP_AVG_AMOUNT'})
    prev_app_rejected = prev.groupby('SK_ID_CURR')['NAME_CONTRACT_STATUS'].apply(
        lambda x: (x == 'Refused').sum() / len(x)
    ).reset_index().rename(columns={'NAME_CONTRACT_STATUS': 'PREV_APP_REJECTION_RATIO'})
    
    app_train = app_train.merge(prev_app_counts,   on='SK_ID_CURR', how='left')
    app_train = app_train.merge(prev_app_amt,      on='SK_ID_CURR', how='left')
    app_train = app_train.merge(prev_app_rejected, on='SK_ID_CURR', how='left')
    
    app_test  = app_test.merge(prev_app_counts,   on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(prev_app_amt,      on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(prev_app_rejected, on='SK_ID_CURR', how='left')

    del prev, prev_app_counts, prev_app_amt, prev_app_rejected
    gc.collect()

# ---------- 6. Installments ----------
with timer("Process Installments_payments data"):
    ins = read_csv_smart(DATA_DIR, "installments_payments")
    
    ins['DAYS_LATE'] = (ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']).clip(lower=0)
    avg_late_days = ins.groupby('SK_ID_CURR')['DAYS_LATE'].mean().reset_index().rename(
        columns={'DAYS_LATE': 'AVG_LATE_DAYS'})
    ins['PAYMENT_RATIO'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    avg_payment_ratio = ins.groupby('SK_ID_CURR')['PAYMENT_RATIO'].mean().reset_index().rename(
        columns={'PAYMENT_RATIO': 'AVG_PAYMENT_RATIO'})
    
    app_train = app_train.merge(avg_late_days,     on='SK_ID_CURR', how='left')
    app_train = app_train.merge(avg_payment_ratio, on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(avg_late_days,      on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(avg_payment_ratio,  on='SK_ID_CURR', how='left')

    del ins, avg_late_days, avg_payment_ratio
    gc.collect()

# ---------- 7. POS_CASH ----------
with timer("Process POS_CASH_balance data"):
    pos = read_csv_smart(DATA_DIR, "POS_CASH_balance")
    
    avg_pos_dpd = pos.groupby('SK_ID_CURR')['SK_DPD'].mean().reset_index().rename(
        columns={'SK_DPD': 'AVG_POS_DPD'})
    max_pos_dpd = pos.groupby('SK_ID_CURR')['SK_DPD'].max().reset_index().rename(
        columns={'SK_DPD': 'MAX_POS_DPD'})
    pos_counts = pos.groupby('SK_ID_CURR').size().reset_index().rename(columns={0: 'POS_COUNT'})
    
    app_train = app_train.merge(avg_pos_dpd, on='SK_ID_CURR', how='left')
    app_train = app_train.merge(max_pos_dpd, on='SK_ID_CURR', how='left')
    app_train = app_train.merge(pos_counts,  on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(avg_pos_dpd, on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(max_pos_dpd, on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(pos_counts,  on='SK_ID_CURR', how='left')

    del pos, avg_pos_dpd, max_pos_dpd, pos_counts
    gc.collect()

# ---------- 8. Credit Card ----------
with timer("Process Credit_card_balance data"):
    cc = read_csv_smart(DATA_DIR, "credit_card_balance")
    
    cc_counts = cc.groupby('SK_ID_CURR')['SK_ID_PREV'].nunique().reset_index().rename(
        columns={'SK_ID_PREV': 'CC_COUNT'})
    avg_cc_balance = cc.groupby('SK_ID_CURR')['AMT_BALANCE'].mean().reset_index().rename(
        columns={'AMT_BALANCE': 'AVG_CC_BALANCE'})
    max_cc_dpd = cc.groupby('SK_ID_CURR')['SK_DPD'].max().reset_index().rename(
        columns={'SK_DPD': 'MAX_CC_DPD'})
    
    app_train = app_train.merge(cc_counts,      on='SK_ID_CURR', how='left')
    app_train = app_train.merge(avg_cc_balance, on='SK_ID_CURR', how='left')
    app_train = app_train.merge(max_cc_dpd,     on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(cc_counts,       on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(avg_cc_balance,  on='SK_ID_CURR', how='left')
    app_test  = app_test.merge(max_cc_dpd,      on='SK_ID_CURR', how='left')

    del cc, cc_counts, avg_cc_balance, max_cc_dpd
    gc.collect()

# ---------- 9. 缺失值填充（修复 inf/-inf 问题） ----------
with timer("Fill missing values"):
    # 先把 inf/-inf 统一替换为 NaN（比值分母为 0 时会产生 inf）
    for df in (app_train, app_test):
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # 再做填充
    app_train = app_train.fillna(-999)
    app_test  = app_test.fillna(-999)
    print('Training set shape:', app_train.shape)
    print('Test set shape:', app_test.shape)

# ------------------- 10. 训练 WOA-XGBoost（GPU, 使用 xgb.train） -------------------
with timer("Train WOA-XGBoost (GPU)"):
    features = [c for c in app_train.columns if c not in ['TARGET', 'SK_ID_CURR']]
    X, y = app_train[features], app_train['TARGET']
    X_test = app_test[features]
    
    print('Number of features:', len(features))
    print('X shape:', X.shape)
    print('y shape:', y.shape)
    print('X_test shape:', X_test.shape)

    # ----- 参数空间 -----
    space = {
        "max_depth":          (3, 10, 'int'),
        "learning_rate":      (0.01, 0.2, 'float'),
        "subsample":          (0.5, 1.0, 'float'),
        "colsample_bytree":   (0.5, 1.0, 'float'),
        "reg_alpha":          (0.0, 10.0, 'float'),
        "reg_lambda":         (0.0, 10.0, 'float'),
        "min_child_weight":   (1.0, 50.0, 'float'),
        "gamma":              (0.0, 10.0, 'float'),
        "n_estimators":       (200, 2000, 'int')
    }
    keys = list(space.keys())
    dim = len(keys)

    def clip_cast(name, v):
        low, high, tp = space[name]
        v = np.minimum(np.maximum(v, low), high)
        return int(round(v)) if tp == 'int' else float(v)

    def vec_to_param_dict(vec):
        # xgb.train 的 params（不包含 num_boost_round）
        raw = {k: clip_cast(k, v) for k, v in zip(keys, vec)}
        num_boost_round = int(raw.pop("n_estimators"))
        params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "tree_method": "gpu_hist",
            "predictor": "gpu_predictor",
            "max_depth": raw["max_depth"],
            "learning_rate": raw["learning_rate"],
            "subsample": raw["subsample"],
            "colsample_bytree": raw["colsample_bytree"],
            "reg_alpha": raw["reg_alpha"],
            "reg_lambda": raw["reg_lambda"],
            "min_child_weight": raw["min_child_weight"],
            "gamma": raw["gamma"],
            "max_bin": 256,
            "verbosity": 0,
            "seed": RANDOM_STATE
        }
        return params, num_boost_round

    # 预测工具：兼容不同版本的 best_ntree_limit / best_iteration
    def predict_with_best(bst, dmat):
        if hasattr(bst, "best_ntree_limit") and bst.best_ntree_limit:
            return bst.predict(dmat, ntree_limit=bst.best_ntree_limit)
        elif hasattr(bst, "best_iteration") and bst.best_iteration is not None:
            try:
                return bst.predict(dmat, iteration_range=(0, bst.best_iteration + 1))
            except TypeError:
                return bst.predict(dmat)
        else:
            return bst.predict(dmat)

    # 适应度：3-Fold AUC + 早停（DMatrix 显式 missing=np.nan）
    folds_woa = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    cache = {}

    def fitness(vec):
        key = tuple(np.round(vec, 6))
        if key in cache:
            return cache[key]
        params, num_round = vec_to_param_dict(vec)
        aucs = []
        for trn_idx, val_idx in folds_woa.split(X, y):
            X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
            y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]
            dtrn = xgb.DMatrix(X_trn, label=y_trn, missing=np.nan)
            dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)
            watchlist = [(dtrn, 'train'), (dval, 'valid')]

            bst = xgb.train(
                params=params,
                dtrain=dtrn,
                num_boost_round=num_round,
                evals=watchlist,
                early_stopping_rounds=200,
                verbose_eval=False
            )
            pred_val = predict_with_best(bst, dval)
            aucs.append(roc_auc_score(y_val, pred_val))
            del bst, dtrn, dval, X_trn, X_val, y_trn, y_val
        score = float(np.mean(aucs))
        cache[key] = score
        return score

    # 初始化 WOA
    pop_size = 12
    max_iter = 15
    rng = np.random.RandomState(RANDOM_STATE)
    lb = np.array([space[k][0] for k in keys], dtype=float)
    ub = np.array([space[k][1] for k in keys], dtype=float)
    population = lb + (ub - lb) * rng.rand(pop_size, dim)
    fitness_vals = np.array([fitness(ind) for ind in population])
    best_idx = int(np.argmax(fitness_vals))
    best_pos = population[best_idx].copy()
    best_score = float(fitness_vals[best_idx])
    print(f"WOA init best AUC: {best_score:.6f}")

    # 主循环
    b = 1.0
    for t in range(max_iter):
        a = 2 - 2 * (t / (max_iter - 1 + 1e-12))
        for i in range(pop_size):
            r1, r2 = rng.rand(), rng.rand()
            A = 2 * a * r1 - a
            C = 2 * r2
            p = rng.rand()
            Xi = population[i].copy()

            if p < 0.5:
                if abs(A) < 1:
                    D = np.abs(C * best_pos - Xi)
                    new_pos = best_pos - A * D
                else:
                    rand_idx = rng.randint(pop_size)
                    Xrand = population[rand_idx]
                    D = np.abs(C * Xrand - Xi)
                    new_pos = Xrand - A * D
            else:
                l = rng.uniform(-1, 1)
                D = np.abs(best_pos - Xi)
                new_pos = D * np.exp(b * l) * np.cos(2 * np.pi * l) + best_pos

            new_pos = np.minimum(np.maximum(new_pos, lb), ub)
            new_fit = fitness(new_pos)
            if new_fit > fitness_vals[i]:
                population[i] = new_pos
                fitness_vals[i] = new_fit

        iter_best_idx = int(np.argmax(fitness_vals))
        iter_best_fit = float(fitness_vals[iter_best_idx])
        if iter_best_fit > best_score:
            best_score = iter_best_fit
            best_pos = population[iter_best_idx].copy()
        print(f"WOA iter {t+1}/{max_iter} best AUC: {best_score:.6f}")

    # 最优参数并 5-Fold 训练
    best_params, best_num_round = vec_to_param_dict(best_pos)
    print("Best params from WOA:")
    for k in keys:
        if k == "n_estimators":
            continue
        if k in best_params:
            print(f"  {k}: {best_params[k]}")
        else:
            print(f"  {k}: {clip_cast(k, dict(zip(keys, best_pos))[k])}")
    print(f"  n_estimators (num_boost_round): {best_num_round}")

    n_folds = 5
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    oof_preds = np.zeros(X.shape[0], dtype=float)
    test_preds = np.zeros(X_test.shape[0], dtype=float)
    feature_importance_df = pd.DataFrame()

    dtest = xgb.DMatrix(X_test, missing=np.nan)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y), 1):
        print(f'Fold {fold_}')
        X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]
        dtrn = xgb.DMatrix(X_trn, label=y_trn, missing=np.nan)
        dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)

        watchlist = [(dtrn, 'train'), (dval, 'valid')]
        bst = xgb.train(
            params=best_params,
            dtrain=dtrn,
            num_boost_round=best_num_round,
            evals=watchlist,
            early_stopping_rounds=200,
            verbose_eval=200
        )

        oof_preds[val_idx] = predict_with_best(bst, dval)
        test_preds += predict_with_best(bst, dtest) / n_folds

        # 特征重要性（gain）
        gain_map = bst.get_score(importance_type='gain')  # 可能是 f0/f1... 或列名
        mapped = {}
        for k, v in gain_map.items():
            if k.startswith('f') and k[1:].isdigit():
                idx = int(k[1:])
                if 0 <= idx < len(features):
                    mapped[features[idx]] = v
            else:
                mapped[k] = v
        imp_series = pd.Series(mapped)
        if not imp_series.empty:
            fold_imp = pd.DataFrame({
                "feature": imp_series.index,
                "importance": imp_series.values,
                "fold": fold_
            })
            feature_importance_df = pd.concat([feature_importance_df, fold_imp], axis=0)

        del bst, dtrn, dval, X_trn, X_val, y_trn, y_val
        gc.collect()

    cv_auc = roc_auc_score(y, oof_preds)
    print(f'Full AUC score: {cv_auc:.6f}')

    # 特征重要性图（沿用文件名）
    plt.figure(figsize=(10, 20))
    if feature_importance_df.empty:
        print("Warning: feature_importance_df is empty; using zeros as fallback.")
        importance = pd.Series(0, index=features, dtype=float)
    else:
        feature_importance = (feature_importance_df
                              .groupby('feature')['importance']
                              .mean()
                              .sort_values(ascending=False))
        importance = feature_importance

    top_features = importance.head(30).index
    sns.barplot(y=top_features, x=importance[top_features], orient='h')
    plt.title('XGBoost Features (Top 30 by gain importance)')
    plt.tight_layout()
    plt.savefig('lgbm_feature_importance.png')
    plt.close()

    # ROC 曲线
    plt.figure(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y, oof_preds)
    plt.plot(fpr, tpr, label=f'CV AUC: {cv_auc:.4f}')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.tight_layout()
    plt.savefig('roc_curve.png')
    plt.close()

# ---------- 11. 生成提交文件 ----------
with timer("Generate prediction results"):
    submission = pd.DataFrame({
        'SK_ID_CURR': app_test['SK_ID_CURR'],
        'TARGET': test_preds
    })
    submission.to_csv('submission.csv', index=False)
    print('Prediction results have been saved as submission.csv')

print('Done! Final CV score:', cv_auc)


Reading datasets...
Number of training samples: 307511
Number of test samples: 48744
Read data - Completed in: 4 seconds
Target variable distribution:
TARGET
0    282686
1     24825
Name: count, dtype: int64
Positive sample ratio: 8.07%
Number of categorical features: 16
Data preprocessing - Completed in: 4 seconds
Create basic features - Completed in: 0 seconds
Process Bureau data - Completed in: 7 seconds
Process Previous_application data - Completed in: 31 seconds
Process Installments_payments data - Completed in: 9 seconds
Process POS_CASH_balance data - Completed in: 7 seconds
Process Credit_card_balance data - Completed in: 6 seconds
Training set shape: (307511, 140)
Test set shape: (48744, 139)
Fill missing values - Completed in: 0 seconds
Number of features: 138
X shape: (307511, 138)
y shape: (307511,)
X_test shape: (48744, 138)
WOA init best AUC: 0.776585
WOA iter 1/15 best AUC: 0.777694
WOA iter 2/15 best AUC: 0.777694
WOA iter 3/15 best AUC: 0.777877
WOA iter 4/15 best AUC: