In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/binary-classification-with-a-bank-dataset-clone/sample_submission.csv
/kaggle/input/binary-classification-with-a-bank-dataset-clone/train.csv
/kaggle/input/binary-classification-with-a-bank-dataset-clone/test.csv


In [4]:
# ==========================
# Load Data
# ==========================
# Read the train and test CSVs and discard the `id` column from each.
train, test = (
    pd.read_csv(csv_path).drop(columns=['id'])
    for csv_path in (
        "/kaggle/input/binary-classification-with-a-bank-dataset-clone/train.csv",
        "/kaggle/input/binary-classification-with-a-bank-dataset-clone/test.csv",
    )
)


In [5]:
import warnings
import optuna

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

import xgboost as xgb
import lightgbm as lgb

# Silence all Python warnings and limit Optuna logging to WARNING and above.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Baseline: mutual information between every raw feature and the target `y`.
X = train.drop(columns=['y']).copy()
y = train['y']

# mutual_info_classif needs numeric input, so integer-encode the string columns.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# The estimator is randomized; a fixed seed keeps the scores (and therefore the
# ranking) identical across re-runs of this cell.
# NOTE(review): with discrete_features='auto' a dense X is treated as
# all-continuous, which includes the label-encoded categoricals -- confirm
# that this is intended.
mi = mutual_info_classif(X, y, discrete_features='auto', random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series)


duration     0.154631
housing      0.120942
poutcome     0.118989
balance      0.070033
contact      0.059184
marital      0.057853
education    0.056173
month        0.053843
job          0.033609
pdays        0.030757
campaign     0.024125
age          0.018686
day          0.014402
loan         0.012775
previous     0.012011
default      0.000999
dtype: float64


In [6]:
def apply_feature_engineering(df, reference=None):
    """Return a copy of `df` with encoded categoricals and engineered features.

    Every statistic the transformation needs (category -> integer codes, the
    median call duration, and the per-group means) is learned from
    `reference` and then applied to `df`. When `reference` is omitted the
    statistics are learned from `df` itself.

    Passing the training frame as `reference` when transforming the test frame
    keeps both frames on the same encoding and the same thresholds, e.g.::

        train_fe = apply_feature_engineering(train)
        test_fe = apply_feature_engineering(test, reference=train)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. Must contain the columns read below (`poutcome`,
        `duration`, `housing`, `campaign`, `job`, `age`, `balance`,
        `education`, `marital`, `loan`, `default`, `previous`). It is not
        modified.
    reference : pandas.DataFrame, optional
        Frame the statistics are fitted on. Defaults to `df`.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the engineered columns. Object
        columns other than `y` are replaced by int64 codes: the position of
        the value among the sorted distinct non-null values of the same
        column in `reference`, or -1 when the value is missing or does not
        occur in `reference`. Group-mean features are NaN for such rows.
    """
    ref = df if reference is None else reference
    df_fe = df.copy()

    # Integer-encode string columns using the sorted category list of the
    # reference frame, so the same category always receives the same code.
    for col in df_fe.select_dtypes(include='object').columns:
        if col == 'y':
            continue
        categories = sorted(ref[col].dropna().unique())
        code_of = {category: code for code, category in enumerate(categories)}
        df_fe[col] = df_fe[col].map(code_of).fillna(-1).astype('int64')

    def _group_mean(key, value):
        # Mean of `value` per `key` in the reference frame, looked up for each
        # row of `df` through its raw (un-encoded) key.
        return df[key].map(ref.groupby(key)[value].mean())

    # Pairwise products of the strongest individual features.
    df_fe['poutcome_duration'] = df_fe['poutcome'] * df_fe['duration']
    df_fe['housing_duration'] = df_fe['housing'] * df_fe['duration']
    df_fe['poutcome_housing'] = df_fe['poutcome'] * df_fe['housing']

    # Transformations of the call duration.
    df_fe['duration_per_campaign'] = df_fe['duration'] / (df_fe['campaign'] + 1)
    df_fe['duration_log'] = np.log1p(df_fe['duration'])
    df_fe['is_long_call'] = (df_fe['duration'] > ref['duration'].median()).astype(int)

    # Group-level averages.
    df_fe['job_duration_mean'] = _group_mean('job', 'duration')
    df_fe['job_age_mean'] = _group_mean('job', 'age')
    df_fe['job_balance_mean'] = _group_mean('job', 'balance')

    df_fe['education_duration_mean'] = _group_mean('education', 'duration')
    df_fe['education_balance_mean'] = _group_mean('education', 'balance')

    df_fe['marital_age_mean'] = _group_mean('marital', 'age')

    # Sums of the encoded loan/default indicator columns.
    df_fe['total_loans'] = df_fe['housing'] + df_fe['loan']
    df_fe['financial_stress'] = (df_fe['default'] + df_fe['housing'] + df_fe['loan']).clip(0, 3)

    # Despite the name, this is the integer code of `poutcome` divided by
    # (previous + 1), not an observed rate.
    df_fe['contact_success_rate'] = df_fe['poutcome'] / (df_fe['previous'] + 1)

    df_fe['age_balance_ratio'] = df_fe['balance'] / (df_fe['age'] + 1)

    return df_fe


In [7]:
# Build the engineered versions of the train and test frames (train first).
# NOTE(review): each call is given only its own frame, so the encoders, the
# duration median and the group means are fitted separately for train and
# test -- confirm this is intended.
train_fe, test_fe = map(apply_feature_engineering, (train, test))


In [8]:
# Mutual information of every feature (raw and engineered) with the target `y`.
X = train_fe.drop(columns=['y']).copy()
y = train_fe['y']

# Safety net: integer-encode any column that is still of object dtype.
# apply_feature_engineering already encodes object columns, so this loop is
# normally a no-op.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# The estimator is randomized; a fixed seed keeps the scores (and therefore the
# ranking) identical across re-runs of this cell.
mi = mutual_info_classif(X, y, discrete_features='auto', random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series)


duration_log               0.155157
is_long_call               0.155091
duration                   0.154517
poutcome_duration          0.151345
duration_per_campaign      0.145703
contact_success_rate       0.127087
housing                    0.120864
poutcome                   0.119656
poutcome_housing           0.112343
marital_age_mean           0.085463
education_duration_mean    0.078729
housing_duration           0.078319
balance                    0.069715
financial_stress           0.069549
total_loans                0.069228
education_balance_mean     0.066718
age_balance_ratio          0.059430
contact                    0.058424
marital                    0.057723
education                  0.056240
month                      0.054559
job_age_mean               0.041661
job                        0.034055
job_balance_mean           0.030406
pdays                      0.029707
job_duration_mean          0.028646
campaign                   0.024858
age                        0

In [9]:
# Cross-validation splitter used by the Optuna objectives. The fixed seed makes
# every kf.split() call produce the same shuffled folds, so all trials are
# scored on identical splits and their AUCs are directly comparable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [10]:
def xgb_objective(trial):
    """Optuna objective returning the mean cross-validated ROC AUC of an XGBoost model.

    One hyperparameter configuration is drawn from `trial`, an `XGBClassifier`
    is built from it, and the classifier is scored with `cross_val_score` on
    the notebook-level `X`, `y` and `kf`.
    """
    # Hyperparameters sampled for this trial.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
    }
    # Settings that are the same for every trial.
    fixed = {
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'verbosity': 0,
    }
    classifier = xgb.XGBClassifier(**searched, **fixed)
    fold_scores = cross_val_score(classifier, X, y, cv=kf, scoring='roc_auc', n_jobs=1)
    return fold_scores.mean()


In [11]:
def lgb_objective(trial):
    """Optuna objective returning the mean cross-validated ROC AUC of a LightGBM model.

    One hyperparameter configuration is drawn from `trial`, an
    `LGBMClassifier` is built from it, and the classifier is scored with
    `cross_val_score` on the notebook-level `X`, `y` and `kf`.

    `subsample_freq` is fixed to 1 because LightGBM only performs row
    subsampling when the bagging frequency is non-zero; with the default of 0
    the sampled `subsample` value would have no effect on the model. Any
    classifier built from the best parameters of this study must therefore
    also set `subsample_freq=1` to reproduce the tuned behaviour.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'num_leaves': trial.suggest_int('num_leaves', 30, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # Enable bagging on every iteration so `subsample` is actually applied.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'device': 'gpu',
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1
    }
    model = lgb.LGBMClassifier(**params)
    score = cross_val_score(model, X, y, cv=kf, scoring='roc_auc', n_jobs=1).mean()
    return score


In [12]:
def run_te_cvens_blending(train_data, test_data, submission_path, save_path, n_splits=10,
                          xgb_model=None, lgb_model=None, random_state=42, cat_cols=None,
                          smoothing=5):
    """Cross-validated XGBoost + LightGBM blend with per-fold target encoding.

    For each stratified fold the categorical columns are target-encoded with
    statistics from the training part of the fold only, both models are fitted
    on that part, and their positive-class probabilities are averaged. The
    blended validation AUC is recorded per fold, and the blended test
    predictions of all folds are averaged into the submission file.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame containing the target column ``y``.
    test_data : pandas.DataFrame
        Test frame; must contain every feature column of `train_data`. Extra
        columns are ignored and the column order is aligned to the training
        features.
    submission_path : str
        CSV read as the submission template; its ``y`` column is overwritten.
    save_path : str
        Path the finished submission CSV is written to.
    n_splits : int, default 10
        Number of stratified folds.
    xgb_model, lgb_model : estimator, optional
        Estimators to fit. The same object is re-fitted in every fold. When
        omitted, a default classifier is created per fold.
    random_state : int or None, default 42
        Seed for the fold shuffling, so scores and the submission are
        reproducible. Pass None for unseeded shuffling.
    cat_cols : list of str, optional
        Columns to target-encode. Defaults to the object-dtype columns of the
        training features; frames whose categoricals are already
        integer-encoded therefore get no target encoding unless the columns
        are listed here.
    smoothing : int or float, default 5
        Weight of the global target mean in the smoothed category mean
        ``(mean * n + global_mean * smoothing) / (n + smoothing)``.

    Returns
    -------
    None
        The per-fold scores and their mean are printed; the submission is
        written to `save_path`.
    """
    X = train_data.drop(columns=["y"]).copy()
    y = train_data["y"]
    # Use exactly the training feature columns, in the training order, so the
    # fitted models see the same layout at prediction time.
    test_features = test_data[X.columns]

    if cat_cols is None:
        cat_cols = X.select_dtypes("object").columns.tolist()
    else:
        cat_cols = list(cat_cols)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_rows = []
    test_preds = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
        X_train, X_valid = X.iloc[train_idx].copy(), X.iloc[valid_idx].copy()
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        test_fold = test_features.copy()

        # Target encoding is fitted on the training part of the fold only, so
        # the validation rows never contribute to their own encoding.
        global_mean = y_train.mean()
        for col in cat_cols:
            stats = y_train.groupby(X_train[col]).agg(["mean", "count"])
            smoothed = (
                (stats["mean"] * stats["count"] + global_mean * smoothing)
                / (stats["count"] + smoothing)
            )
            # Categories not seen in the training part fall back to the global mean.
            for frame in (X_train, X_valid, test_fold):
                frame[col] = frame[col].map(smoothed).fillna(global_mean)

        xgb_clf = (
            xgb_model if xgb_model is not None
            else xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
        )
        lgb_clf = (
            lgb_model if lgb_model is not None
            else lgb.LGBMClassifier(objective="binary", verbose=-1)
        )

        xgb_clf.fit(X_train, y_train)
        lgb_clf.fit(X_train, y_train)

        # Equal-weight blend of the two models' positive-class probabilities.
        valid_blend = (
            xgb_clf.predict_proba(X_valid)[:, 1] + lgb_clf.predict_proba(X_valid)[:, 1]
        ) / 2.0
        cv_rows.append({"fold": fold, "roc_auc": roc_auc_score(y_valid, valid_blend)})

        test_blend = (
            xgb_clf.predict_proba(test_fold)[:, 1] + lgb_clf.predict_proba(test_fold)[:, 1]
        ) / 2.0
        test_preds.append(test_blend)

    CV_result = pd.DataFrame(cv_rows)
    print(CV_result)
    print(f"Mean CV Score: {CV_result['roc_auc'].mean():.5f}")

    # Average the per-fold test predictions and write them into the template.
    y_test_pred_proba = np.mean(test_preds, axis=0)
    submission = pd.read_csv(submission_path)
    submission["y"] = y_test_pred_proba
    submission.to_csv(save_path, index=False)


In [13]:
# XGBoost classifier with hard-coded hyperparameters.
# NOTE(review): presumably the best values of an Optuna study over
# xgb_objective -- confirm the provenance of these numbers.
xgb_tuned = xgb.XGBClassifier(**{
    "use_label_encoder": False,
    "eval_metric": "logloss",
    # Ensemble size, tree depth and step size
    "n_estimators": 810,
    "max_depth": 12,
    "learning_rate": 0.04122760306111473,
    # Row / column subsampling
    "subsample": 0.9920195260264391,
    "colsample_bytree": 0.7260265095394081,
    "colsample_bylevel": 0.9989985352864291,
    "colsample_bynode": 0.5220381476579743,
    # Regularisation
    "min_child_weight": 4,
    "gamma": 0.208214273451558,
    "reg_alpha": 2.6398439551213113,
    "reg_lambda": 4.145113021798959,
})


In [14]:
# LightGBM classifier with hard-coded hyperparameters.
# NOTE(review): max_depth=13 and colsample_bytree~0.587 lie outside the ranges
# searched by lgb_objective (4-12 and 0.6-1.0), so these values presumably come
# from a different search space -- confirm their provenance.
# NOTE(review): no subsample_freq is set; LightGBM only applies `subsample`
# when the bagging frequency is non-zero -- confirm whether bagging was intended.
lgb_tuned = lgb.LGBMClassifier(**{
    "objective": "binary",
    "verbose": -1,
    # Ensemble size, tree shape and step size
    "n_estimators": 431,
    "max_depth": 13,
    "num_leaves": 81,
    "learning_rate": 0.10150620748447788,
    # Row / column subsampling
    "subsample": 0.7515173871129093,
    "colsample_bytree": 0.5869950899621996,
    # Regularisation
    "min_child_samples": 64,
    "reg_alpha": 1.6572209442239088,
    "reg_lambda": 0.6510034115814697,
})


In [15]:
# Run the cross-validated blend on the engineered frames with the tuned models
# and write the averaged test predictions to submission.csv.
run_te_cvens_blending(
    train_fe,
    test_fe,
    "/kaggle/input/binary-classification-with-a-bank-dataset-clone/sample_submission.csv",
    "submission.csv",
    xgb_model=xgb_tuned,
    lgb_model=lgb_tuned,
)


   fold   roc_auc
0     1  0.969572
1     2  0.969739
2     3  0.970018
3     4  0.970592
4     5  0.969795
5     6  0.970208
6     7  0.969156
7     8  0.970076
8     9  0.969463
9    10  0.969430
Mean CV Score: 0.96980
