## Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
#import pytabkit
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import *

from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier, early_stopping, log_evaluation,early_stopping
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from lightgbm import early_stopping, log_evaluation
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Silence every warning category for the rest of the session, then confirm the
# setup cell ran to completion.
warnings.filterwarnings(action='ignore')
print('Done')

## Read Data Files

In [None]:
# Name of the label column in train.csv.
target = 'loan_paid_back'

# Load the competition train and test CSVs.
df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv")

# Quick look at the training frame: dimensions, column names, first rows.
print(df.shape)
print(list(df.columns))

df.head()

In [None]:
# train_pipeline.py
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna

#   FEATURE ENGINEERING
def feature_engineer(df):
    """Return a copy of ``df`` extended with engineered loan features.

    The input frame is not modified. Columns are appended in this order:

    * ``grade`` / ``subgrade`` - first character and numeric remainder of the
      whitespace-stripped ``grade_subgrade`` string (NaN when empty/missing).
    * ratio, log and product interactions of the numeric loan columns.
    * ``credit_score_bucket`` - labelled bins of ``credit_score``.
    * ``<col>_freq`` - relative frequency of each categorical value, computed
      on the frame passed in (missing values counted as ``'NA'``).
    * ``missing_count`` - NaNs per row over all columns present at that point.
    * ``grade_*`` - one-hot columns for ``grade`` including a NaN indicator.
    """
    out = df.copy()

    # --- grade & subgrade (vectorized: "" yields NaN for both parts)
    code = out['grade_subgrade'].fillna("").astype(str).str.strip()
    out['grade'] = code.str[0]
    out['subgrade'] = pd.to_numeric(code.str[1:], errors='coerce')

    # --- numeric: force the raw numeric columns to numbers, bad values -> NaN
    numeric_names = (
        'annual_income',
        'loan_amount',
        'debt_to_income_ratio',
        'interest_rate',
        'credit_score',
    )
    for name in numeric_names:
        if name not in out.columns:
            continue
        out[name] = pd.to_numeric(out[name], errors='coerce')

    # --- interactions
    income = out['annual_income']
    loan = out['loan_amount']
    rate = out['interest_rate']

    # A zero denominator becomes NaN so the ratio is NaN instead of +/-inf.
    out['loan_to_income'] = loan / income.replace(0, np.nan)
    out['income_to_loan'] = income / loan.replace(0, np.nan)
    # Negative amounts are clipped to 0 before log1p.
    out['log_annual_income'] = np.log1p(income.clip(lower=0))
    out['log_loan_amount'] = np.log1p(loan.clip(lower=0))
    out['interest_x_loan'] = rate * loan
    out['interest_x_credit'] = rate * out['credit_score']
    out['dti_x_interest'] = out['debt_to_income_ratio'] * rate

    # --- credit bucket (right-closed bins; scores outside (0, 900] -> NaN)
    if 'credit_score' in out.columns:
        edges = [0, 580, 670, 740, 800, 900]
        names = ['poor', 'fair', 'good', 'very_good', 'excellent']
        buckets = pd.cut(out['credit_score'], bins=edges, labels=names)
        out['credit_score_bucket'] = buckets.astype(object)

    # --- freq encoding: share of rows carrying each category value
    categorical_names = (
        'loan_purpose',
        'employment_status',
        'education_level',
        'marital_status',
        'gender',
    )
    for name in categorical_names:
        if name in out.columns:
            filled = out[name].fillna('NA')
            shares = filled.value_counts(normalize=True)
            out[f'{name}_freq'] = filled.map(shares).astype(float)

    # Counted before the one-hot columns are attached.
    out['missing_count'] = out.isna().sum(axis=1)

    # --- grade one-hot (the original `grade` column is kept as well)
    out['grade'] = out['grade'].astype(object)
    one_hot = pd.get_dummies(out['grade'], prefix='grade', dummy_na=True)
    return pd.concat([out, one_hot], axis=1)

#   LOAD DATA
train_path = "/kaggle/input/playground-series-s5e11/train.csv"
test_path  = "/kaggle/input/playground-series-s5e11/test.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

target = 'loan_paid_back'

train_fe = feature_engineer(train)
test_fe  = feature_engineer(test)

# Model inputs: every engineered column except the row id, the label and the
# raw grade string (already split into grade / subgrade).
FEATURES = [c for c in train_fe.columns if c not in ['id', target, 'grade_subgrade']]

# feature_engineer one-hot encodes `grade` separately for each frame, so a
# grade that never occurs in test leaves test_fe without that dummy column and
# test_fe[FEATURES] would raise KeyError. Add such columns as all-zero
# indicators with the same dtype as in train.
for c in FEATURES:
    if c.startswith('grade_') and c not in test_fe.columns:
        test_fe[c] = np.zeros(len(test_fe), dtype=train_fe[c].dtype)

train_X = train_fe[FEATURES].copy()
train_y = train_fe[target].copy()
test_X  = test_fe[FEATURES].copy()

#   Basic Fill
# Statistics come from train only and are reused for test.
for c in train_X.columns:
    if train_X[c].dtype.kind in 'biufc': # numeric
        med = train_X[c].median()
        train_X[c] = train_X[c].fillna(med)
        test_X[c]  = test_X[c].fillna(med)
    else:
        train_X[c] = train_X[c].fillna('NA')
        test_X[c]  = test_X[c].fillna('NA')

# Convert object to category code
# The label -> code mapping is learned on train and reused for test. Encoding
# each frame independently would assign different integers to the same label
# whenever the two frames do not contain exactly the same set of values.
# Labels that appear only in test are encoded as -1.
for c in train_X.columns:
    if train_X[c].dtype == 'object':
        train_cat = train_X[c].astype('category')
        train_X[c] = train_cat.cat.codes
        test_X[c]  = pd.Categorical(
            test_X[c], categories=train_cat.cat.categories
        ).codes

#   OPTUNA OBJECTIVE
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV AUC of a LightGBM model.

    Samples one hyperparameter set from ``trial``, trains a binary LightGBM
    booster with early stopping on each fold of the module-level ``train_X`` /
    ``train_y`` and scores it on that fold's validation part.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The mean validation ROC AUC over the 5 folds.
    """
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'seed': 42,
        'n_jobs': -1,
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; log=True keeps the log-scaled sampling.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 30, 300),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 200),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only applies `subsample` (bagging_fraction) when the bagging
        # frequency is non-zero. Sampling it through the trial also puts it in
        # study.best_params, so any model built from those params bags the
        # same way as during tuning.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    }

    # Fixed seed: every trial is scored on the same fold assignment.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for tr_idx, val_idx in folds.split(train_X, train_y):
        X_tr, X_val = train_X.iloc[tr_idx], train_X.iloc[val_idx]
        y_tr, y_val = train_y.iloc[tr_idx], train_y.iloc[val_idx]

        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dval   = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        bst = lgb.train(
            param,
            dtrain,
            valid_sets=[dval],
            num_boost_round=5000,
            callbacks=[
                # Stop after 100 rounds without validation-AUC improvement.
                lgb.early_stopping(100),
                # period=0 turns off the per-iteration metric log.
                lgb.log_evaluation(period=0)
            ]
        )

        # Score with the best round found by early stopping.
        pred = bst.predict(X_val, num_iteration=bst.best_iteration)
        aucs.append(roc_auc_score(y_val, pred))

    return np.mean(aucs)


#   RUN OPTUNA
# Maximize the mean cross-validated AUC returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

#   TRAIN FINAL MODEL
# Tuned values first, then the fixed settings that were not part of the search.
best_params = {
    **study.best_params,
    'objective':'binary',
    'metric':'auc',
    'boosting_type':'gbdt',
    'verbosity':-1,
    'seed':42,
    'n_jobs':-1
}

# Early stopping must be evaluated on rows the booster is not fitted on.
# Monitoring the training set itself does not measure generalization, so it
# cannot pick a sensible number of rounds. Step 1: find the round count on a
# stratified hold-out.
X_fit, X_hold, y_fit, y_hold = train_test_split(
    train_X, train_y, test_size=0.2, stratify=train_y, random_state=42
)
dfit  = lgb.Dataset(X_fit, label=y_fit)
dhold = lgb.Dataset(X_hold, label=y_hold, reference=dfit)

probe_model = lgb.train(
    best_params,
    dfit,
    num_boost_round=5000,
    valid_sets=[dhold],
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(period=0)
    ]
)
best_rounds = max(1, probe_model.best_iteration)

# Step 2: refit on every training row for exactly that many rounds.
dtrain = lgb.Dataset(train_X, label=train_y)

final_model = lgb.train(
    best_params,
    dtrain,
    num_boost_round=best_rounds
)

print("✅ Done Training")


## Submission