In [None]:
# ============================================
# 0. load library
# ============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ============================================
# 1. load data
# ============================================

# Read the labelled training set and the unlabelled test set
df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv")

# The label is taken to be the last column of the training file;
# every remaining column except "id" is treated as a feature
target = df.columns[-1]
cols = df.columns.drop([target, "id"]).tolist()

# Partition the features by dtype:
#   object / category           -> categorical
#   anything else except bool   -> numerical
cat = [name for name in cols if df[name].dtype in ("object", "category")]
num = [name for name in cols if not (df[name].dtype in ("object", "category", "bool"))]


In [None]:
# Preview the first rows of the training frame (rendered as the cell output)
df.head()

In [None]:
# Preview the first rows of the test frame (rendered as the cell output)
test.head()

In [None]:
# ============================================
# 1. EDA
# ============================================

# Build a ydata-profiling report over the whole training frame and embed it
# in the notebook output as an iframe
from ydata_profiling import ProfileReport
report = ProfileReport(df,title='LoanPayback')
report.to_notebook_iframe()

In [None]:
# employment_status vs. loan_paid_back:
# grouped count plot, then the raw and the row-normalised contingency tables
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.countplot(data=df, x='employment_status', hue='loan_paid_back')
# NOTE(review): the label order assumes the hue levels are drawn as
# not-paid first, paid second - confirm against the target encoding
ax.legend(['defaulted', 'paidback'])
plt.show()

status, paid = df['employment_status'], df['loan_paid_back']
for norm in (False, 'index'):
    display(pd.crosstab(status, paid, normalize=norm))

In [None]:
# grade_subgrade vs. loan_paid_back:
# grouped count plot, then the raw and the row-normalised contingency tables
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.countplot(data=df, x='grade_subgrade', hue='loan_paid_back')
# NOTE(review): the label order assumes the hue levels are drawn as
# not-paid first, paid second - confirm against the target encoding
ax.legend(['defaulted', 'paidback'])
plt.show()

subgrade_code, paid = df['grade_subgrade'], df['loan_paid_back']
for norm in (False, 'index'):
    display(pd.crosstab(subgrade_code, paid, normalize=norm))

In [None]:
# ============================================
# 2. Outlier Treatment & feature engineering
# ============================================

In [None]:
def create_frequency_features(df, df_test, feature_cols=None, numeric_cols=None):
    """
    Add frequency-encoded and quantile-bin features to the train and test frames.

    - For every column in ``feature_cols`` (categorical or numeric) a
      ``<col>_freq`` column is added holding how many times each value occurs
      in the training frame. Test values never seen in training are filled
      with the mean of the training counts.
    - For every column that is also listed in ``numeric_cols``, the columns
      ``<col>_bin5``, ``<col>_bin10`` and ``<col>_bin15`` are added. Bin edges
      are the training quantiles (duplicate edges dropped) and are reused for
      the test frame, so test values outside the training range come out of
      ``pd.cut`` as NaN.
    - If binning a column fails, a warning is emitted and that bin feature is
      set to the constant 0 in both frames.

    Parameters:
    - df: training DataFrame (left unmodified)
    - df_test: test DataFrame (left unmodified)
    - feature_cols: columns to frequency-encode; defaults to the notebook-level ``cols``
    - numeric_cols: columns to quantile-bin; defaults to the notebook-level ``num``

    Returns:
    - (train, test): new DataFrames made of the inputs plus the frequency
      columns followed by the bin columns
    """
    import warnings

    # Fall back to the notebook-level column lists when none are supplied
    if feature_cols is None:
        feature_cols = cols
    if numeric_cols is None:
        numeric_cols = num

    # New features are collected in side frames and attached once at the end,
    # so the caller's frames are never written to
    freq_features_train = pd.DataFrame(index=df.index)
    freq_features_test = pd.DataFrame(index=df_test.index)
    bin_features_train = pd.DataFrame(index=df.index)
    bin_features_test = pd.DataFrame(index=df_test.index)

    for col in feature_cols:
        # --- Frequency encoding (counts come from the training frame only) ---
        freq = df[col].value_counts()
        freq_features_train[f"{col}_freq"] = df[col].map(freq)
        freq_features_test[f"{col}_freq"] = df_test[col].map(freq).fillna(freq.mean())

        # --- Quantile binning for numeric columns ---
        if col in numeric_cols:
            for q in [5, 10, 15]:
                bin_name = f"{col}_bin{q}"
                try:
                    train_bins, bins = pd.qcut(df[col], q=q, labels=False, retbins=True, duplicates="drop")
                    test_bins = pd.cut(df_test[col], bins=bins, labels=False, include_lowest=True)
                except Exception as exc:
                    # Best-effort: keep the column (as a constant) so train and
                    # test keep the same layout, but make the failure visible
                    warnings.warn(
                        f"Quantile binning failed for column {col!r} with q={q} ({exc}); using constant 0"
                    )
                    train_bins, test_bins = 0, 0
                bin_features_train[bin_name] = train_bins
                bin_features_test[bin_name] = test_bins

    # Concatenate all new features at once
    df = pd.concat([df, freq_features_train, bin_features_train], axis=1)
    df_test = pd.concat([df_test, freq_features_test, bin_features_test], axis=1)

    return df, df_test

In [None]:
from sklearn.model_selection import KFold
def target_encoding(train, predict, n_splits=5, target_col=None, feature_cols=None):
    """
    Add K-Fold target mean encoded features to train and predict datasets.

    For every feature column a ``mean_<col>`` column is added:
    - in ``train`` it is out-of-fold: each row gets the target mean of its
      value computed on the other folds only;
    - in ``predict`` it is the target mean of its value over all of ``train``.
    Values with no match in the fitting rows are left as NaN.

    Parameters:
    - train: training DataFrame (must contain the target column)
    - predict: prediction/test DataFrame
    - n_splits: number of folds for K-Fold encoding
    - target_col: name of the target column; defaults to the notebook-level ``target``
    - feature_cols: columns to encode; defaults to the notebook-level ``cols``

    Returns:
    - train and predict DataFrames with new mean encoded features
    """
    # Fall back to the notebook-level names when none are supplied
    if target_col is None:
        target_col = target
    if feature_cols is None:
        feature_cols = cols

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One out-of-fold buffer per feature, filled fold by fold
    oof = {col: np.zeros(len(train)) for col in feature_cols}

    # Folds are the outer loop so each fold is sliced once and shared by all
    # columns; with a fixed integer random_state the splits are the same as
    # re-splitting per column
    for tr_idx, val_idx in kf.split(train):
        tr_fold = train.iloc[tr_idx]
        val_fold = train.iloc[val_idx]
        for col in feature_cols:
            # --- K-Fold Target Mean Encoding ---
            mean_map = tr_fold.groupby(col)[target_col].mean()
            oof[col][val_idx] = val_fold[col].map(mean_map).to_numpy()

    mean_features_train = pd.DataFrame(
        {f'mean_{col}': values for col, values in oof.items()},
        index=train.index,
    )

    # --- Apply global mean mapping to prediction/test data ---
    mean_features_test = pd.DataFrame(index=predict.index)
    for col in feature_cols:
        global_mean = train.groupby(col)[target_col].mean()
        mean_features_test[f'mean_{col}'] = predict[col].map(global_mean)

    # --- Concatenate new features at once to avoid fragmentation ---
    train = pd.concat([train, mean_features_train], axis=1)
    predict = pd.concat([predict, mean_features_test], axis=1)

    # Defragment
    train = train.copy()
    predict = predict.copy()
    return train, predict

In [None]:
# Split the combined grade_subgrade code into its two parts, for both frames:
# everything after the first character parsed as int -> 'subgrade',
# the first character itself -> 'grade'
for frame in (df, test):
    code = frame['grade_subgrade']
    frame['subgrade'] = code.str[1:].astype(int)
    frame['grade'] = code.str[0]

In [None]:
# Add the 10-fold target-mean features first, then the frequency and
# quantile-bin features on top of that result
df2, test2 = create_frequency_features(*target_encoding(df, test, 10))

# Cast the categorical columns (taken from the source frames) to 'category' dtype
df2[cat] = df[cat].astype("category")
test2[cat] = test[cat].astype("category")

print(df2.columns.tolist())

In [None]:
# Columns kept out of the model: interest_rate, the raw categorical columns,
# and a handful of the engineered bin / frequency features
remove = [
    "interest_rate",
    "education_level", "loan_purpose", "grade_subgrade", "marital_status",
    "gender", "employment_status", "grade",
    "debt_to_income_ratio_bin5", "credit_score_bin5", "loan_amount_bin5",
    "credit_score_freq", "employment_status_freq",
]

# The training frame additionally loses its id; the test frame keeps it
df2, test2 = df2.drop(columns=remove + ["id"]), test2.drop(columns=remove)

# With the id gone, fully identical training rows are collapsed to one
df2 = df2.drop_duplicates()

In [None]:
## LightGBM
import lightgbm as lgb

#Cross Verification(K-fold)
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Label vector and feature matrix, wrapped in a LightGBM Dataset
y = df2[target]
X = df2.drop(columns=[target])
lgb_train = lgb.Dataset(X, label=y)

lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'max_depth': 6,
    'num_leaves': 50,
    'learning_rate': 0.03,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 1,
    'min_child_samples': 20,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}

# 7-fold stratified CV; stops once the metric has not improved for 50 rounds
# and logs every 100 rounds
cv_callbacks = [early_stopping(stopping_rounds=50), log_evaluation(period = 100)]
cv_results = lgb.cv(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=20000,
    nfold=7,
    stratified=True,
    seed=42,
    callbacks=cv_callbacks,
)

cv_df = pd.DataFrame(cv_results)
print(cv_df.tail())

# NOTE(review): presumably the history is cut at the best iteration by early
# stopping, so its length is the best round - verify for the LightGBM version in use
auc_history = cv_results['valid auc-mean']
best_round = len(auc_history)
best_auc = auc_history[-1]
print(f"Best round: {best_round}, Best CV AUC: {best_auc:.7f}")

#1feature Accuracy: 0.8044
#2feature Accuracy: 0.8119
#3feature Accuracy: 0.8120
#LGBM with 5feature: train score : 0.9055616349402951 test score : 0.9046122593280546
#Cross Verification: Overall CV AUC: 0.919931
#outliertreatment+annual_income -> Overall   CV AUC: 0.920096   pub=0.92047
#LGBM parameter tune: Best round: 1472, Best CV AUC: 0.9224443  pub=0.92274
#feature engineering(from simple-lightgbm-only-competition-data-s5e11): Best round: 697, Best CV AUC: 0.9259890  pub=0.92600
#delete Outlier Treatment from above                                  : Best round: 704, Best CV AUC: 0.9266021  pub=0.xxxxx

In [None]:
# Size of the final model: the round count found by CV plus a margin of 100 extra trees
lgb_params.update(n_estimators=best_round + 100)
print(best_round)

In [None]:
# Train LGBM model
# A single LGBMClassifier is built from lgb_params and fitted on all of X, y
# (no hold-out here; validation was done in the CV cell)
model = LGBMClassifier(
    **lgb_params,
)
model.fit(X, y)

In [None]:
# Score the test set. Features are selected by name in the training column
# order (X.columns) instead of relying on test2 happening to share that
# layout; this also leaves "id" out and fails loudly if a feature is missing.
y_sub_proba = model.predict_proba(test2[X.columns])[:, 1]

# make submission.csv
submission = pd.DataFrame({
    "id": test["id"],
    "loan_paid_back": y_sub_proba
})

submission.to_csv("submission.csv", index=False)
submission.head()
