In [None]:
import pandas as pd
import numpy as np
import os 
import time 
import math
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import StratifiedKFold
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.subplots as sp
import optuna
import plotly.figure_factory as ff  
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import warnings

# Notebook-wide configuration: warning filters, seaborn theme, plotly renderer.
# A blanket "ignore" filter is installed first, followed by explicit filters
# for FutureWarning and DeprecationWarning.
warnings.filterwarnings('ignore')
for ignored_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Theme used by the seaborn/matplotlib plots below.
sns.set(style='darkgrid')

# Renderer used by every plotly `fig.show()` call below.
pio.renderers.default = 'iframe_connected'

# <p style="background-color:#0ea5e9; font-family:'Trebuchet MS',sans-serif; font-weight:bold; color:white; font-size:80%; text-align:center; border-radius:10px; padding:10px;">📂 Data Acquisition & Loading</p>

In [None]:
# Read the three competition CSVs: training data, test data and the sample
# submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e11/train.csv',
        '/kaggle/input/playground-series-s5e11/test.csv',
        '/kaggle/input/playground-series-s5e11/sample_submission.csv',
    )
)

# <p style="background-color:#0ea5e9; font-family:'Trebuchet MS',sans-serif; font-weight:bold; color:white; font-size:80%; text-align:center; border-radius:10px; padding:10px;">🧹 Data Cleaning & Preprocessing</p>


In [None]:
# First rows of the training frame (rendered as the cell output).
train.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# describe() summary of the training frame, rounded to two decimals.
train.describe().round(2)

In [None]:
# Report the duplicate-row count and the frame dimensions, with a dashed rule
# between consecutive entries.
n_rows, n_cols = train.shape
shape_report = [
    ("Duplicated Rows:", train.duplicated().sum()),
    ("Number of Rows:", n_rows),
    ("Number of Columns:", n_cols),
]
for position, (label, value) in enumerate(shape_report):
    if position > 0:
        print("-"*30)
    print(label, value)

In [None]:
train.isnull().sum()

In [None]:
print("Numeric Col Names",train.select_dtypes(include=['number']).columns)

In [None]:
print("Categorical Col Names",train.select_dtypes(include=['object']).columns)

In [None]:
# Numeric feature columns used in the EDA plots.
num_col = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

# Categorical feature columns used in the EDA plots.
cat_col = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]

# Name of the label column.
target_col = 'loan_paid_back'

# Hex colors cycled through by the histogram grid.
color_palette = [
    '#f2f0f7',
    '#dadaeb',
    '#bcbddc',
    '#9e9ac8',
    '#807dba',
    '#6a51a3',
    '#54278f',
    '#3f007d',
]

In [None]:
# List the distinct values of each categorical column, followed by a divider.
divider = "<--- --- --- --- --- --- --- --- --- --->\n"
for col in cat_col:
    levels = train[col].unique()
    print(f"Unique categories in '{col}' column: {levels}")
    print(divider)

# <p style="background-color:#0ea5e9; font-family:'Trebuchet MS',sans-serif; font-weight:bold; color:white; font-size:80%; text-align:center; border-radius:10px; padding:10px;">📊 Univariate Analysis — Understanding Individual Features</p>


In [None]:
gender_count = train['gender'].value_counts().reset_index()
gender_count.columns = ['gender', 'Count']

fig = px.bar(
    gender_count,
    x='gender',               
    y='Count',                
    color='gender',
    color_discrete_sequence=px.colors.sequential.Purp,
    title="Gender Distribution",
    text='Count'
)

fig.update_layout(width=600, height=400)
fig.show()

In [None]:
marital_count = train['marital_status'].value_counts().reset_index()
marital_count.columns = ['marital_status', 'Count']

fig = px.bar(
    marital_count,
    x='marital_status',
    y='Count',
    color='marital_status',
    color_discrete_sequence=px.colors.sequential.Purp,
    title="Marital Status Distribution",
    text='Count'  
)

fig.update_layout(width=600, height=400)
fig.show()

In [None]:
education_count = train['education_level'].value_counts().reset_index()
education_count.columns = ['education_level', 'Count']

fig = px.bar(
    education_count,
    x='education_level',
    y='Count',
    color='education_level',
    color_discrete_sequence=px.colors.sequential.Purp,
    title="Education Level Distribution",
    text='Count'
)

fig.update_layout(width=600, height=400)
fig.show()

In [None]:
employment_count = train['employment_status'].value_counts().reset_index()
employment_count.columns = ['employment_status', 'Count']

fig = px.bar(
    employment_count,
    x='employment_status',
    y='Count',
    color='employment_status',
    color_discrete_sequence=px.colors.sequential.Purp,
    title="Employment Status Distribution",
    text='Count'
)

fig.update_layout(width=600, height=400)
fig.show()

In [None]:
loan_purpose_count = train['loan_purpose'].value_counts().reset_index()
loan_purpose_count.columns = ['loan_purpose', 'Count']

fig = px.bar(
    loan_purpose_count,
    x='loan_purpose',
    y='Count',
    color='loan_purpose',
    color_discrete_sequence=px.colors.sequential.Purp,
    title="Loan Purpose Distribution",
    text='Count'
)

fig.update_layout(width=700, height=400)
fig.show()

In [None]:
# Ten most frequent grade_subgrade values; the count drives bar height, bar
# label and the continuous color scale.
top10 = train['grade_subgrade'].value_counts().head(10)
top10_counts = top10.values

fig = px.bar(
    top10,
    x=top10.index,
    y=top10_counts,
    text=top10_counts,
    color=top10_counts,
    color_continuous_scale=px.colors.sequential.Purp,
    title='Top 10 Frequent Grade_Subgrade Categories',
)

# Size first, then axis titles.
fig.update_layout(width=700, height=400)
fig.update_layout(xaxis_title='Grade_Subgrade', yaxis_title='Count')
fig.show()

In [None]:
# One stacked histogram (with KDE overlay) per numeric column.
n_panels = len(num_col)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(8, 18))

for i, (ax, col) in enumerate(zip(axes, num_col)):
    # Cycle through the palette if there are more panels than colors.
    bar_color = color_palette[i % len(color_palette)]
    sns.histplot(
        train[col],
        bins=50,
        kde=True,
        ax=ax,
        color=bar_color,
        edgecolor='black',
        linewidth=0.5,
    )
    ax.set_title(f'Distribution of {col}', fontsize=14)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Share of each target value, drawn as a pie chart.
loan_status_counts = train['loan_paid_back'].value_counts().reset_index()
loan_status_counts.columns = ['loan_paid_back', 'count']

fig = px.pie(
    data_frame=loan_status_counts,
    names='loan_paid_back',
    values='count',
    color='loan_paid_back',
    color_discrete_sequence=px.colors.sequential.Purp,
    title='Target Distribution',
)
fig.update_layout(width=500, height=400)
fig.show()

In [None]:
# Pairwise correlation of the numeric columns, annotated to two decimals.
corr_matrix = train[num_col].corr()

heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    cbar=True,
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(7, 6))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Debt-to-income ratio split by the target value.
plt.figure(figsize=(7, 5))
box_ax = sns.boxplot(
    data=train,
    x='loan_paid_back',
    y='debt_to_income_ratio',
    palette="YlGnBu",
)
box_ax.set_title("Debt-to-Income Ratio vs Loan Paid Back")
box_ax.set_xlabel("Loan Paid Back")
box_ax.set_ylabel("Debt-to-Income Ratio")
plt.show()

In [None]:
# Cast every object-dtype column of `train` to pandas 'category' in both the
# train and the test frame.
cat_cols = train.select_dtypes(include='object').columns.tolist()

category_dtypes = {name: 'category' for name in cat_cols}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

In [None]:
train['income_to_loan_ratio'] = train['annual_income'] / (train['loan_amount'] + 1e-5)
train['income_to_interest_ratio'] = train['annual_income'] / (train['interest_rate'] + 1e-5)
train['loan_burden_index'] = train['loan_amount'] / (train['annual_income'] + 1e-5)
train['risk_score'] = (train['debt_to_income_ratio'] * train['interest_rate']) / (train['credit_score'] + 1e-5)
train['credit_utilization_flag'] = (train['debt_to_income_ratio'] > 0.3).astype(int)
train['is_high_credit'] = (train['credit_score'] > 700).astype(int)

In [None]:
test['income_to_loan_ratio'] = test['annual_income'] / (test['loan_amount'] + 1e-5)
test['income_to_interest_ratio'] = test['annual_income'] / (test['interest_rate'] + 1e-5)
test['loan_burden_index'] = test['loan_amount'] / (test['annual_income'] + 1e-5)
test['risk_score'] = (test['debt_to_income_ratio'] * test['interest_rate']) / (test['credit_score'] + 1e-5)
test['credit_utilization_flag'] = (test['debt_to_income_ratio'] > 0.3).astype(int)
test['is_high_credit'] = (test['credit_score'] > 700).astype(int)

In [None]:
# First rows of the training frame, now including the engineered columns.
train.head()

In [None]:
# Separate the label from the features and drop the row identifier from both
# the train and the test frame.
target_col = 'loan_paid_back'
y = train[target_col]
X = train.drop(columns=['id', target_col])
X_test = test.drop(columns='id')

In [None]:
# Names of the categorical features handed to LightGBM by the Optuna objective.
# The object columns were already cast to 'category' in an earlier cell, so
# selecting only 'object' here yields an empty list. LightGBM would then treat
# the category codes as plain numbers during tuning. Select both dtypes.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Make sure every selected column is 'category' in both frames. This is a
# no-op for columns that are already categorical.
for col in cat_cols:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# 90/10 hold-out split used for hyperparameter tuning.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
def objective(trial):
    """Optuna objective: hold-out ROC AUC of a LightGBM classifier.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMClassifier`` on
    ``X_train``/``y_train`` and scores it on ``X_valid``/``y_valid``.

    Training stops early once the validation AUC has not improved for 100
    rounds, so the sampled ``n_estimators`` acts as an upper bound rather than
    a fixed cost. The stopping iteration is stored on the trial as the
    ``best_iteration`` user attribute.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC on the validation split (the study maximises this value).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 512),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 50.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 50.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        # NOTE(review): requires a GPU-enabled LightGBM build; the final
        # `lgb_params` used for CV does not set a device.
        'device': 'gpu',
        'random_state': 42,
        'n_jobs': -1
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        categorical_feature=cat_cols,
        # Previously an empty list: the eval set was scored every round but
        # never used to stop training.
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    trial.set_user_attr('best_iteration', model.best_iteration_)

    # Positive-class probabilities on the hold-out split.
    preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, preds)
    return auc

In [None]:
# Seed the sampler so the sequence of sampled hyperparameters is the same on
# every Restart & Run All. The unseeded default differs between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10, show_progress_bar=True)

In [None]:
# Hyperparameters of the best trial found by the study.
best_params = study.best_params
print(best_params)

In [None]:
# Fixed LightGBM hyperparameters used for the cross-validated models below.
# These are hard-coded values, not read from `study.best_params`.
lgb_params = dict(
    n_estimators=2160,
    learning_rate=0.07433691415546852,
    max_depth=8,
    num_leaves=33,
    min_child_samples=176,
    subsample=0.9837752916144209,
    colsample_bytree=0.9563917438369198,
    reg_alpha=0.000625767578627813,
    reg_lambda=1.7645286382037818e-08,
    min_split_gain=0.41106435655713425,
    bagging_freq=4,
    objective='binary',
    metric='auc',
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Stratified K-fold cross-validation of the final LightGBM configuration.
# These two definitions were commented out, so the loop below raised a
# NameError on a fresh kernel.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))        # out-of-fold predictions for every train row
test_preds = np.zeros(len(X_test))  # fold-averaged predictions for the test set
fold_scores = []
models = []

# --- Training loop ---
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"\n========== FOLD {fold + 1} / {N_SPLITS} ==========")

    # Fold-local names, so the hold-out split (X_train / y_train) used by the
    # Optuna objective is not silently overwritten.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        eval_metric='auc'
    )

    models.append(model)

    val_preds = model.predict_proba(X_fold_val)[:, 1]
    oof_preds[val_index] = val_preds

    # Accumulate the fold-averaged test prediction. Previously `test_preds`
    # was never filled, so the submission contained only zeros.
    test_preds += model.predict_proba(X_test)[:, 1] / N_SPLITS

    fold_auc = roc_auc_score(y_fold_val, val_preds)
    fold_scores.append(fold_auc)
    print(f"Fold {fold + 1} ROC AUC: {fold_auc:.5f}")

In [None]:
# Mean and standard deviation of the per-fold ROC AUC scores.
mean_auc, std_auc = np.mean(fold_scores), np.std(fold_scores)

print("\n========== CV RESULTS ==========")
print(f"Mean ROC AUC: {mean_auc:.5f}")
print(f"STD ROC AUC:  {std_auc:.5f}")

In [None]:
submission = pd.DataFrame({'id': submission.id, 'target': test_preds})

In [None]:
submission.to_csv('submission.csv', index=False)
submission.head()
print("\nFile Saved")