# Simple Solution! | Predicting Loan Payback 
## Quick EDA NO Explanations!
### Single Model
#### Submission
# Score: Divided by Zero

In [None]:
import logging
import math
import os
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from optuna.samplers import TPESampler
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold,  train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Silence every warning category for the whole session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Seed reused by the sampling, train/validation split and model code in this file.
rand_seed = 111

In [None]:
# Load the train and test tables; the "id" column is dropped right after reading.
train_df = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv").drop(
    columns="id"
)
test_df = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv").drop(
    columns="id"
)

# Sample submission frame; it is filled with predictions at the end of the file.
submission = pd.read_csv("/kaggle/input/playground-series-s5e11/sample_submission.csv")

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

## EDA

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each report.
train_df.info()
print("i" * 100)
test_df.info()
print("i" * 100)

In [None]:
# describe() summary of the training table, shaded with a green gradient.
train_df.describe().style.background_gradient(cmap='Greens')

In [None]:
# describe() summary of the test table, shaded with a blue gradient.
test_df.describe().style.background_gradient(cmap='Blues')

### Null Values

In [None]:
# Per-column missing-value counts, printed for each table under its own
# heading with single-space spacer lines around it.
for heading, frame in ((" Train Data", train_df), (" Test Data", test_df)):
    print(" ")
    print(heading)
    print(" ")
    print(frame.isnull().sum())
print(" ")

### Categorical Data Type!

In [None]:
# Column names of the training table split by dtype; both indexes are reused
# by later cells (correlation heatmap and categorical encoding).
cat_cols = train_df.select_dtypes(include=["object"]).columns
num_cols = train_df.select_dtypes(include=["number"]).columns

# Print the category frequencies of train and test one after the other so the
# two distributions can be compared column by column.
for col_name in cat_cols:
    report = f"{col_name} \n ***Train_df***-> {train_df[col_name].value_counts()} \n\n ***Test_df*** -> {test_df[col_name].value_counts()} \n\n"
    print(report)

In [None]:
# Draw a 15% sample from each target class so the plots below keep the class
# proportions of the full training table.
#
# GroupBy.sample takes the seed directly. The previous per-group
# DataFrame.sample call was unseeded, so every run plotted different rows;
# only the final shuffle was seeded. GroupBy.sample also returns the rows with
# their original index and all columns, so no droplevel() is needed and the
# grouping column is not passed through groupby.apply.
strata_sample = (
    train_df.groupby("loan_paid_back", sort=False)
    .sample(frac=0.15, random_state=rand_seed)
    .sample(frac=1, random_state=rand_seed)  # shuffle the stacked groups
)

strata_sample

In [None]:
# Horizontal bar chart of the row count per target value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=train_df,
    y="loan_paid_back",
    hue="loan_paid_back",
    palette="Greens",
    ax=ax,
)
ax.set_title("Distribution of Target Variable")
ax.set_ylabel("Loan Paid Back")
ax.set_xlabel("Count")
# hue is set only so the palette is applied per bar; the legend it adds is dropped.
ax.legend().remove()
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations of the numeric columns, computed on the sampled rows.
corr_matrix = strata_sample[num_cols].corr()

plt.figure(figsize=(12, 6))
sns.heatmap(data=corr_matrix, annot=True, fmt=".3g", cmap="Greens_r")
plt.show()

In [None]:
def plotlm(x_var, y_var, row_var, scatt_col):
    """Draw a LOWESS-smoothed scatter grid from the module-level ``strata_sample``.

    The grid has one row per value of ``row_var`` and one column per value of
    ``loan_paid_back``. The figure is shown immediately.

    Args:
        x_var: Name of the column plotted on the x axis.
        y_var: Name of the column plotted on the y axis.
        row_var: Name of the column whose values define the grid rows.
        scatt_col: Colour used for the scatter points.
    """

    def _pretty(column):
        # "annual_income" -> "Annual Income"
        return column.replace("_", " ").title()

    point_style = {"color": scatt_col, "alpha": 0.6}
    line_style = {"color": "#23272a"}

    sns.lmplot(
        data=strata_sample,
        x=x_var,
        y=y_var,
        row=row_var,
        col="loan_paid_back",
        lowess=True,
        height=2,
        aspect=3,
        line_kws=line_style,
        scatter_kws=point_style,
    )

    # The title is placed slightly above the grid (y > 1).
    heading = (
        f"{_pretty(x_var)} VS {_pretty(y_var)} And {_pretty(row_var)} by Target Variable"
    )
    plt.suptitle(heading, y=1.02, fontsize=16)
    plt.show()

# Marital Status

In [None]:
# Loan amount against annual income, one grid row per marital status.
plotlm(
    x_var="loan_amount",
    y_var="annual_income",
    row_var="marital_status",
    scatt_col="#23b8a7",
)

In [None]:
# Interest rate against credit score, one grid row per marital status.
plotlm(
    x_var="interest_rate",
    y_var="credit_score",
    row_var="marital_status",
    scatt_col="#23b8a7",
)

In [None]:
# Interest rate against loan amount, one grid row per marital status.
plotlm(
    x_var="interest_rate",
    y_var="loan_amount",
    row_var="marital_status",
    scatt_col="#23b8a7",
)

In [None]:
# Interest rate against debt-to-income ratio, one grid row per marital status.
plotlm(
    x_var="interest_rate",
    y_var="debt_to_income_ratio",
    row_var="marital_status",
    scatt_col="#23b8a7",
)

# Gender

In [None]:
# Interest rate against debt-to-income ratio, one grid row per gender.
plotlm(
    x_var="interest_rate",
    y_var="debt_to_income_ratio",
    row_var="gender",
    scatt_col="#7e5a9b",
)

In [None]:
# Interest rate against loan amount, one grid row per gender.
plotlm(
    x_var="interest_rate",
    y_var="loan_amount",
    row_var="gender",
    scatt_col="#7e5a9b",
)

In [None]:
# Interest rate against credit score, one grid row per gender.
plotlm(
    x_var="interest_rate",
    y_var="credit_score",
    row_var="gender",
    scatt_col="#7e5a9b",
)

In [None]:
# Loan amount against annual income, one grid row per gender.
plotlm(
    x_var="loan_amount",
    y_var="annual_income",
    row_var="gender",
    scatt_col="#7e5a9b",
)

# Model

## Preprocessing

In [None]:
def convert_dtype(df):
    """Return a copy of ``df`` with every object-dtype column cast to ``category``.

    Columns of any other dtype are carried over unchanged, and the input frame
    itself is not modified.

    Args:
        df: Frame whose object columns should be converted.

    Returns:
        A new DataFrame with the same columns and the converted dtypes.
    """
    text_columns = df.select_dtypes("object").columns
    category_map = dict.fromkeys(text_columns, "category")
    return df.copy().astype(category_map)


# Category-typed copies of both tables; convert_dtype works on a copy, so the
# raw frames stay untouched.
train_df_preproc, test_df_preproc = map(convert_dtype, (train_df, test_df))

# Numeric column names are taken from the test table
# (presumably so the target column is left out — TODO confirm).
num_cols_preproc = test_df_preproc.select_dtypes(include=["number"]).columns

In [None]:
def encode_categorical(tr_df, ts_df, cols):
    """Label-encode ``cols`` in both frames with encoders fitted on train only.

    Each column is cast to ``str`` and encoded with its own ``LabelEncoder``
    fitted on the training values. Test values that never occur in training
    are replaced by the encoder's first class before being transformed, so
    ``transform`` never sees an unknown label.

    The function works on copies: the frames passed in are left unchanged.
    Encoding in place made the call non-repeatable, because a second run on
    the same frames would re-encode the already-encoded integers as strings
    ("10" sorts before "2") and silently change the codes.

    Args:
        tr_df: Training frame; the encoders are fitted on its columns.
        ts_df: Test frame, encoded with the training encoders.
        cols: Iterable of column names to encode.

    Returns:
        Tuple ``(encoded_train, encoded_test)`` of new DataFrames.
    """
    tr_df = tr_df.copy()
    ts_df = ts_df.copy()

    for col in cols:
        le = LabelEncoder()
        tr_df[col] = le.fit_transform(tr_df[col].astype(str))

        # Set membership is a hash lookup; ``x in le.classes_`` scans the
        # whole class array for every single test value.
        known = set(le.classes_)
        ts_df[col] = le.transform(
            ts_df[col]
            .astype(str)
            .map(lambda value: value if value in known else le.classes_[0])
        )

    return tr_df, ts_df


# Integer-encode the categorical columns of both tables.
train, test = encode_categorical(train_df_preproc, test_df_preproc, cat_cols)

# Standardise the numeric columns: the scaler is fitted on the training table
# only and the same fitted scaler is then applied to both tables.
scaler = StandardScaler()
scaler.fit(train[num_cols_preproc])
for frame in (train, test):
    frame[num_cols_preproc] = scaler.transform(frame[num_cols_preproc])

In [None]:
# Split the processed training table into the target vector and the features.
target_col = "loan_paid_back"
y = train[target_col]
X = train.drop(columns=[target_col])

# The processed test table is used as-is as the prediction input.
X_test = test

### LGBMClassifier Model

In [None]:
### Hyperparameter Tuning (Optuna)

def objective_lgbm(trial):
    """Optuna objective: fit one LightGBM candidate and return its hold-out ROC AUC.

    The module-level ``X`` / ``y`` are split 75/25 with a fixed seed, so every
    trial is scored on the same validation rows. The score is the ROC AUC of
    the predicted probabilities. This matches the ``"auc"`` metric configured
    for the model and the probabilities written to the submission; accuracy on
    hard 0/1 labels ignores how well the probabilities rank the rows.

    Args:
        trial: ``optuna.trial.Trial`` used to draw the hyperparameters.

    Returns:
        ROC AUC on the validation split (higher is better).
    """
    # Stratify so both parts keep the class proportions of the full target.
    # The validation names are chosen so they do not shadow the module-level
    # X_test that holds the real test features.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.25, random_state=rand_seed, stratify=y
    )

    lgbm_params = {
        # The earlier lookup on a default-constructed LGBMClassifier never
        # found a "device" entry, so it always resolved to "cpu"; state it
        # directly instead of building a throwaway model on every trial.
        "device": "cpu",
        "metric": "auc",
        # Fixed rather than tuned; the final fit passes the same value.
        "learning_rate": 0.08118977030267217,
        "n_estimators": trial.suggest_int("n_estimators", 750, 1500),
        "num_leaves": trial.suggest_int("num_leaves", 90, 200),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        # suggest_float replaces the deprecated suggest_uniform (same range).
        # NOTE(review): LightGBM only applies row subsampling when
        # subsample_freq > 0, so this value may have no effect — confirm.
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 3.1, 4.2),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 0.04),
    }

    model_lgbm = LGBMClassifier(**lgbm_params, random_state=rand_seed, verbose=-1)
    model_lgbm.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in
    # model_lgbm.classes_, the same column used for the submission.
    valid_scores = model_lgbm.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, valid_scores)

### Create a study object | optimization

In [None]:
# Lower Optuna's log level first so that create_study() itself is quiet too.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# A seeded TPE sampler makes the sequence of suggested hyperparameters
# repeatable; with the default unseeded sampler every run explores different
# trials and can end on different "best" parameters.
study_lgbm = optuna.create_study(
    study_name="LGBM_Loan",
    direction="maximize",
    sampler=TPESampler(seed=rand_seed),
)
study_lgbm.optimize(objective_lgbm, n_trials=200, show_progress_bar=True)

## Best trial

In [None]:
# Show the full record of the best (highest-scoring) trial of the study.
print("Best trial:", study_lgbm.best_trial)

## Best parameters

In [None]:
# Show only the hyperparameter values suggested in the best trial.
print("Best parameters:", study_lgbm.best_params)

In [None]:
# Refit on the full training set with the best trial's hyperparameters.
# learning_rate is passed explicitly because it was fixed during tuning and is
# therefore not part of best_params. random_state and verbose mirror the
# settings used inside objective_lgbm, so the final fit is reproducible and
# configured like the candidates that were tuned.
model = LGBMClassifier(
    **study_lgbm.best_params,
    learning_rate=0.08118977030267217,
    random_state=rand_seed,
    verbose=-1,
)
model.fit(X, y)

# Column 1 of predict_proba: probability of the second class in model.classes_.
pred_lgb = model.predict_proba(X_test)[:, 1]

pred_lgb

### Submission

In [None]:
# Store the predicted probabilities in the submission frame's target column.
submission["loan_paid_back"] = pred_lgb

### Submit

In [None]:
# Write the submission file without the pandas index, then preview its first rows.
submission.to_csv("submission.csv", index=False)
submission.head()