In [1]:
import os
import shutil

os.makedirs("/root/.kaggle", exist_ok=True)
shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", 600)

In [2]:
!kaggle datasets download -d adarshsng/lending-club-loan-data-csv

Dataset URL: https://www.kaggle.com/datasets/adarshsng/lending-club-loan-data-csv
License(s): DbCL-1.0
Downloading lending-club-loan-data-csv.zip to /content
 91% 307M/339M [00:00<00:00, 636MB/s]
100% 339M/339M [00:00<00:00, 658MB/s]


In [3]:
!unzip lending-club-loan-data-csv.zip

Archive:  lending-club-loan-data-csv.zip
  inflating: LCDataDictionary.xlsx   
  inflating: loan.csv                


In [4]:
!pip install xgboost joblib



In [5]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [8]:
df = pd.read_csv("loan.csv", low_memory=False)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [9]:
cols = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "annual_inc",
    "dti",
    "emp_length",
    "home_ownership"
]

df = df[cols]
df.dropna(inplace=True)

In [10]:
# interest rate
df["int_rate"] = (
    df["int_rate"]
    .astype(str)
    .str.replace("%","", regex=False)
    .astype(float)
)

# term
df["term"] = df["term"].astype(str).str.extract('(\d+)').astype(int)

# employment length
df["emp_length"] = (
    df["emp_length"]
    .astype(str)
    .str.extract('(\d+)')
    .fillna(0)
    .astype(int)
)

# encode home ownership
df = pd.get_dummies(df, columns=["home_ownership"], drop_first=True)

  df["term"] = df["term"].astype(str).str.extract('(\d+)').astype(int)
  .str.extract('(\d+)')


In [11]:
df["monthly_income"] = df["annual_inc"] / 12
df["emi_ratio"] = df["installment"] / df["monthly_income"]

df["emi_stress_score"] = (
    df["emi_ratio"] * 50
    + df["dti"] * 0.5
    + df["int_rate"] * 0.5
)

df["emi_stress_score"] = np.clip(df["emi_stress_score"], 0, 100)

In [12]:
df["default_probability"] = (
    0.3 * df["emi_ratio"]
    + 0.02 * df["dti"]
    + 0.01 * df["int_rate"]
)

df["default_probability"] = np.clip(df["default_probability"], 0, 1)

In [13]:
growth_rate = 0.05
df["future_income_5yr"] = df["annual_inc"] * ((1 + growth_rate) ** 5)

In [14]:
df["annual_expense"] = df["annual_inc"] * 0.55

In [15]:
df["months_active"] = np.minimum(df["term"], 60)
df["total_emi_paid_5yr"] = df["installment"] * df["months_active"]

In [16]:
df["total_income_5yr"] = df["annual_inc"] * 5
df["total_expense_5yr"] = df["annual_expense"] * 5

df["savings_5yr"] = (
    df["total_income_5yr"]
    - df["total_expense_5yr"]
    - df["total_emi_paid_5yr"]
)

In [17]:
df["monthly_expense"] = df["annual_expense"] / 12
df["emergency_buffer_months"] = df["savings_5yr"] / (df["monthly_expense"] * 12)
df["emergency_buffer_months"] = np.clip(df["emergency_buffer_months"], 0, 24)

In [18]:
df["health_score"] = (
    0.30 * (df["savings_5yr"] / df["savings_5yr"].max()) * 100
    + 0.20 * (1 - df["default_probability"]) * 100
    + 0.20 * (1 - df["emi_stress_score"] / 100) * 100
    + 0.15 * (df["future_income_5yr"] / df["future_income_5yr"].max()) * 100
    + 0.15 * (df["emergency_buffer_months"] / 24) * 100
)

df["health_score"] = np.clip(df["health_score"], 0, 100)

In [19]:
X = df.drop("health_score", axis=1)
y = df["health_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:
model3 = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

model3.fit(X_train, y_train)

In [21]:
y_pred = model3.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

MSE: 0.016668187640447953
R2: 0.9993994891207536


In [22]:
joblib.dump(model3, "financial_health_model.pkl")
joblib.dump(X.columns.tolist(), "health_model_features.pkl")

['health_model_features.pkl']