In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [2]:
loan_data = pd.read_csv("Loan_Data.csv")

In [3]:
loan_data.fico_score.min()

408

In [4]:
loan_data.fico_score.max()

850

In [5]:
# Number of buckets
k = 5

# Initial boundaries (simple initialization)
initial_boundaries = np.linspace(loan_data.fico_score.min(), loan_data.fico_score.max(), k+1)

def calculate_log_likelihood(boundaries):
    boundaries = np.sort(boundaries)
    log_likelihood = 0
    for i in range(len(boundaries) - 1):
        bucket_data = loan_data[(loan_data['fico_score'] >= boundaries[i]) & (loan_data['fico_score'] < boundaries[i+1])]
        ni = len(bucket_data)
        ki = bucket_data['default'].sum()
        pi = ki / ni if ni > 0 else 0.00001
        log_likelihood += ni * np.log(pi) + (ni - ki) * np.log(1 - pi)
    return -log_likelihood  # Minimize negative log-likelihood

# Optimize boundaries
result = minimize(calculate_log_likelihood, initial_boundaries[1:-1], bounds=[(300, 850)] * (k - 1), method='L-BFGS-B')
optimal_boundaries = np.concatenate(([300], result.x, [850]))

print("Optimal boundaries:", optimal_boundaries)


Optimal boundaries: [300.  496.4 584.8 673.2 761.6 850. ]


In [11]:
optimal_boundaries

array([300. , 496.4, 584.8, 673.2, 761.6, 850. ])