In [3]:
import numpy as np
import pandas as pd

def optimal_bucketing(df: pd.DataFrame, B: int) -> list:
    """Split the score range into ``B`` contiguous buckets by maximum likelihood.

    Rows are first collapsed to one entry per distinct ``fico_score``.  A
    dynamic program then chooses the partition of the sorted distinct scores
    into ``B`` non-empty, contiguous buckets that maximises the sum over
    buckets of ``k*log(p) + (n-k)*log(1-p)``, where ``n`` is the number of
    rows in the bucket, ``k`` is the sum of ``default`` in the bucket and
    ``p = k/n``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'fico_score'`` and ``'default'``.
        NOTE(review): ``default`` is presumably a 0/1 indicator -- confirm
        against the data source.
    B : int
        Number of buckets; must satisfy ``1 <= B <= number of distinct
        scores``.

    Returns
    -------
    list of float
        The lowest score of each bucket, in ascending order (length ``B``).

    Raises
    ------
    ValueError
        If ``B < 1`` or if ``B`` exceeds the number of distinct scores
        (which includes the case of an empty ``df``).  Without this check
        the back-tracking step would read unset ``-1`` entries and silently
        return wrapped-around, duplicated boundaries.
    """
    if B < 1:
        raise ValueError(f"B must be a positive number of buckets, got {B}")

    # 1) collapse identical scores (groupby returns them in ascending order)
    g = df.groupby('fico_score')['default'].agg(['count', 'sum']).reset_index()
    scores   = g['fico_score'].to_numpy()
    totals   = g['count'].to_numpy()
    defaults = g['sum'].to_numpy()
    m        = len(scores)                  # number of distinct scores

    if B > m:
        raise ValueError(
            f"cannot form {B} non-empty buckets from {m} distinct score(s)"
        )

    # 2) prefix sums, so any bucket's totals are two lookups and a subtraction
    cum_tot = np.concatenate(([0], np.cumsum(totals)))
    cum_def = np.concatenate(([0], np.cumsum(defaults)))

    def bucket_ll(s, e):                    # half-open [s,e)
        """Binomial log-likelihood of the distinct scores with index in [s, e)."""
        k = cum_def[e] - cum_def[s]
        n = cum_tot[e] - cum_tot[s]
        # p would be exactly 0 or 1 here; the likelihood term is 0 in the
        # limit, and returning early avoids evaluating log(0).
        if k == 0 or k == n:
            return 0.0
        p = k / n
        return k*np.log(p) + (n-k)*np.log(1-p)

    # 3) DP over (number of distinct scores consumed, number of buckets used):
    #    dp[i, b] is the best log-likelihood for the first i scores split
    #    into b buckets; backtrack[i, b] is where the last bucket starts.
    #    Cost is O(B * m^2) calls to bucket_ll.
    dp        = np.full((m+1, B+1), -np.inf)
    backtrack = np.full((m+1, B+1), -1, dtype=int)
    dp[0, 0]  = 0.0

    for b in range(1, B+1):
        for i in range(b, m+1):
            best = -np.inf
            best_j = -1
            for j in range(b-1, i):
                cand = dp[j, b-1] + bucket_ll(j, i)
                # strict '>' keeps the smallest j among equally good splits
                if cand > best:
                    best, best_j = cand, j
            dp[i, b] = best
            backtrack[i, b] = best_j

    # 4) recover boundaries by walking the back-pointers from (m, B) to (0, 0)
    cuts = []
    i, b = m, B
    while b > 0:
        j = backtrack[i, b]
        cuts.append(float(scores[j]))       # first score in bucket
        i, b = j, b-1
    return sorted(cuts)

# ---------------- driver ----------------
# Read the loan data CSV, hand only the two columns the bucketing routine
# uses to optimal_bucketing, and print the resulting bucket boundaries.
B = 5
df = pd.read_csv("3_and_4_Loan_Data.csv")
cuts = optimal_bucketing(df.loc[:, ['fico_score', 'default']], B)
print("Bucket boundaries:", cuts)


Bucket boundaries: [408.0, 521.0, 581.0, 641.0, 697.0]
