In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def optimized_quantize_fico_mse(data, fico_column, num_buckets=10):
    """
    Optimized quantization of FICO scores into buckets using Mean Squared Error (MSE).
    
    Parameters:
        data (pd.DataFrame): Dataset containing FICO scores.
        fico_column (str): Column name for FICO scores.
        num_buckets (int): Number of buckets to create.
        
    Returns:
        list: Optimized bucket boundaries.
        pd.DataFrame: Dataset with bucket assignments.
    """
    # Sort the data by FICO score
    data_sorted = data.sort_values(by=fico_column).reset_index(drop=True)
    fico_scores = data_sorted[fico_column].values
    n = len(fico_scores)

    # Compute cumulative sums for FICO scores and squared values
    cum_sum = np.cumsum(fico_scores)
    cum_sq_sum = np.cumsum(fico_scores**2)

    # Helper function to compute MSE for a range [start, end]
    def get_mse(start, end):
        total = cum_sum[end] - (cum_sum[start - 1] if start > 0 else 0)
        total_sq = cum_sq_sum[end] - (cum_sq_sum[start - 1] if start > 0 else 0)
        count = end - start + 1
        mean = total / count
        mse = total_sq - 2 * mean * total + count * mean**2
        return mse

    # Initialize DP arrays
    dp = np.zeros((num_buckets, n))
    splits = np.zeros((num_buckets, n), dtype=int)

    # Base case: single bucket
    for i in range(n):
        dp[0][i] = get_mse(0, i)

    # Recursive case: multiple buckets
    for k in range(1, num_buckets):
        for i in range(k, n):
            min_mse, min_split = float('inf'), -1
            for j in range(k - 1, i):
                mse = dp[k - 1][j] + get_mse(j + 1, i)
                if mse < min_mse:
                    min_mse, min_split = mse, j
            dp[k][i] = min_mse
            splits[k][i] = min_split

    # Backtrack to find the bucket boundaries
    boundaries = []
    end = n - 1
    for k in range(num_buckets - 1, -1, -1):
        boundaries.append(fico_scores[splits[k][end] + 1])
        end = splits[k][end]
    boundaries = sorted(set(boundaries))

    # Assign buckets to data
    data_sorted['bucket'] = np.digitize(data_sorted[fico_column], boundaries, right=False)

    return boundaries, data_sorted



In [3]:
# Load the dataset
file_path = 'data/Task 3 and 4_Loan_Data.csv'
loan_data = pd.read_csv(file_path)

# Apply K-Means quantization for FICO scores
fico_boundaries_kmeans, fico_buckets_kmeans = optimized_quantize_fico_mse(
    loan_data, fico_column="fico_score", num_buckets=5
)

# Print the results
print("Bucket Boundaries (K-Means):", fico_boundaries_kmeans)
print("\nSample Data with Buckets:")
print(fico_buckets_kmeans.head())


Bucket Boundaries (K-Means): [409, 553, 608, 655, 707]

Sample Data with Buckets:
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      7264776                         1           4457.914800   
1      6901345                         3           5281.352243   
2      2585781                         4           6734.984475   
3      1252008                         5           5176.915602   
4      1337395                         5           4271.314690   

   total_debt_outstanding       income  years_employed  fico_score  default  \
0             12233.49501  98913.32028               3         408        0   
1             16411.51801  79905.09892               1         409        1   
2             26384.58439  97668.03091               2         418        1   
3             22990.26543  82417.59227               2         425        1   
4             22756.28103  83475.30929               4         438        1   

   bucket  
0       0  
1       1  
2       1 