In [8]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Load data
df = pd.read_csv('Task 3 and 4_Loan_Data.csv')

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Train a logistic regression model (for demonstration purposes)
X_train = train_data[['fico_score']]
y_train = train_data['default']

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict default probabilities for the training set
train_data['Predicted_Prob'] = model.predict_proba(X_train)[:, 1]

# number o buckets
num_buckets = 5

# Initialize bucket boundaries
initial_boundaries = np.linspace(df['fico_score'].min(), df['fico_score'].max(), num_buckets + 1)[1:-1]

# Optimization objective: Negative log-likelihood
def neg_log_likelihood(boundaries):
    if any(np.diff(boundaries) <= 0):
        return np.inf
    buckets = np.concatenate([[df['fico_score'].min()], boundaries, [df['fico_score'].max()]])
    
    train_data['Bucket'] = pd.cut(train_data['fico_score'], bins=buckets, labels=False)
    
    bucket_counts = train_data.groupby('Bucket')['default'].agg(['count', 'sum']).fillna(0)
    
    bucket_probs = bucket_counts['sum'] / bucket_counts['count']
    
    neg_ll = -np.sum(np.log(bucket_probs))
    
    return neg_ll

# Perform optimization
result = minimize(neg_log_likelihood, initial_boundaries, method='Nelder-Mead', bounds=[(df['fico_score'].min(), df['fico_score'].max())] * (num_buckets - 1))

# Get optimal boundaries
optimal_boundaries = np.concatenate([[df['fico_score'].min()], result.x, [df['fico_score'].max()]])
train_data['Optimal_Bucket'] = pd.cut(train_data['fico_score'], bins=optimal_boundaries, labels=False)

# Calculate log loss using optimal boundaries
logloss = log_loss(train_data['default'], train_data['Predicted_Prob'])
print('Optimal Bucket Boundaries:', optimal_boundaries)
print('Log Loss with Optimal Boundaries:', logloss)


  result = getattr(ufunc, method)(*inputs, **kwargs)


Optimal Bucket Boundaries: [408.         476.5251898  477.03575435 601.46732594 809.41752429
 850.        ]
Log Loss with Optimal Boundaries: 0.4277804350092782
