In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt



# Read the loan book from disk
dataframe = pd.read_csv('Task 3 and 4_Loan_Data.csv')

# Engineer two income-normalised ratios: outstanding loan amount / income
# and total outstanding debt / income
for ratio_column, numerator_column in (
    ('payment_to_income', 'loan_amt_outstanding'),
    ('debt_to_income', 'total_debt_outstanding'),
):
    dataframe[ratio_column] = dataframe[numerator_column] / dataframe['income']

# Model inputs, and the default flag (1/0) the models learn to predict
features = ['credit_lines_outstanding', 'debt_to_income', 'payment_to_income', 'years_employed', 'fico_score']
target = 'default'

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible
feature_matrix = dataframe[features]
labels = dataframe[target]
xTrain, xTest, yTrain, yTest = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in
scaler = StandardScaler()
scaler.fit(xTrain)
xTrain_scaled = scaler.transform(xTrain)
xTest_scaled = scaler.transform(xTest)

# Candidate classifiers, keyed by the label used when reporting results
models = {}
models['Logistic Regression'] = LogisticRegression(solver='liblinear', max_iter=10000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)

# Fit every candidate and report its AUC-ROC on the held-out rows
for name, model in models.items():
    # fit() returns the estimator itself, so the probability of class 1
    # (default) can be read straight off the fitted model
    y_pred_proba = model.fit(xTrain_scaled, yTrain).predict_proba(xTest_scaled)[:, 1]

    auc = roc_auc_score(yTest, y_pred_proba)

    print(f"{name} AUC-ROC: {auc:.4f}")

Logistic Regression AUC-ROC: 1.0000
Decision Tree AUC-ROC: 0.9852
Random Forest AUC-ROC: 0.9999


In [9]:
# Final Function: Expected Loss Calculation based on PD
def calculate_expected_loss(loan_data, model, scaler, recovery_rate=0.10, feature_names=None):
    """Estimate the expected loss on a single loan from its predicted PD.

    The loss is computed as::

        expected_loss = PD * (1 - recovery_rate) * loan_amt_outstanding

    Parameters
    ----------
    loan_data : mapping
        Fields describing one loan. Must contain 'loan_amt_outstanding' and
        every column named in ``feature_names``. The ratio columns
        'payment_to_income' and 'debt_to_income' may be omitted; they are
        then derived from 'loan_amt_outstanding' / 'income' and
        'total_debt_outstanding' / 'income' respectively. A ratio that is
        supplied explicitly is used as given.
    model : object
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is read as the probability of default.
    scaler : object
        Fitted transformer exposing ``transform``; applied to the selected
        feature columns before prediction.
    recovery_rate : float, default 0.10
        Fraction of the outstanding amount assumed to be recovered after a
        default. Must lie in [0, 1].
    feature_names : sequence of str, optional
        Columns fed to ``scaler`` and ``model``, in order. When omitted, the
        notebook-level ``features`` list is used.

    Returns
    -------
    tuple
        ``(expected_loss, pd_pred)``.

    Raises
    ------
    ValueError
        If ``recovery_rate`` is outside [0, 1].
    KeyError
        If a required field is missing from ``loan_data`` and cannot be
        derived.
    """
    # A rate outside [0, 1] would silently yield a negative or inflated loss
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(f"recovery_rate must be between 0 and 1, got {recovery_rate!r}")

    # Fall back to the notebook-level feature list when none is passed in
    if feature_names is None:
        feature_names = features
    feature_names = list(feature_names)

    # Prepare input data as a one-row frame
    loan_df = pd.DataFrame([loan_data])

    # Fill in any required ratio column the caller did not supply, so callers
    # do not have to recompute the ratios by hand
    ratio_numerators = {
        'payment_to_income': 'loan_amt_outstanding',
        'debt_to_income': 'total_debt_outstanding',
    }
    for ratio_col, numerator_col in ratio_numerators.items():
        if ratio_col in feature_names and ratio_col not in loan_df.columns:
            loan_df[ratio_col] = loan_df[numerator_col] / loan_df['income']

    # Select the model's feature columns, in order
    loan_features = loan_df[feature_names]

    # Standardize the features with the supplied scaler
    loan_features_scaled = scaler.transform(loan_features)

    # Predict the probability of default being 1
    pd_pred = model.predict_proba(loan_features_scaled)[:, 1][0]

    # Calculate the expected loss on the outstanding amount
    loan_amount = loan_df['loan_amt_outstanding'].values[0]
    expected_loss = pd_pred * (1 - recovery_rate) * loan_amount

    return expected_loss, pd_pred

In [10]:
# Example loan data: raw borrower fields
credit_lines_outstanding = 3
years_employed = 30
fico_score = 700
income = 20000
loan_amt_outstanding = 15000
total_debt_outstanding = 20000

# Ratio features are always derived from the raw fields above (never typed in
# by hand), so the model inputs cannot drift out of sync with the raw amounts
debt_to_income = total_debt_outstanding/income
payment_to_income = loan_amt_outstanding/income

# One loan record: the model's feature columns plus the raw amounts that the
# expected-loss calculation reads
test_loan = {
    'credit_lines_outstanding': credit_lines_outstanding,
    'debt_to_income': debt_to_income,
    'payment_to_income': payment_to_income,
    'years_employed': years_employed,
    'fico_score': fico_score,
    'income': income,
    'loan_amt_outstanding': loan_amt_outstanding,
    'total_debt_outstanding': total_debt_outstanding
}

# Use the best model (adjust based on performance)
best_model = models['Logistic Regression']
expected_loss, predicted_pd = calculate_expected_loss(test_loan, best_model, scaler)

print(f"Predicted PD: {predicted_pd:.4f}")
print(f"Expected Loss: ${expected_loss:.2f}")

Predicted PD: 0.8048
Expected Loss: $10865.44
