In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import warnings

# Keep the notebook output readable by silencing FutureWarning messages.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# --- CONFIGURATION ---
# Location of the loan CSV that train_pd_model() reads.
FILE_PATH = "/content/Task 3 and 4_Loan_Data.csv"

# LGD (loss given default) is derived as the complement of the recovery rate.
RECOVERY_RATE = 0.10
LGD = 1.0 - RECOVERY_RATE

# Input columns for the PD model, in the order they are passed to it.
FEATURES = [
    "credit_lines_outstanding",
    "loan_amt_outstanding",
    "total_debt_outstanding",
    "income",
    "years_employed",
    "fico_score",
]

# Module-level slots for the trained model and its feature names;
# both stay None until train_pd_model() assigns them.
pd_model = feature_columns = None

def train_pd_model(file_path):
    """
    Loads data, prepares features, and trains a Random Forest model
    to predict the Probability of Default (PD).

    Args:
        file_path: Path of the CSV file to load. The file must contain every
                   column listed in FEATURES plus a 'default' target column.

    Returns:
        The fitted RandomForestClassifier, or None when the file does not
        exist (an error message is printed in that case).

    Side effects:
        On success, stores the fitted model in the global `pd_model` and the
        list of feature names in the global `feature_columns`, and prints a
        short training summary including the AUC on the held-out split.
    """
    global pd_model, feature_columns

    # 1. Load Data
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # 2. Prepare Data (select the predictive columns and the target)
    X = data[FEATURES]
    y = data['default']

    # Hold out 20% of the rows for evaluation, keeping the class ratio intact
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 3. Train Random Forest Classifier for PD Prediction
    # Random Forest is chosen for its good handling of non-linear relationships
    # and minimal need for feature scaling.
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced'  # Helps with imbalanced default/non-default classes
    )
    model.fit(X_train, y_train)

    # Publish the globals only once fitting has succeeded, so a failure above
    # never leaves an unfitted estimator visible to calculate_expected_loss().
    pd_model = model
    feature_columns = X.columns.tolist()

    # Evaluation (Optional but good practice)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    print("--- Model Training Complete ---")
    print(f"Random Forest PD Model trained on {len(X_train)} samples.")
    print(f"Model AUC on test set: {auc:.4f}")

    return pd_model

def calculate_expected_loss(loan_details: dict) -> "float | str":
    """
    Predicts the Probability of Default (PD) and calculates the Expected Loss (EL)
    for a single loan application.

    EL = PD * EAD * LGD

    Args:
        loan_details: A dictionary containing all required features and the
                      loan amount outstanding (EAD).
                      Example: {
                          'credit_lines_outstanding': 1,
                          'loan_amt_outstanding': 5000.00,
                          'total_debt_outstanding': 15000.00,
                          'income': 75000.00,
                          'years_employed': 5,
                          'fico_score': 650
                      }

    Returns:
        The calculated Expected Loss (float). For backward compatibility,
        problems are reported by returning an error message string instead of
        raising: when no model has been trained yet, or when a required
        feature is absent (or None) in `loan_details`.
    """
    if pd_model is None:
        return "Error: PD Model has not been trained. Please run train_pd_model() first."

    # 1. Validate input, then prepare EAD and Feature Vector

    # pd.DataFrame([dict], columns=...) does not raise for absent keys; it
    # silently fills them with NaN. Check explicitly so an incomplete
    # application is rejected instead of being scored on NaN inputs.
    missing = [name for name in feature_columns if loan_details.get(name) is None]
    if missing:
        return f"Error: Missing features in input. Required features are: {feature_columns}"

    # EAD (Exposure at Default) is the loan amount outstanding
    EAD = loan_details.get('loan_amt_outstanding', 0.0)

    # Build a one-row DataFrame whose column order matches the training data;
    # keys that are not model features are dropped by the column selection.
    input_data = pd.DataFrame([loan_details], columns=feature_columns)

    # 2. Predict Probability of Default (PD)
    # predict_proba returns [[Prob_NoDefault, Prob_Default]]
    PD = float(pd_model.predict_proba(input_data)[0, 1])

    # 3. Calculate Expected Loss (EL)
    # EL = PD * EAD * LGD
    expected_loss = float(PD * EAD * LGD)

    # The recovery percentage in the label is derived from LGD rather than
    # hard-coded, so the message stays correct if the configuration changes.
    print(f"\n--- Expected Loss Calculation ---")
    print(f"Loan Amount Outstanding (EAD): ${EAD:,.2f}")
    print(f"Predicted Probability of Default (PD): {PD:.4f}")
    print(f"Loss Given Default (LGD, 1 - {1.0 - LGD:.0%} Recovery): {LGD:.2f}")

    return expected_loss

# --- EXECUTION ---

# Step 1: Train the Probability of Default (PD) model
train_pd_model(FILE_PATH)

# Step 2: Define a sample borrower's details for testing
# Example Borrower 1 (High Risk - Low income, high debt, low FICO)
borrower_high_risk = {
    'credit_lines_outstanding': 6,
    'loan_amt_outstanding': 8000.00,
    'total_debt_outstanding': 30000.00,
    'income': 20000.00,
    'years_employed': 1,
    'fico_score': 550
}

# Example Borrower 2 (Low Risk - High income, low debt, high FICO)
borrower_low_risk = {
    'credit_lines_outstanding': 1,
    'loan_amt_outstanding': 1000.00,
    'total_debt_outstanding': 5000.00,
    'income': 150000.00,
    'years_employed': 10,
    'fico_score': 780
}


# Step 3: Calculate the Expected Loss for the samples

def _report_expected_loss(label, loan_details):
    """Compute the Expected Loss for one borrower, print it, and return it.

    calculate_expected_loss() reports problems by returning a message string
    rather than a number. Formatting such a string with `:,.2f` would raise a
    ValueError, so the message is printed as-is in that case.
    """
    result = calculate_expected_loss(loan_details)
    if isinstance(result, str):
        print(result)
    else:
        print(f"Expected Loss for {label} Borrower: ${result:,.2f}")
    return result


el_high_risk = _report_expected_loss("High-Risk", borrower_high_risk)

el_low_risk = _report_expected_loss("Low-Risk", borrower_low_risk)

--- Model Training Complete ---
Random Forest PD Model trained on 8000 samples.
Model AUC on test set: 0.9999

--- Expected Loss Calculation ---
Loan Amount Outstanding (EAD): $8,000.00
Predicted Probability of Default (PD): 1.0000
Loss Given Default (LGD, 1 - 10% Recovery): 0.90
Expected Loss for High-Risk Borrower: $7,200.00

--- Expected Loss Calculation ---
Loan Amount Outstanding (EAD): $1,000.00
Predicted Probability of Default (PD): 0.0000
Loss Given Default (LGD, 1 - 10% Recovery): 0.90
Expected Loss for Low-Risk Borrower: $0.00
