In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings

# Silence every Python warning process-wide so the printed report stays
# uncluttered.
# NOTE(review): this blanket filter also hides any warnings emitted while
# fitting the model -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# --- Constants ---
# CSV file holding the loan book.
FILE_PATH = "JPMC3.csv"

# Fraction of the exposure assumed to be recovered after a default, and the
# complementary fraction that is lost.
RECOVERY_RATE = 0.10
LOSS_GIVEN_DEFAULT = 1.0 - RECOVERY_RATE

# Predictor columns, in the order they are fed to the scaler and the model.
# The identifier column ('customer_id') is intentionally left out.
FEATURE_COLS = [
    "credit_lines_outstanding",
    "loan_amt_outstanding",
    "total_debt_outstanding",
    "income",
    "years_employed",
    "fico_score",
]

# Label column to predict: 0 = no default, 1 = default.
TARGET_COL = "default"


def calculate_expected_loss(loan_details, model, scaler, feature_cols, lgd, exposure=1.0):
    """
    Estimate the Probability of Default (PD) and Expected Loss (EL) for a single borrower.

    EL is computed as ``PD * lgd * exposure``. With the default ``exposure``
    of 1.0 the result is a loss *rate* (the fraction of the exposure expected
    to be lost); pass the amount at risk to obtain the expected loss in the
    same units as that amount.

    Args:
        loan_details (dict): Feature name -> value for one borrower
                             (e.g., {'income': 50000, 'fico_score': 650, ...}).
                             Must contain every name in ``feature_cols``;
                             additional keys are ignored.
        model: Fitted classifier exposing ``predict_proba``. The second column
               of its output is taken as the probability of default.
        scaler: Fitted transformer exposing ``transform``; it must be the one
                that was fitted on the model's training features.
        feature_cols (list): Feature names in the order the scaler/model expect.
        lgd (float): Loss Given Default (1 - recovery_rate).
        exposure (float, optional): Amount at risk if the borrower defaults.
                                    Defaults to 1.0, which yields a loss rate.

    Returns:
        tuple: ``(probability_of_default, expected_loss)``, or ``(None, None)``
               if any step fails (the error is printed, not raised).
    """
    try:
        # 1. Build a one-row DataFrame so features can be selected by name,
        #    which makes the result independent of the dict's key order.
        input_df = pd.DataFrame([loan_details])

        # Keep only the model's features, in the exact training order.
        # A missing feature raises KeyError here and is reported below.
        input_data = input_df[feature_cols]

        # 2. Apply the *same* scaling that was fitted on the training data.
        input_scaled = scaler.transform(input_data)

        # 3. predict_proba() returns [[prob_class_0, prob_class_1]];
        #    class 1 is "default".
        prob_default = model.predict_proba(input_scaled)[0][1]

        # 4. Expected Loss = PD * LGD * exposure.
        expected_loss = prob_default * lgd * exposure

        return prob_default, expected_loss

    except Exception as e:
        # Deliberately best-effort: callers test the result against None
        # instead of handling exceptions, so report the problem and signal
        # failure through the return value.
        print(f"Error in calculating expected loss: {e}")
        return None, None

def main():
    """
    Load the loan data, train a default classifier, and demo the expected-loss helper.

    Steps:
        1. Read the CSV at ``FILE_PATH``.
        2. Verify the required columns exist and drop rows with missing
           values in those columns.
        3. Make a stratified 80/20 train/test split and standardize the
           features (scaler fitted on the training split only).
        4. Fit a class-balanced logistic regression.
        5. Print accuracy, confusion matrix, and classification report for
           the test split.
        6. Print PD and expected loss for two hand-written borrower profiles.

    Returns:
        None. Returns early, after printing a message, if the data cannot be
        loaded or lacks a required column.
    """
    print("--- Loan Default Prediction Model ---")

    # --- 1. Load Data ---
    try:
        # Read from the same constant the messages below report.
        data = pd.read_csv(FILE_PATH)
        print(f"Successfully loaded data from '{FILE_PATH}'. Found {len(data)} records.")
    except FileNotFoundError:
        print(f"Error: Could not find the file '{FILE_PATH}'.")
        print("Please make sure the file is in the same directory as this script.")
        return
    except Exception as e:
        print(f"An error occurred while loading the data: {e}")
        return

    # --- 2. Data Preprocessing ---

    # Fail with a readable message instead of a KeyError if the file does not
    # have the expected schema.
    required_cols = FEATURE_COLS + [TARGET_COL]
    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        print(f"Error: '{FILE_PATH}' is missing required column(s): {missing_cols}")
        return

    # Only the columns the model uses matter: a NaN in an unused column
    # (e.g. the identifier) should not cost us a training row.
    if data[required_cols].isnull().values.any():
        print("Warning: Missing values found. Dropping rows with NaNs.")
        data = data.dropna(subset=required_cols)

    # Define features (X) and target (y)
    X = data[FEATURE_COLS]
    y = data[TARGET_COL]

    # Split data into training and testing sets (80% train, 20% test).
    # stratify=y keeps the default rate the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Data split: {len(X_train)} training samples, {len(X_test)} testing samples.")

    # --- 3. Feature Scaling ---
    # Bring all features to a comparable range, which matters for logistic
    # regression. Fit on the training split ONLY so that no information from
    # the test split leaks into the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 4. Model Training (Logistic Regression) ---

    # Logistic regression is used because we need a PROBABILITY (a value
    # between 0 and 1); linear regression predicts unbounded values that
    # cannot be interpreted as probabilities.

    print("\nTraining Logistic Regression model...")
    # class_weight='balanced' compensates for one class (e.g., non-default)
    # being much more common than the other (e.g., default).
    model = LogisticRegression(class_weight='balanced', random_state=42)
    model.fit(X_train_scaled, y_train)
    print("Model training complete.")

    # --- 5. Model Evaluation ---
    print("\n--- Model Performance on Test Data ---")
    y_pred = model.predict(X_test_scaled)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # --- 6. Demonstrate the Expected Loss Function ---
    print("\n--- Expected Loss Calculation Demo ---")

    demo_borrowers = [
        # A "good" profile: high FICO, high income, low debt.
        ("Good Borrower", {
            'credit_lines_outstanding': 1,
            'loan_amt_outstanding': 2000,
            'total_debt_outstanding': 5000,
            'income': 80000,
            'years_employed': 10,
            'fico_score': 750,
        }),
        # A "risky" profile: low FICO, low income, high debt.
        ("Risky Borrower", {
            'credit_lines_outstanding': 5,
            'loan_amt_outstanding': 15000,
            'total_debt_outstanding': 25000,
            'income': 35000,
            'years_employed': 2,
            'fico_score': 580,
        }),
    ]

    for number, (label, borrower) in enumerate(demo_borrowers, start=1):
        prob_default, expected_loss = calculate_expected_loss(
            borrower, model, scaler, FEATURE_COLS, LOSS_GIVEN_DEFAULT
        )
        # The helper returns (None, None) after printing its own error message.
        if prob_default is None:
            continue

        # The FICO score in the heading is read from the profile so the two
        # can never drift apart; the LGD is formatted with a fixed precision
        # rather than relying on float multiplication producing a clean value.
        print(f"\nExample {number} ({label} - FICO: {borrower['fico_score']}):")
        print(f"  > Predicted Probability of Default (PD): {prob_default:.2%}")
        print(f"  > Expected Loss (at {LOSS_GIVEN_DEFAULT:.1%} LGD): {expected_loss:.2%}")

# Run the full train/evaluate/demo pipeline only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()

--- Loan Default Prediction Model ---
Successfully loaded data from 'JPMC3.csv'. Found 10000 records.
Data split: 8000 training samples, 2000 testing samples.

Training Logistic Regression model...
Model training complete.

--- Model Performance on Test Data ---
Accuracy: 0.9955

Confusion Matrix:
[[1621    9]
 [   0  370]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1630
           1       0.98      1.00      0.99       370

    accuracy                           1.00      2000
   macro avg       0.99      1.00      0.99      2000
weighted avg       1.00      1.00      1.00      2000


--- Expected Loss Calculation Demo ---

Example 1 (Good Borrower - FICO: 750):
  > Predicted Probability of Default (PD): 0.00%
  > Expected Loss (at 90.0% LGD): 0.00%

Example 2 (Risky Borrower - FICO: 580):
  > Predicted Probability of Default (PD): 100.00%
  > Expected Loss (at 90.0% LGD): 90.00%
