In [1]:
import numpy as np
import pandas as pd

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the loan book from the CSV next to the notebook and preview the first rows.
loan_csv_path = 'Loan_Data.csv'
df = pd.read_csv(loan_csv_path)
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


In [4]:
# Count distinct customer IDs to see whether any borrower appears more than once.
customer_id_column = df['customer_id']
unique_customer_ids = customer_id_column.nunique()
print(f"Number of unique customer IDs: {unique_customer_ids}")

Number of unique customer IDs: 10000


In [9]:
# Tally how many rows fall in each value of the `default` target column.
default_flags = df['default']
default_counts = default_flags.value_counts()
print(default_counts)

default
0    8149
1    1851
Name: count, dtype: int64


In [5]:
# Remove the identifier column so it is not used as a model feature.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned in
# place, re-running the cell would otherwise raise KeyError on the
# already-dropped column.
df = df.drop(columns="customer_id", errors="ignore")

In [6]:
# Separate the target column from the remaining predictor columns.
target_column = "default"
y = df[target_column]                   # Target
X = df.drop(columns=[target_column])    # Features

In [10]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

In [13]:
# Confirm the stratified split kept the class proportions aligned across splits
for header, labels in (
    ("Train class distribution:", y_train),
    ("Test class distribution:", y_test),
):
    print(header)
    print(labels.value_counts(normalize=True))

Train class distribution:
default
0    0.814875
1    0.185125
Name: proportion, dtype: float64
Test class distribution:
default
0    0.815
1    0.185
Name: proportion, dtype: float64


In [14]:
# Same check as a raw row count per class rather than a proportion
for header, labels in (
    ("Train class distribution:", y_train),
    ("Test class distribution:", y_test),
):
    print(header)
    print(labels.value_counts(normalize=False))

Train class distribution:
default
0    6519
1    1481
Name: count, dtype: int64
Test class distribution:
default
0    1630
1     370
Name: count, dtype: int64


In [15]:
# Fit a decision tree on the scaled training split.
# random_state pins the tree's internal randomness (feature permutation and
# tie-breaking between equally good splits) so the metrics reported below are
# reproducible on Restart & Run All; 42 matches the seed used for the split.
# NOTE(review): with no depth / leaf-size limit the tree can grow until its
# leaves are pure, in which case predict_proba returns only 0.0 or 1.0 --
# consider constraining it if graded default probabilities are needed.
DTmodel = DecisionTreeClassifier(random_state=42)
DTmodel.fit(X_train, y_train)

In [17]:
# Score the held-out split with the fitted tree
y_pred = DTmodel.predict(X_test)

# Report the confusion matrix and the per-class precision/recall summary
evaluation_sections = (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("Classification Report:", classification_report(y_test, y_pred)),
)
for heading, section_body in evaluation_sections:
    print(heading)
    print(section_body)

Confusion Matrix:
[[1625    5]
 [   5  365]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1630
           1       0.99      0.99      0.99       370

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [39]:
def calculate_expected_loss(loan_features, loan_amount, recovery_rate=0.1,
                            model=None, feature_scaler=None):
    """
    Calculate the expected loss on a loan based on predicted probability of default (PD).

    Expected loss = loan_amount * PD * (1 - recovery_rate).

    The classifier in this notebook is trained on standardized features, so the
    raw borrower features are passed through the fitted scaler before scoring.
    Feeding unscaled values to the model would evaluate the tree's thresholds
    on the wrong scale and produce a meaningless PD.

    Parameters:
    loan_features (pd.DataFrame): Single-row DataFrame containing the borrower's
        raw (unscaled) loan features, with the same columns used for training.
    loan_amount (float): The loan amount in currency units.
    recovery_rate (float): The recovery rate (default is 0.1, i.e., 10%).
    model: Fitted classifier exposing ``predict_proba``. Defaults to the
        notebook-level ``DTmodel``.
    feature_scaler: Fitted transformer exposing ``transform``. Defaults to the
        notebook-level ``scaler``.

    Returns:
    float: The expected loss on the loan.
    """
    # Fall back to the objects fitted earlier in the notebook; the lookups are
    # deferred to call time so the function can be defined before they exist.
    if model is None:
        model = DTmodel
    if feature_scaler is None:
        feature_scaler = scaler

    # Apply the same preprocessing the model saw during training
    scaled_features = feature_scaler.transform(loan_features)

    # Predict probability of default (class 1) for the first (only) row
    probability_of_default = model.predict_proba(scaled_features)[0][1]
    print("Default Probability is ", probability_of_default)

    # Calculate expected loss: exposure * PD * loss given default
    expected_loss = loan_amount * probability_of_default * (1 - recovery_rate)

    return expected_loss

In [37]:
# Example Usage:
# Describe one borrower as a record, then wrap it into a single-row dataframe
borrower_record = {
    'credit_lines_outstanding': 5,
    'loan_amt_outstanding': 1958.928726,
    'total_debt_outstanding': 8228.75252,
    'income': 26648.43525,
    'years_employed': 2,
    'fico_score': 572,
}
borrower_details = pd.DataFrame([borrower_record])

In [40]:
# Exposure and assumed recovery for the example borrower
loan_amount = 50000
recovery_rate = 0.1  # 10%

# Score the borrower and report the expected loss in dollars
expected_loss = calculate_expected_loss(
    loan_features=borrower_details,
    loan_amount=loan_amount,
    recovery_rate=recovery_rate,
)
print(f"Expected Loss: ${expected_loss:.2f}")

Default Probability is  1.0
Expected Loss: $45000.00


