In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [11]:
df = pd.read_csv('Task 3 and 4_Loan_Data.csv')
X = df.drop(columns=['customer_id', 'default'])
y = df['default']

In [12]:
# Display basic info
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975             

In [13]:
df.dropna(inplace=True)

In [14]:
# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [15]:
# Train Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
y_prob_log = log_reg.predict_proba(X_test)[:, 1]  # Get probability scores


In [16]:
# Train Decision Tree Model
dec_tree = DecisionTreeClassifier(max_depth=4)
dec_tree.fit(X_train, y_train)
y_pred_tree = dec_tree.predict(X_test)
y_prob_tree = dec_tree.predict_proba(X_test)[:, 1]


In [17]:
# Performance Evaluation
def evaluate_model(y_true, y_pred, y_prob, model_name):
    print(f"\n--- {model_name} Performance ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"AUC Score: {roc_auc_score(y_true, y_prob):.4f}")

evaluate_model(y_test, y_pred_log, y_prob_log, "Logistic Regression")
evaluate_model(y_test, y_pred_tree, y_prob_tree, "Decision Tree")



--- Logistic Regression Performance ---
Accuracy: 0.9960
Precision: 0.9971
Recall: 0.9799
AUC Score: 1.0000

--- Decision Tree Performance ---
Accuracy: 0.9935
Precision: 0.9855
Recall: 0.9770
AUC Score: 0.9995


In [19]:
# Function to predict PD and Expected Loss
def predict_default(loan_details, model, scaler, recovery_rate=0.1):
    loan_array = np.array(loan_details).reshape(1, -1)
    loan_scaled = scaler.transform(loan_array)
    pd_score = model.predict_proba(loan_scaled)[:, 1][0]  # Probability of Default
    expected_loss = pd_score * (1 - recovery_rate) * loan_details[1]  # Loan Amount at risk
    return round(pd_score, 4), round(expected_loss, 2)



In [21]:
# Example Loan: Adjusted dynamically based on feature count
feature_count = X.shape[1]
sample_loan = [55000, 15000] + [1] * (feature_count - 2)  # Adjusted to match dataset features

pd_score, expected_loss = predict_default(sample_loan, log_reg, scaler, feature_count)

print(f"\nPredicted Probability of Default: {pd_score}")
print(f"Expected Loss: ${expected_loss}")



Predicted Probability of Default: 1.0
Expected Loss: $-75000.0


