In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Read the loan data from disk.
file_path = 'Task 3 and 4_Loan_Data.csv'
df = pd.read_csv(file_path)

# Separate the target from the predictors; 'customer_id' is dropped along
# with the target so it is not used as a feature.
y = df['default']
X = df.drop(columns=['default', 'customer_id'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit both classifiers on the scaled training features.
log_reg_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Function to predict probability of default
def predict_default_probability(model, borrower_details, scaler):
    """Return the model's predicted probability for a single borrower.

    The borrower's feature values are arranged as one row, passed through
    ``scaler.transform`` and then through ``model.predict_proba``. The value
    returned is the second column of the ``predict_proba`` output, i.e. the
    probability of the model's second class (presumably ``default == 1`` for
    a 0/1 target -- confirm against ``model.classes_``).

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        borrower_details: Numeric feature values for one borrower, in the
            same order as the columns the scaler was fitted on.
        scaler: Fitted transformer exposing ``transform``. If it carries a
            ``feature_names_in_`` attribute, the row is labelled with those
            names before being transformed.

    Returns:
        The predicted probability as a plain ``float``.

    Raises:
        ValueError: If the scaler exposes ``feature_names_in_`` and the
            number of supplied values does not match the number of names.
    """
    borrower_arr = np.asarray(borrower_details, dtype=float).reshape(1, -1)

    # A scaler fitted on a DataFrame remembers its column names; handing it a
    # bare array afterwards makes scikit-learn warn about missing feature
    # names on every call. Re-attaching the names keeps the call quiet and
    # lets us report a wrong feature count with a clear message.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        expected = len(feature_names)
        received = borrower_arr.shape[1]
        if received != expected:
            raise ValueError(
                f"borrower_details has {received} values but the scaler "
                f"expects {expected} features: {list(feature_names)}"
            )
        borrower_arr = pd.DataFrame(borrower_arr, columns=list(feature_names))

    borrower_arr_scaled = scaler.transform(borrower_arr)
    # Column 1 of predict_proba is the probability of the second class.
    prob_default = model.predict_proba(borrower_arr_scaled)[:, 1]
    return float(prob_default[0])

# Demonstration: score one borrower with both fitted models.
# The values must be listed in the same order as the X_train columns.
borrower_details = [0, 5221.545, 3915.471, 78039.39, 5, 605]
log_reg_prob_default, rf_prob_default = (
    predict_default_probability(fitted_model, borrower_details, scaler)
    for fitted_model in (log_reg_model, rf_model)
)

print(f"Estimated probability of default (Logistic Regression): {log_reg_prob_default:.4f}")
print(f"Estimated probability of default (Random Forest): {rf_prob_default:.4f}")



Estimated probability of default (Logistic Regression): 0.0000
Estimated probability of default (Random Forest): 0.0000


