In [3]:
#Load and preprocess the Data
#Import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

#pip install fastapi uvicorn (for FastAPI and Uvicorn)
#We are adding an API endpoint so the model can male real-time load risk predictions


In [4]:
# Load the dataset
df = pd.read_csv("lending_club_loan_two.csv")

# Display the column names to verify them
print(df.columns)

# Drop rows with missing values (if necessary)
df = df.dropna()  

# Or you could use df.fillna() to replace NaN values with a specific value

#Size and head of the df
print(df.head)
print(df.info)

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'purpose', 'title',
       'dti', 'earliest_cr_line', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'application_type',
       'mort_acc', 'pub_rec_bankruptcies', 'address'],
      dtype='object')
<bound method NDFrame.head of         loan_amnt        term  int_rate  installment grade sub_grade  \
0         10000.0   36 months     11.44       329.48     B        B4   
1          8000.0   36 months     11.99       265.68     B        B5   
2         15600.0   36 months     10.49       506.97     B        B3   
3          7200.0   36 months      6.49       220.65     A        A2   
4         24375.0   60 months     17.27       609.33     C        C5   
...           ...         ...       ...          ...   ...       ...   
396024     6000.0   36 

In [5]:
# Check unique values in the column
print(df['term'].unique())  
print(df['term'].dtype)  # Check data type

[' 36 months' ' 60 months']
object


Apply Standardization

In [7]:
from sklearn.preprocessing import StandardScaler

# Define columns to scale
features_to_scale = ['loan_amnt', 'int_rate', 'installment', 'emp_length', 
                     'annual_inc', 'dti', 'term']

# Initialize scaler
scaler = StandardScaler()

# Apply standardization (fit & transform)
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Print transformed data
print(df.head())

ValueError: could not convert string to float: '10+ years'

Load & preprocess the Data:
- We will use the lendingClub dataset for real world load data

In [None]:
# Select relevant columns
columns_to_use = ['loan_amnt', 'term', 'int_rate', 'installment', 'emp_length', 
                  'annual_inc', 'dti', 'loan_status']
df = df[columns_to_use]


# Ensure term column is string before splitting
df['term'] = df['term'].fillna("0 months")  # Replace NaN with default

# Convert the integers into string before proceeding
# The .split()[0] function only works if x is a string, but some values in your dataset might already be integers (or missing/NaN)

df['term'] = df['term'].astype(str).apply(lambda x: int(x.split()[0]) if 'month' in x else x)
# Convert '36 months' -> 36

# Encode categorical variables
df['loan_status'] = df['loan_status'].fillna("Unknown")  # Handle missing values
df['loan_status'] = df['loan_status'].apply(lambda x: 1 if x == 'Fully Paid' else 0)  # Default = 0

# Converting the strings into integer 
df['term'] = df['term'].astype(str).str.extract('(\d+)').astype(float)  # Extract numeric values safely

print(df.dtypes)

# Handle missing values
df.fillna(df.median(), inplace=True)

Split the data into training and testing sets:

In [None]:
# Split into training and testing sets
X = df.drop(columns=['loan_status'])
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Train Credit Risk Prediction Model:
- We will use Random Forest Classifier to predict loan default risk

In [None]:
# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(classification_report(y_test, y_pred))

Load portfolio optimisation (Monte Carlo Simulation)
 - We will use Monte Carlo simulation to optimize the loan portfolio

In [None]:
# Simulate different loan portfolios
num_simulations = 10000
portfolio_returns = []

for _ in range(num_simulations):
    weights = np.random.dirichlet(np.ones(X_test.shape[1]), size=1)  # Random weights
    simulated_return = np.dot(weights, X_test.mean(axis=0))  # Portfolio return
    portfolio_returns.append(simulated_return[0])

# Plot the simulated distribution of portfolio returns
sns.histplot(portfolio_returns, bins=50, kde=True)
plt.xlabel("Simulated Portfolio Return")
plt.ylabel("Frequency")
plt.title("Monte Carlo Simulation of Loan Portfolio")
plt.show()

Saving the model after training:
- We add this code to save the model & scaler
- This ensures that the model trained can be used later without retaining it

In [None]:
# pickle aka .pkl 
# #Used to store Python objects (lists, dictionaries, models, etc.) in a file for later use
import pickle 

# Save the trained model
with open("credit_risk_model.pkl", "wb") as f: #'wb' = Write binary
    pickle.dump(model, f)

# Save the feature scaler (important for consistent input transformation)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

#avoid loading .pkl files from untrusted sources (they can execute malicious code)
