In [None]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the first 100,000 rows of the loan-status export
DATA_PATH = "Loan_status_2007-2020Q3.csv"
N_ROWS = 100000
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [None]:
# Save the first few rows of the dataset to a CSV file
df.head().to_csv('head.csv')

# Display the first few rows of the dataset.
# This has to be the last expression of the cell: Jupyter only renders the value
# of the final expression, so a `df.head()` placed before `to_csv` shows nothing.
df.head()

# Question 1 : Quick EDA

In [None]:
# Display the column names of the raw DataFrame
df.columns

In [None]:
# Summary of the raw DataFrame: column dtypes and non-null counts
df.info()

In [None]:
# Lift the row-display limit so the per-column counts below are not truncated.
# NOTE: this is a global pandas option and stays in effect for every later cell.
pd.options.display.max_rows = None

# Number of missing values in each column
df.isnull().sum()

In [None]:
# Columns removed before modelling
columns_to_drop = [
    # Index or row number, unique identifier
    'Unnamed: 0',
    'id',
    # Risk category, sub-category of risk, interest rate
    'grade',
    'sub_grade',
    'int_rate',
    # Deliberately kept (see the discussion below this cell):
    # 'last_pymnt_amnt',
    # 'last_fico_range_high', 'last_fico_range_low',
    # Funding amounts
    'funded_amnt',
    'funded_amnt_inv',
    # Total payments
    'total_pymnt',
    'total_rec_prncp',
    'total_rec_int',
    # Hardship / payment-plan columns
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'hardship_length',
    'hardship_dpd',
    'hardship_loan_status',
    'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount',
    'hardship_last_payment_amount',
    # Debt settlement indicator
    'debt_settlement_flag',
]

# Keep only the loans whose status is one of the two final outcomes,
# then remove the unwanted columns from those rows.
final_statuses = ['Fully Paid', 'Charged Off']
has_final_status = df['loan_status'].isin(final_statuses)
df_cleaned = df.loc[has_final_status].drop(columns=columns_to_drop)

The features `last_pymnt_amnt`, `last_fico_range_high` and `last_fico_range_low` have a strong influence on the loan status. They can be used to predict the loan status only if we assume that the prediction is made during the repayment period, after the loan has been approved. They are the three most important features in the dataset, and keeping them gives us good models, as we will see later in the notebook.
This works under the assumption that we are within the loan repayment period for prediction after the loan is approved.

In [None]:
# Show every remaining column name as a plain Python list
print(list(df_cleaned.columns))

In [None]:
# Second batch of columns to remove, grouped by name pattern
additional_columns_to_drop = [
    # URL and date columns
    'url',
    'issue_d',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
    # Outstanding principal, payment, late-fee and recovery columns
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt_inv',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    # `*_joint` and `sec_app_*` columns
    'annual_inc_joint',
    'dti_joint',
    'verification_status_joint',
    'revol_bal_joint',
    'sec_app_fico_range_low',
    'sec_app_fico_range_high',
    'sec_app_earliest_cr_line',
    # Limit / total-balance columns
    'total_rev_hi_lim',
    'tot_hi_cred_lim',
    'total_bal_ex_mort',
    'total_bc_limit',
    'total_il_high_credit_limit',
    # Remaining hardship columns
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'deferral_term',
    'hardship_amount',
    # contains only 1 and nan
    'policy_code',
]

# Remove the listed columns
df_cleaned = df_cleaned.drop(additional_columns_to_drop, axis=1)


In [None]:
# Column names that remain after both drop steps
df_cleaned.columns

In [None]:
# (rows, columns) before the missing-value filter
df_cleaned.shape

In [None]:
# Columns where more than this fraction of the values is missing get removed
threshold = 0.5

# Fraction (between 0 and 1) of missing values in each column
missing_percentage = df_cleaned.isnull().mean()

# Names of the columns whose missing fraction exceeds the threshold
columns_with_too_many_missing_values = list(
    missing_percentage.loc[missing_percentage > threshold].index
)

df_cleaned = df_cleaned.drop(columns_with_too_many_missing_values, axis=1)

In [None]:
# (rows, columns) after removing the columns with too many missing values
df_cleaned.shape

In [None]:
# Summary statistics of the remaining columns
df_cleaned.describe()

In [None]:
# Column dtypes and non-null counts of the cleaned DataFrame
df_cleaned.info()

In [None]:
# Inspect the target column (still the string labels 'Fully Paid' / 'Charged Off' here)
# NOTE(review): with display.max_rows set to None earlier, this prints every row — confirm that is wanted.
df_cleaned.loan_status

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Split the rows by outcome
fully_paid = df_cleaned.loc[df_cleaned['loan_status'] == 'Fully Paid']
charged_off = df_cleaned.loc[df_cleaned['loan_status'] == 'Charged Off']

# One panel per outcome: (rows, histogram colour, label shown in the title)
panels = [
    (fully_paid, 'blue', 'Fully Paid'),
    (charged_off, 'red', 'Charged Off'),
]

# Side-by-side density histograms of the loan amount
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (subset, colour, label) in zip(axes, panels):
    sns.histplot(subset['loan_amnt'], bins=50, kde=True, color=colour, stat='density', ax=ax)
    ax.set_title(f'Distribution of Loan Amount ({label})')
    ax.set_xlabel('Loan Amount')
    ax.set_ylabel('Density')

# Adjust layout for better spacing, then render
fig.tight_layout()
plt.show()


In [None]:
# Box plot of the loan amount for each loan status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_cleaned, x='loan_status', y='loan_amnt', ax=ax)
ax.set_title('Loan Amount by Loan Status')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Loan Amount')
# Rotate x-axis labels to vertical
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of each category (value_counts sorts by frequency, most common first)
loan_status_counts = df_cleaned['loan_status'].value_counts()

# Create a bar plot.
# The bars are labelled directly from the value_counts index, so every label is
# guaranteed to belong to its bar. The previous hard-coded tick labels assumed a
# fixed order and would silently mislabel the bars if the class frequencies flipped.
plt.figure(figsize=(8, 6))
plt.bar(loan_status_counts.index.astype(str), loan_status_counts.values, color='skyblue', edgecolor='black')
plt.title('Distribution of Loan Status')
plt.xlabel('Loan Status')
plt.ylabel('Frequency')
plt.show()


The data is imbalanced; we will undersample the majority class later to balance it.

In [None]:
# Encode the target as an integer: 0 = Fully Paid, 1 = Charged Off.
# NOTE: re-running this cell on the already-encoded column maps every value to NaN.
status_codes = {'Fully Paid': 0, 'Charged Off': 1}
df_cleaned['loan_status'] = df_cleaned['loan_status'].map(status_codes)

# Check which values are present after the encoding
print(df_cleaned['loan_status'].unique())

In [None]:
# (rows, columns) of the original DataFrame, for comparison with the cleaned one
df.shape

In [None]:
# (rows, columns) of the cleaned DataFrame
df_cleaned.shape

In [None]:
# Column dtypes and non-null counts after the target has been encoded
df_cleaned.info()

In [None]:
# Names of all columns stored with the object dtype
object_columns = df_cleaned.select_dtypes(include='object').columns
print(object_columns)

In [None]:
# Object columns that are NOT one-hot encoded. They keep the object dtype and are
# removed later, when all object columns are dropped before modelling.
columns_excluded_from_encoding = [
    'zip_code',
    'revol_util',
    'earliest_cr_line',
    'initial_list_status',
    'emp_title',
    'home_ownership',
    'verification_status',
    'title',
    'addr_state',
    'application_type',
]

# Every other object column is one-hot encoded
columns_to_encode = object_columns.drop(columns_excluded_from_encoding)

In [None]:
# Object columns selected for one-hot encoding
columns_to_encode

In [None]:
# One-hot encode the selected columns; drop_first=True omits the first level of each column
df_encoded = pd.get_dummies(df_cleaned, columns=columns_to_encode, drop_first=True)

In [None]:
# (rows, columns) after one-hot encoding
df_encoded.shape

In [None]:
# Column dtypes and non-null counts of the encoded DataFrame
df_encoded.info()

# Question 2 : Possible Modeling Approaches

### 1. Logistic Regression

**Example:** Implement a `LogisticRegression` model to predict `loan_status`. Logistic Regression is a simple yet effective model for binary classification problems.

**Pros:**
- **Interpretability:** Coefficients indicate how features influence the outcome, making it easy to understand.
- **Simplicity:** Quick to train and requires minimal computational resources.
- **Baseline Model:** Serves as a strong baseline for comparison against more complex models.

**Cons:**
- **Linearity Assumption:** Assumes a linear relationship between features and the log-odds of the target, which may not always be accurate.
- **Limited Complexity:** May struggle to capture non-linear relationships and feature interactions.

### 2. Random Forest

**Example:** Use a `RandomForestClassifier` to handle complex interactions between features and improve prediction accuracy.

**Pros:**
- **Performance:** Generally offers high accuracy due to its ensemble nature, which reduces overfitting.
- **Feature Importance:** Provides insights into which features are most important in making predictions.
- **Robustness:** Handles both numerical and categorical data well, and is less sensitive to outliers.

**Cons:**
- **Interpretability:** More challenging to interpret than Logistic Regression, as it involves multiple decision trees.
- **Computational Cost:** Requires more computational power and memory, especially with large datasets or many trees.

If the classes are (approximately) linearly separable, Logistic Regression is generally a good choice. However, if the relationship is more complex or non-linear, Random Forest may be a better option.

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the non-object columns, then discard every row that still contains a NaN
df_numeric = df_encoded.select_dtypes(exclude=['object']).dropna()

In [None]:
# Column dtypes and non-null counts of the modelling table
df_numeric.info()

### Data Imbalance

Before training our classifier, we will address the data imbalance by using undersampling. This technique involves randomly reducing the number of instances in the "Fully Paid" class to balance the dataset.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
target_column = 'loan_status'
y = df_numeric[target_column]               # Target
X = df_numeric.drop(target_column, axis=1)  # Features

print(X.shape, y.shape)

In [None]:

# Apply undersampling to the entire dataset
# NOTE(review): resampling happens before the train/test split, so the test set is
# class-balanced too rather than reflecting the original class ratio — confirm this is intended.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Display the class distribution before and after resampling
print(f'Original class distribution:\n{y.value_counts()}')
print(f'Resampled class distribution:\n{y_resampled.value_counts()}')


In [None]:
# Split the resampled dataset into training (70%) and testing (30%) sets.
# stratify=y_resampled keeps the class ratio the same in both splits; a purely
# random split can leave the two classes unevenly represented in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
    stratify=y_resampled,
)

# Verify the split
print(f'Training set size: {len(y_train)}')
print(f'Test set size: {len(y_test)}')

In [None]:
# Column dtypes and non-null counts of the feature matrix
X.info()

In [None]:
# Correlation of every column with the target variable (loan_status), sorted in descending order
correlation = df_numeric.corr()['loan_status'].sort_values(ascending=False)

# Plot the correlation of all features with the target (the target's own entry is removed)
plt.figure(figsize=(10, 12))
correlation.drop('loan_status').plot(kind='barh')
plt.title('Correlation with Loan Status')
plt.show()

### scaling
We scale X_test using the parameters fitted on X_train, so that no information from the test set leaks into training (data leakage).

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Step 4: Feature scaling
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Question 3 : Model implementation and explanability

For Question 3, we will implement a logistic regression model and explain its coefficients, as well as a random forest model with SHAP values to explain the model's predictions.

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


# Fit the Logistic Regression model on the scaled training data
# (max_iter is raised to 1000; increase it further if needed)
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predicted classes for the scaled test data
y_pred = model.predict(X_test_scaled)

# Evaluation metrics
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the logistic regression predictions
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predicted probability of the positive class (second column of predict_proba)
y_prob = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'Logistic Regression (AUC = {auc:.4f})')
# Diagonal reference line (random chance)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()


We can conclude that the model performs well, with an accuracy of more than 0.92, setting a baseline for the random forest model. It also suggests that the features are close to linearly related to the log-odds of the target variable.

`note` : dropping the features last_pymnt_amnt, last_fico_range_high, last_fico_range_low will decrease the accuracy of the model to approximately 0.68 .

In [None]:
import pandas as pd

# Pair every feature name with its logistic-regression coefficient,
# ordered from the largest coefficient to the smallest
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_[0]})
    .sort_values('Coefficient', ascending=False)
)
print(feature_importance)


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the logistic-regression coefficients
fig, ax = plt.subplots(figsize=(10, 18))
ax.barh(feature_importance['Feature'], feature_importance['Coefficient'])
ax.set_xlabel('Coefficient Value')
ax.set_title('Feature Importance in Logistic Regression')
plt.show()


# Question 4 : Model explainability LR

## Interpretation and Feature Importance: 

The magnitude of the coefficient indicates the strength of the relationship. Larger coefficients (in absolute value) indicate that changes in the feature have a larger impact on the predicted probability.

The sign (positive or negative) of a coefficient indicates the direction of the relationship between the feature and the target. A positive coefficient means that as the feature value increases, the likelihood of the positive class (here class 1, i.e. Charged Off) increases. Conversely, a negative coefficient means that an increase in the feature value decreases the likelihood of the positive class.

Here the most important features are `last_pymnt_amnt`, `last_fico_range_high`, `last_fico_range_low`



## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random Forest with 100 trees, each limited to a depth of 10
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Fit on the scaled training data
rf_model.fit(X_train_scaled, y_train)

# Predicted classes for the scaled test data
y_pred = rf_model.predict(X_test_scaled)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')


`note` : dropping the features last_pymnt_amnt, last_fico_range_high, last_fico_range_low will decrease the accuracy of the model to approximately 0.66 .

In [None]:
# import joblib
# # Save the model using joblib
# 
# model_filename = 'random_forest_model.joblib'
# joblib.dump(rf_model, model_filename)
# print(f'Model saved to {model_filename}')

In [None]:
import pandas as pd

# Importance score of every feature, as computed by the fitted forest
importances = rf_model.feature_importances_
feature_names = X_train.columns

# Table of feature names and importances, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the importances
fig, ax = plt.subplots(figsize=(12, 18))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importances in Random Forest Model')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

# Question 4 : Model explainability RF

While the magnitude of a feature's importance indicates how strongly the feature affects the target, it does not tell us in which direction the feature affects the target.

For this case we will use shap values to interpret the effect of the features on the target  
**warning** : shap values are computationally expensive and may take a long time to compute here we will use a small sample of the data to compute the shap values

In [None]:
import shap

# 50-row sample of the scaled test set. It is not used for the plots in this
# notebook: the next cell plots against the full X_test, so the SHAP values
# must have one row per test sample.
X_test_scaled_subset = X_test_scaled[:50]

explainer = shap.Explainer(rf_model)

# rf_model was fitted on the scaled features, so the SHAP values have to be
# computed from the scaled test matrix. Passing the unscaled X_test would send
# the samples through the trees with values on the wrong scale and produce
# meaningless attributions.
shap_values = explainer.shap_values(X_test_scaled)

# X_test holds the same rows in their original units; it supplies the feature
# names and the feature values used to colour the plot.
shap.summary_plot(shap_values, X_test)


The summary plot shows the feature importance of each feature in the model. 

In [None]:
# Select the SHAP values of class 0.
# Older SHAP releases return a list with one (samples, features) array per class;
# newer releases return a single array shaped (samples, features, classes), where
# `shap_values[0]` would pick the first sample instead of the first class.
if isinstance(shap_values, list):
    shap_values_class0 = shap_values[0]
elif getattr(shap_values, 'ndim', None) == 3:
    shap_values_class0 = shap_values[:, :, 0]
else:
    # Unknown layout: keep the original indexing
    shap_values_class0 = shap_values[0]

shap.summary_plot(shap_values_class0, X_test)

The SHAP summary plot provides a comprehensive view of how each feature influences the model's predictions for a given class; the plot above is for class 0.

#### Understanding the Plot :

- **Features on the Y-Axis:** Each row represents a feature from the dataset. Features are sorted by their importance in influencing the model's predictions.

- **SHAP Values on the X-Axis:** The horizontal axis shows the SHAP values, which indicate the impact of each feature on the model's prediction. Positive SHAP values push the prediction higher, while negative values pull it lower.

- **Color Coding:** The color of each point represents the value of the feature. Blue for low values and red for high values is used.

- **Distribution of Points:** Each point represents a single prediction instance. The spread of points along the x-axis indicates how much variation there is in the feature's impact across different predictions. A wider spread suggests more variability in feature impact.

# Question 5 : Additional Steps for Model Enhancement

If I had more time or resources, I would take the following steps to enhance the model, improve its performance, and increase robustness:

### 1. Hyperparameter Tuning
To optimize the performance of the `RandomForestClassifier`, I would perform more thorough hyperparameter tuning using the following methods:
- **Grid Search or Random Search:** To find the optimal set of parameters by exhaustively searching through specified parameter values.
- **Bayesian Optimization:** To efficiently explore the hyperparameter space, which might be more effective than grid search.

### 2. Model Ensembling
I would create an ensemble of different models to capture various patterns in the data:
- **Combine Multiple Models:** Build an ensemble using models like Gradient Boosting Machines and XGBoost.
- **Stacking Models:** Implement a stacking approach where predictions from multiple models are combined using a meta-model.

### 3. Advanced Feature Selection
To improve the model's efficiency and interpretability, I would employ advanced feature selection techniques:
- **Recursive Feature Elimination (RFE):** Systematically select the most important features by recursively eliminating less important ones.
- **L1 Regularization (Lasso):** Use Lasso regression for feature selection by penalizing less important features.

### 4. Cross-Validation
To ensure model performance is consistent , I would:
- **K-Fold Cross-Validation:** Implement K-Fold cross-validation to assess model performance across multiple subsets.

### 5. Explainability and Interpretability
To enhance model transparency and stakeholder trust:
- **More Explainability Techniques:** Explore other explainability methods such as LIME (Local Interpretable Model-agnostic Explanations)

### 6. Robustness Checks
To ensure the robustness of the model:
- **Outlier Detection:** Perform outlier detection and removal to improve the model's performance.
- **Adversarial Validation:** Ensure the training and test data are from the same distribution, avoiding data leakage.

### 7. Model Deployment
To ensure the model is production-ready:
- **Deploy on Cloud:** Deploy the model as a microservice using cloud platforms like AWS, Google Cloud, or Azure.
- **Monitoring and Maintenance:** Implement monitoring for model performance in production, enabling detection and mitigation of model drift or data shift over time.

### Additional Datasets to Enrich the Original Dataset

To improve the model's predictive accuracy and provide more context, I would consider integrating the following datasets:


### 1. Economic Indicators
- **Macroeconomic Data:** Include macroeconomic indicators like unemployment rates, inflation, interest rates, and GDP growth from sources such as the Federal Reserve or World Bank.

### 2. Behavioral Data
- **Transaction Data:** If available, integrate transaction-level data from borrowers' bank accounts or credit cards to gain insights into spending patterns.

### 3. Web Scraping
- **Additional Sources:** I’d also consider scraping the web for additional relevant data, for example adding more years of data to the dataset.
I have checked and the Lending Club does not allow web scraping after 2020.


# Question 6 : Scalability Discussion

### a. Number of Loans/Rows in the Training Data

The scalability of my solution largely depends on the amount of data it needs to process during training. As the number of loans/rows in the dataset increases, several factors come into play:

1. **Memory Usage:** 
   - The `RandomForestClassifier` is an ensemble learning method that constructs multiple decision trees during training. The memory required increases with the size of the training data and the number of trees. To make full use of the provided dataset, we would need more memory, potentially requiring more RAM or distributed computing resources.

2. **Training Time:**
   - Training time increases with the number of rows in the dataset. While `RandomForestClassifier` is relatively efficient, large datasets can still result in lengthy training times. To address this, I could:
     - **Use Parallel Processing:** Leverage multi-core CPUs to train trees in parallel, which can significantly reduce training time.
     - **Reduce Data Dimensionality:** Employ feature selection or dimensionality reduction techniques to minimize the number of features, thereby speeding up the training process.
     - **Use Cloud Resources:** Utilize cloud platforms with scalable resources (e.g., AWS EC2 instances with large memory or distributed computing like Spark) to handle larger datasets efficiently.


### b. Number of Predictions in Production

The scalability of the inference endpoint in production depends on the number of predictions it needs to make and the speed at which these predictions are required:

1. **Inference Latency:**
   - **Low Latency Requirements:** If the application requires real-time or near-real-time predictions, I need to ensure the model can make predictions quickly. `RandomForestClassifier` is generally fast at inference due to its tree-based structure, but latency could become an issue with an extremely large number of requests. To mitigate this:
     - **Optimized Deployment:** Deploy the model on a powerful server or use specialized hardware (e.g., GPUs or TPUs) to speed up inference.
     - **Model Simplification:** Consider simplifying the model by reducing the number of trees or using a different algorithm that offers faster inference times.

2. **Batch vs. Real-Time Predictions:**
   - **Batch Processing:** For scenarios where predictions can be made in bulk (e.g., processing thousands of loan applications overnight), I could batch the requests and process them in parallel, optimizing resource usage.
   - **Asynchronous Processing:** Implement asynchronous processing where predictions are queued and processed as resources become available, which is useful in managing bursts of prediction requests.

3. **Containerization and Microservices:**
   - **Docker:** Packaging the model in a Docker container ensures that it can be easily scaled across different environments. By deploying the container on a Kubernetes cluster, I can dynamically scale the number of replicas based on the load, ensuring high availability and scalability.
   - **Microservices Architecture:** Decompose the prediction service into microservices that can independently scale, allowing different components (e.g., feature preprocessing, model inference, post-processing) to scale according to demand.

4. **Monitoring and Auto-Scaling:**
   - Implement monitoring tools to track the performance of the inference endpoint in production. Using tools like Prometheus and Grafana, I can set up auto-scaling rules that trigger the deployment of additional resources when the load increases, ensuring the system remains responsive.



# Question 7

## PCA for Dimensionality Reduction

Let's use PCA to visualize the data in a 2D or 3D space.

In [None]:
# Scale the full resampled feature matrix with the scaler fitted earlier on the training split
X_uns = scaler.transform(X_resampled)
# (rows, columns) of the scaled matrix
X_uns.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# PCA projection onto 3 principal components
pca = PCA(n_components=3)
X_reduced_pca = pca.fit_transform(X_uns)

# Print the fraction of the total variance explained by each component
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

In [None]:
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd

# Labels with a fresh 0..n-1 index so they line up row-by-row with the PCA output
y_aligned = y_resampled.reset_index(drop=True)

# PCA coordinates plus the label of every row
df_pca = pd.DataFrame(X_reduced_pca, columns=['PC1', 'PC2', 'PC3'])
df_pca['Loan Status'] = y_aligned

# Interactive 3D scatter plot coloured by loan status
fig = px.scatter_3d(
    df_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Loan Status',
    title='3D Scatter Plot of PCA (Principal Component Analysis)',
    labels={'Loan Status': 'Loan Status'},
)

# Marker size, axis titles, figure size and margins
fig.update_traces(marker={'size': 4}, selector={'mode': 'markers'})
axis_titles = {'xaxis_title': 'PC1', 'yaxis_title': 'PC2', 'zaxis_title': 'PC3'}
fig.update_layout(
    scene=axis_titles,
    width=800,
    height=600,
    margin={'l': 0, 'r': 0, 'b': 0, 't': 40},
)

# Show the interactive plot
fig.show()



Based on the figure, we can see that the classes are well separated in the 3D space, indicating that the features are highly representative of the classes. We can hope that clustering algorithms will perform well on this data.

## Clustering 



In [None]:
from sklearn.cluster import KMeans

# Fit a 2-cluster KMeans on the scaled data; labels_ holds the cluster assigned to each row
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_uns)
clusters = kmeans.labels_

In [None]:
# Display the cluster unique values
print(f'Cluster labels: {set(clusters)}')

We use KMeans to cluster the data into 2 clusters and visualize the clusters in a 3D space.

In [None]:
# Attach the KMeans assignment to the PCA frame
df_pca['Cluster'] = clusters

# Combined "<loan status>_<cluster>" key used to colour the points
df_pca['LoanCluster'] = df_pca['Loan Status'].astype(str).str.cat(
    df_pca['Cluster'].astype(str), sep='_'
)

# One colour per (loan status, cluster) combination
color_map = {
    '0_0': 'blue',    # Loan Status 0, Cluster 0
    '0_1': 'green',   # Loan Status 0, Cluster 1
    '1_0': 'red',     # Loan Status 1, Cluster 0
    '1_1': 'purple',  # Loan Status 1, Cluster 1
}

# Interactive 3D scatter plot coloured by the combined key
fig = px.scatter_3d(
    df_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='LoanCluster',
    title='KMeans Clustering with Loan Status',
    color_discrete_map=color_map,
    labels={'color': 'LoanCluster'},
)

# Marker size, axis titles, figure size and margins
fig.update_traces(marker={'size': 5}, selector={'mode': 'markers'})
axis_titles = {'xaxis_title': 'PC1', 'yaxis_title': 'PC2', 'zaxis_title': 'PC3'}
fig.update_layout(
    scene=axis_titles,
    width=800,
    height=600,
    margin={'l': 0, 'r': 0, 'b': 0, 't': 40},
)

# Show the interactive plot
fig.show()



In [None]:
# for each cluster, calculate the percentage of Fully Paid and Charged Off loans
cluster_status = df_pca.groupby(['Cluster', 'Loan Status']).size().unstack()
cluster_status['Total'] = cluster_status.sum(axis=1)
cluster_status['Fully Paid (%)'] = cluster_status[0] / cluster_status['Total']
cluster_status['Charged Off (%)'] = cluster_status[1] / cluster_status['Total']
cluster_status

The KMeans clustering here doesn't provide meaningful separation, with both clusters showing nearly identical distributions of fully paid and charged-off loans. This suggests the data forms a single, indistinct cloud, for which a simple algorithm such as KMeans is insufficient. If I had more resources, hierarchical clustering could be a better choice.

### Node Community Detection

If the dataset can be represented as a graph (e.g., users connected by similar loan characteristics), we can apply community detection algorithms to identify groups of similar nodes and see how the model's predictions vary across these communities. The issue is to find an edge relationship between the nodes in the graph that have a meaning.