In [1]:
import pandas as pd
import numpy as np
import random

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# Number of records
num_records = 100

# Generate synthetic loan application data
data = {
    "Loan_ID": [f"L{str(i).zfill(4)}" for i in range(1, num_records + 1)],
    "Applicant_Age": np.random.randint(21, 70, num_records),
    "Applicant_Income": np.random.randint(30000, 200000, num_records),
    "Loan_Amount": np.random.randint(5000, 500000, num_records),
    "Credit_Score": np.random.randint(300, 850, num_records),
    "Loan_Term": np.random.choice([12, 24, 36, 48, 60, 72], num_records),
    "Interest_Rate": np.round(np.random.uniform(2.5, 15.0, num_records), 2),
    "Loan_Approval": np.random.choice(["Approved", "Denied"], num_records, p=[0.7, 0.3]),
    "Default_Flag": np.random.choice([0, 1], num_records, p=[0.85, 0.15]),
    "Loan_Scoring_Model_Output": np.round(np.random.uniform(0, 1, num_records), 2),
    "Debt_to_Income_Ratio": np.round(np.random.uniform(10, 50, num_records), 2),
    "Non_Performing_Loan_Flag": np.random.choice([0, 1], num_records, p=[0.9, 0.1]),
    "Delinquency_Rate": np.round(np.random.uniform(0, 1, num_records), 2),
    "Portfolio_Risk_Score": np.round(np.random.uniform(0, 100, num_records), 2),
    "High_Risk_Borrower": np.random.choice([0, 1], num_records, p=[0.8, 0.2]),
    "Macroeconomic_Risk_Index": np.round(np.random.uniform(0, 100, num_records), 2),
    "Approval_Rate_by_Demographics": np.round(np.random.uniform(50, 95, num_records), 2),
    "Interest_Rate_Disparity": np.round(np.random.uniform(0, 5, num_records), 2),
    "Loan_Denial_Reasons": np.random.choice(["Low Credit Score", "High DTI", "Unstable Income", "Insufficient Collateral", "Other"], num_records),
    "Disparate_Impact_Ratio": np.round(np.random.uniform(0.5, 1.5, num_records), 2),
    "Fair_Lending_Bias_Score": np.round(np.random.uniform(0, 1, num_records), 2),
    "False_Positive_Rate": np.round(np.random.uniform(0, 0.2, num_records), 2),
    "False_Negative_Rate": np.round(np.random.uniform(0, 0.2, num_records), 2),
    "Regulatory_Compliance_Violation": np.random.choice([0, 1], num_records, p=[0.95, 0.05]),
    "Model_Drift_Approval_Patterns": np.round(np.random.uniform(0, 10, num_records), 2),
    "Gender": np.random.choice(["Male", "Female"], num_records),
    "Race": np.random.choice(["White", "Black", "Asian", "Hispanic", "Other"], num_records)
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in specified path
df.to_csv(r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\synthetic_loan_data.csv", index=False)

# Display sample data
print(df.head())


  Loan_ID  Applicant_Age  Applicant_Income  Loan_Amount  Credit_Score  \
0   L0001             59            163767       274536           719   
1   L0002             49            154375       322824           721   
2   L0003             35            166330       262426           403   
3   L0004             63             69504       182789           553   
4   L0005             28            194231       223164           526   

   Loan_Term  Interest_Rate Loan_Approval  Default_Flag  \
0         48          10.32      Approved             0   
1         48          12.76      Approved             0   
2         48          10.64      Approved             0   
3         60           5.08      Approved             0   
4         48           5.92      Approved             0   

   Loan_Scoring_Model_Output  ...  Interest_Rate_Disparity  \
0                       0.23  ...                     3.02   
1                       0.37  ...                     1.59   
2                   

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import os

# Load the synthetic dataset
data_path = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\synthetic_loan_data.csv"
df = pd.read_csv(data_path)

# Encode categorical variables
df = pd.get_dummies(df, columns=["Gender", "Race", "Loan_Approval", "Loan_Denial_Reasons"], drop_first=True)

# Define feature columns and target variables
target_loan_risk = "Default_Flag"
target_fair_lending = "Regulatory_Compliance_Violation"
features = [col for col in df.columns if col not in ["Loan_ID", target_loan_risk, target_fair_lending]]

# Split data into training and testing sets for loan risk model
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(df[features], df[target_loan_risk], test_size=0.2, random_state=42)

# Split data into training and testing sets for fair lending compliance model
X_train_fl, X_test_fl, y_train_fl, y_test_fl = train_test_split(df[features], df[target_fair_lending], test_size=0.2, random_state=42)

# Initialize and train XGBoost classifier for loan risk
model_loan_risk = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model_loan_risk.fit(X_train_lr, y_train_lr)

# Initialize and train XGBoost classifier for fair lending compliance
model_fair_lending = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model_fair_lending.fit(X_train_fl, y_train_fl)

# Make predictions for loan risk
y_pred_lr = model_loan_risk.predict(X_test_lr)
y_pred_prob_lr = model_loan_risk.predict_proba(X_test_lr)[:, 1]

# Make predictions for fair lending compliance
y_pred_fl = model_fair_lending.predict(X_test_fl)
y_pred_prob_fl = model_fair_lending.predict_proba(X_test_fl)[:, 1]

# Evaluate the models
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test_lr, y_pred_prob_lr)
print(f"Loan Risk Model Accuracy: {accuracy_lr:.4f}")
print(f"Loan Risk ROC AUC Score: {roc_auc_lr:.4f}")

accuracy_fl = accuracy_score(y_test_fl, y_pred_fl)
roc_auc_fl = roc_auc_score(y_test_fl, y_pred_prob_fl)
print(f"Fair Lending Compliance Model Accuracy: {accuracy_fl:.4f}")
print(f"Fair Lending Compliance ROC AUC Score: {roc_auc_fl:.4f}")

# Save predictions to CSV for Power BI
predictions_lr = X_test_lr.copy()
predictions_lr["Predicted_Default"] = y_pred_lr
predictions_lr["Default_Probability"] = y_pred_prob_lr

predictions_fl = X_test_fl.copy()
predictions_fl["Predicted_Compliance_Violation"] = y_pred_fl
predictions_fl["Compliance_Violation_Probability"] = y_pred_prob_fl

output_path_lr = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\loan_risk_predictions.csv"
output_path_fl = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\fair_lending_predictions.csv"

predictions_lr.to_csv(output_path_lr, index=False)
predictions_fl.to_csv(output_path_fl, index=False)

print(f"Loan Risk Predictions saved to: {output_path_lr}")
print(f"Fair Lending Predictions saved to: {output_path_fl}")


Loan Risk Model Accuracy: 0.8500
Loan Risk ROC AUC Score: 0.5490


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import os

# Load the synthetic dataset
data_path = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\synthetic_loan_data.csv"
df = pd.read_csv(data_path)

# Encode categorical variables
df = pd.get_dummies(df, columns=["Gender", "Race", "Loan_Approval", "Loan_Denial_Reasons"], drop_first=True)

# Define feature columns and target variables
target_loan_risk = "Default_Flag"
target_fair_lending = "Regulatory_Compliance_Violation"
features = [col for col in df.columns if col not in ["Loan_ID", target_loan_risk, target_fair_lending]]

# Split data into training and testing sets for loan risk model
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(df[features], df[target_loan_risk], test_size=0.2, random_state=42)

# Split data into training and testing sets for fair lending compliance model
X_train_fl, X_test_fl, y_train_fl, y_test_fl = train_test_split(df[features], df[target_fair_lending], test_size=0.2, random_state=42)

# Check if target variables have more than one class to avoid ROC AUC error
if len(np.unique(y_test_fl)) > 1:
    compute_auc_fl = True
else:
    compute_auc_fl = False

# Initialize and train XGBoost classifier for loan risk
model_loan_risk = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model_loan_risk.fit(X_train_lr, y_train_lr)

# Initialize and train XGBoost classifier for fair lending compliance
model_fair_lending = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model_fair_lending.fit(X_train_fl, y_train_fl)

# Make predictions for loan risk
y_pred_lr = model_loan_risk.predict(X_test_lr)
y_pred_prob_lr = model_loan_risk.predict_proba(X_test_lr)[:, 1]

# Make predictions for fair lending compliance
y_pred_fl = model_fair_lending.predict(X_test_fl)
y_pred_prob_fl = model_fair_lending.predict_proba(X_test_fl)[:, 1] if compute_auc_fl else np.zeros_like(y_test_fl)

# Evaluate the models
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test_lr, y_pred_prob_lr)
print(f"Loan Risk Model Accuracy: {accuracy_lr:.4f}")
print(f"Loan Risk ROC AUC Score: {roc_auc_lr:.4f}")

accuracy_fl = accuracy_score(y_test_fl, y_pred_fl)
if compute_auc_fl:
    roc_auc_fl = roc_auc_score(y_test_fl, y_pred_prob_fl)
    print(f"Fair Lending Compliance ROC AUC Score: {roc_auc_fl:.4f}")
else:
    roc_auc_fl = "N/A"
    print("Fair Lending Compliance Model ROC AUC Score: Not computed due to single class in y_test_fl.")

print(f"Fair Lending Compliance Model Accuracy: {accuracy_fl:.4f}")

# Save predictions to CSV for Power BI
predictions_lr = X_test_lr.copy()
predictions_lr["Predicted_Default"] = y_pred_lr
predictions_lr["Default_Probability"] = y_pred_prob_lr

predictions_fl = X_test_fl.copy()
predictions_fl["Predicted_Compliance_Violation"] = y_pred_fl
predictions_fl["Compliance_Violation_Probability"] = y_pred_prob_fl

output_path_lr = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\loan_risk_predictions.csv"
output_path_fl = r"C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\fair_lending_predictions.csv"

predictions_lr.to_csv(output_path_lr, index=False)
predictions_fl.to_csv(output_path_fl, index=False)

print(f"Loan Risk Predictions saved to: {output_path_lr}")
print(f"Fair Lending Predictions saved to: {output_path_fl}")

Loan Risk Model Accuracy: 0.8500
Loan Risk ROC AUC Score: 0.5490
Fair Lending Compliance Model ROC AUC Score: Not computed due to single class in y_test_fl.
Fair Lending Compliance Model Accuracy: 1.0000
Loan Risk Predictions saved to: C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\loan_risk_predictions.csv
Fair Lending Predictions saved to: C:\Users\Savvas\Desktop\Solutions\Loan Risk and Fair Lending\fair_lending_predictions.csv
