<a href="https://colab.research.google.com/github/Rdughan/A-Safer-Campus/blob/main/Predict_whether_a_student_entrepreneur_will_default_on_a_small_loan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the core libraries: NumPy for numerical work and pandas for data manipulation
import numpy as np
import pandas as pd

# Reproducible synthetic data: fix the global NumPy seed, then draw n = 1,000 borrowers.
np.random.seed(42)
n = 1000

# One row per simulated student entrepreneur.
# The columns are drawn in the order listed below; reordering them changes the generated
# values even with the same seed, because they all consume the same global random stream.
data = pd.DataFrame({
    "monthly_income": np.random.uniform(low=300, high=2500, size=n),
    "income_stability": np.random.uniform(low=0.2, high=0.9, size=n),
    "savings_habit": np.random.binomial(1, 0.4, size=n),  # 1 with probability 0.4
    "business_type_risk": np.random.choice([0, 1, 2], size=n, p=[0.4, 0.4, 0.2]),
    "prior_repayment_score": np.random.uniform(low=0, high=1, size=n),
    "loan_to_income_ratio": np.random.uniform(low=0.2, high=2.5, size=n),
    "peer_guarantor": np.random.binomial(1, 0.5, size=n),  # 1 with probability 0.5
    "time_in_school_remaining": np.random.randint(low=1, high=49, size=n),  # integers 1..48
})


In [None]:
# Data-generating process for the "clean" default outcome.
# Each feature gets a weight on the log-odds scale: a positive weight raises the probability
# of default, a negative weight lowers it. These weights define how the synthetic outcome is
# simulated; they are not settings of the model that is fitted later.
BASELINE_LOGIT = -2.5
DEFAULT_RISK_WEIGHTS = {
    "monthly_income": -0.001,
    "income_stability": -1.8,
    "savings_habit": -1.2,
    "business_type_risk": 0.9,
    "prior_repayment_score": -2.5,
    "loan_to_income_ratio": 1.4,
    "peer_guarantor": -1.0,
    "time_in_school_remaining": -0.015,
}

# Accumulate baseline + weight * feature, one feature at a time in the order listed above.
logit = sum(
    (weight * data[feature] for feature, weight in DEFAULT_RISK_WEIGHTS.items()),
    BASELINE_LOGIT,
)

# The sigmoid maps the log-odds to a probability in (0, 1).
prob_default = 1.0 / (1.0 + np.exp(-logit))

# One Bernoulli draw per borrower turns the probability into a 0/1 default outcome.
data["default"] = np.random.binomial(n=1, p=prob_default)


In [None]:
# 0 = Prestigious University, 1 = Community/Regional University
data["university_type"] = np.random.binomial(1, 0.3, n)

# Biased scenario: the SAME default risk as the clean scenario plus a penalty for Group 1.
# The clean `logit` is reused so that the two scenarios differ only by the bias term.
# Re-typing a shortened formula here (without savings_habit, business_type_risk,
# loan_to_income_ratio, peer_guarantor and time_in_school_remaining) would change the effect
# of those features as well and confound the clean-vs-biased coefficient comparison.
UNIVERSITY_BIAS_PENALTY = 2.0  # added log-odds of default for university_type == 1
logit_biased = logit + UNIVERSITY_BIAS_PENALTY * data["university_type"]  # <--- THIS IS THE INTENTIONAL BIAS

# Re-generate the outcomes based on this biased logic
prob_default_biased = 1 / (1 + np.exp(-logit_biased))  # sigmoid: log-odds -> probability
data["default_biased"] = np.random.binomial(1, prob_default_biased)

In [None]:
# Fit a logistic regression on the biased outcome (`default_biased`) and report precision,
# recall and F1 on a 25% hold-out sample. Note that the fitted coefficients describe
# associations in the simulated data; logistic regression does not establish causality.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Every column except the target is used as a predictor.
# NOTE(review): this still includes `default`, the outcome simulated for the clean scenario,
# which is an outcome label rather than a borrower attribute. The coefficient-comparison cell
# further down derives the biased model's feature names from this same column set, so the two
# need to be changed together.
X = data.drop(columns=["default_biased"])
y = data["default_biased"]

# 75% train / 25% test; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# class_weight="balanced" compensates for defaults being much rarer than non-defaults.
model_biased = LogisticRegression(max_iter=1000, class_weight="balanced").fit(X_train, y_train)

y_pred = model_biased.predict(X_test)  # predicted 0/1 labels for the hold-out sample
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.78      0.87       245
           1       0.05      0.60      0.10         5

    accuracy                           0.78       250
   macro avg       0.52      0.69      0.49       250
weighted avg       0.97      0.78      0.86       250



In [None]:
# Fit the "clean" model: logistic regression on the unbiased `default` outcome, evaluated with
# precision, recall and F1 on a 25% hold-out sample. The coefficients describe associations in
# the simulated data; logistic regression does not establish causality.
# The sklearn imports are repeated so that this cell also runs on its own.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Explicit predictor list instead of "everything except the target".
# data.drop("default", axis=1) also kept `default_biased` (the outcome simulated for the biased
# scenario) as a predictor, and on a re-run it would pick up any derived column that later
# cells add to `data`. `university_type` stays in: it has no effect on the clean outcome, so
# its coefficient here is the baseline for the comparison with the biased model.
FEATURE_COLUMNS = [
    "monthly_income",
    "income_stability",
    "savings_habit",
    "business_type_risk",
    "prior_repayment_score",
    "loan_to_income_ratio",
    "peer_guarantor",
    "time_in_school_remaining",
    "university_type",
]
X = data[FEATURE_COLUMNS]
y = data["default"]  # target: simulated clean default outcome (0/1)

# 75% train / 25% test; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# class_weight='balanced' compensates for defaults being much rarer than non-defaults.
# NOTE(review): the saved output of this cell shows a ConvergenceWarning at max_iter=1000.
# monthly_income is on a far larger scale than the other features, so scaling the inputs (or a
# higher max_iter) is probably needed for a fully converged fit - confirm after re-running.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)  # predicted 0/1 labels for the hold-out sample
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.80      0.89       239
           1       0.16      0.82      0.27        11

    accuracy                           0.80       250
   macro avg       0.58      0.81      0.58       250
weighted avg       0.95      0.80      0.86       250



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Coefficients of the clean model, one row per feature in X.
# A coefficient is the change in the log-odds of default per unit of the feature, so its size
# depends on the feature's scale as well as on its importance.
coef_df = pd.DataFrame({"Feature": X.columns})
coef_df["Coefficient"] = model.coef_[0]
print(coef_df)

                    Feature  Coefficient
0            monthly_income    -0.000767
1          income_stability    -1.961049
2             savings_habit    -2.020779
3        business_type_risk     0.977382
4     prior_repayment_score    -2.627633
5      loan_to_income_ratio     0.530588
6            peer_guarantor    -1.177729
7  time_in_school_remaining     0.004894


In [None]:
# Side-by-side coefficient comparison of the clean model and the biased model.

# Take each model's feature names from the fitted estimator itself: scikit-learn (>= 1.0)
# stores them in `feature_names_in_` when the model is fitted on a DataFrame. Re-deriving them
# from `X` or `data` is fragile, because `X` is reassigned by every training cell and `data`
# gains columns in later cells, so the derived list can stop matching the length and order
# of `coef_`.
clean_model_features = pd.Index(model.feature_names_in_)
clean_model_coefficients = model.coef_[0]

biased_model_X_cols = pd.Index(model_biased.feature_names_in_)
biased_model_coefficients_series = pd.Series(model_biased.coef_[0], index=biased_model_X_cols)

# Align the biased model's coefficients to the clean model's feature order.
# - A feature used only by the clean model shows NaN in the "Biased Model" column.
# - A feature used only by the biased model is left out of the table.
aligned_biased_model_coefficients = biased_model_coefficients_series.reindex(clean_model_features)

comparison = pd.DataFrame({
    "Feature": clean_model_features,
    "Clean Model": clean_model_coefficients,
    "Biased Model": aligned_biased_model_coefficients.values,  # plain array: ignore the index
})
print(comparison)

                    Feature  Clean Model  Biased Model
0            monthly_income    -0.000792     -0.005272
1          income_stability    -2.009511     -1.306058
2             savings_habit    -2.119550     -0.006720
3        business_type_risk     0.972681      0.401604
4     prior_repayment_score    -2.741376     -0.432101
5      loan_to_income_ratio     0.542531      0.655558
6            peer_guarantor    -1.170389     -0.089351
7  time_in_school_remaining     0.007203      0.034060
8           university_type     0.472424      2.108455
9            default_biased    -1.182459           NaN


In [None]:
# Score every borrower with the clean model.
# predict_proba returns one column per class; column 1 is the probability of default (class 1).
class_probabilities = model.predict_proba(X)
data["predicted_default_prob"] = class_probabilities[:, 1]


In [None]:
# Decision rule based on the predicted probability of default.
# Note: a later cell redefines `decision` with stricter cut-offs (0.10 / 0.30).
def decision(prob):
    """Map a default probability to "approve", "review" or "reject".

    prob <= 0.15 -> "approve"; 0.15 < prob <= 0.35 -> "review"; anything else -> "reject".
    """
    # Cut-offs are checked from the most lenient outcome to the strictest.
    for cutoff, outcome in ((0.15, "approve"), (0.35, "review")):
        if prob <= cutoff:
            return outcome
    return "reject"

# Label every borrower with the decision implied by their predicted default probability.
data["decision"] = data["predicted_default_prob"].map(decision)


In [None]:
# Decision rule based on the predicted probability of default.
# This definition replaces the earlier `decision` (0.15 / 0.35) and is the one in effect for
# the rest of the notebook. The cut-offs are parameters so that other policies can be tried
# without defining yet another copy of the function.
def decision(prob, approve_threshold=0.10, review_threshold=0.30):
    """Map a default probability to a lending decision.

    Parameters
    ----------
    prob : float
        Predicted probability of default.
    approve_threshold : float, default 0.10
        Highest probability that is still approved automatically.
    review_threshold : float, default 0.30
        Highest probability that is still sent to manual review.

    Returns
    -------
    str
        "approve" if prob <= approve_threshold, "review" if prob <= review_threshold,
        otherwise "reject" (this includes a NaN probability).

    Raises
    ------
    ValueError
        If approve_threshold is greater than review_threshold.
    """
    if approve_threshold > review_threshold:
        raise ValueError(
            "approve_threshold must not exceed review_threshold "
            f"(got {approve_threshold} > {review_threshold})"
        )
    if prob <= approve_threshold:
        return "approve"
    if prob <= review_threshold:
        return "review"
    return "reject"

# Re-label every borrower using the `decision` function defined in this cell.
data["decision"] = data["predicted_default_prob"].map(decision)


Policy Optimization Via Expected Value


In [None]:
# Candidate cut-offs for the policy grid search.
# np.arange excludes its stop value, so each stop is placed just past the last wanted point.
# The values are floats built from a fractional step and may differ from the round decimals
# in the last digits.
import numpy as np

approve_thresholds = np.arange(0.05, 0.21, 0.02)   # 0.05, 0.07, ... up to 0.19 (0.20 is not on this 0.02 grid)
review_thresholds  = np.arange(0.20, 0.51, 0.05)   # 0.20, 0.25, ... up to 0.50


In [None]:
# Policy constraint: the approve threshold must be strictly below the review threshold.
# The names `approve` and `review` are not defined anywhere, so the constraint is evaluated on
# the threshold grids instead. Rows follow approve_thresholds, columns follow
# review_thresholds; True marks a valid (approve, review) pair.
valid_threshold_pairs = approve_thresholds[:, None] < review_thresholds[None, :]
valid_threshold_pairs.sum()  # number of valid policies in the grid


In [None]:
# Grid search over (approve, review) threshold pairs. Each policy is scored by the expected
# value of the loans it does not reject.

# Payoff assumptions per loan. They are defined here because this cell runs before the
# expected-value cell further down, so the `expected_value` column does not exist yet on a
# top-to-bottom run. Keep these values in sync with that cell.
LOSS = -700     # default loss
GAIN = 200      # successful repayment gain

# Expected value of every loan, computed once for the whole search.
default_probs = data["predicted_default_prob"]
loan_ev = (1 - default_probs) * GAIN + default_probs * LOSS

results = []

for a in approve_thresholds:
    for r in review_thresholds:
        if a >= r:
            continue  # a policy needs approve threshold < review threshold

        # "approve" (prob <= a) and "review" (a < prob <= r) loans are both kept in the
        # portfolio, so with a < r a loan is kept exactly when prob <= r.
        # NOTE(review): as a consequence the approve threshold does not influence any of the
        # metrics below - policies that share a review threshold score identically.
        kept = default_probs <= r
        kept_ev = loan_ev[kept]

        results.append({
            "approve_th": round(a, 2),
            "review_th": round(r, 2),
            "total_ev": kept_ev.sum(),
            "avg_ev": kept_ev.mean(),   # NaN when the policy keeps no loans
            "loans": int(kept.sum())
        })


In [None]:
import pandas as pd

# Rank the candidate policies: highest total expected value first, ties broken by the larger
# number of loans.
results_df = pd.DataFrame(results).sort_values(
    by=["total_ev", "loans"], ascending=False
)

# Show the ten best policies.
results_df.head(10)


In [None]:
# Expected value of a single loan under fixed payoff assumptions.

LOSS = -700     # payoff when the borrower defaults
GAIN = 200      # payoff when the loan is repaid

def expected_value(prob):
    """Return the expected payoff of a loan whose probability of default is `prob`.

    The payoff is GAIN with probability (1 - prob) and LOSS with probability prob.
    """
    repaid_part = (1 - prob) * GAIN
    default_part = prob * LOSS
    return repaid_part + default_part

# Expected payoff of every loan, from its predicted default probability.
data["expected_value"] = data["predicted_default_prob"].map(expected_value)


In [None]:
# Portfolio summary: every loan that was not rejected ("approve" and "review" alike).

not_rejected = data["decision"] != "reject"
portfolio = data.loc[not_rejected]

portfolio_ev = portfolio["expected_value"]
total_ev = portfolio_ev.sum()      # expected value of the whole portfolio
avg_ev = portfolio_ev.mean()       # expected value per loan
total_loans = len(portfolio)       # number of loans kept

total_ev, avg_ev, total_loans


(np.float64(60755.09449302764), np.float64(91.08709819044624), 667)

In [None]:
# Stress test: a recession in which every monthly income falls by 20%.
# The decisions are not recomputed, so the portfolio is the same set of loans as before and
# only their default probabilities and expected values change.
# NOTE(review): loan_to_income_ratio is left untouched although income changes - confirm that
# this is intended.

shock_data = data.copy()
shock_data["monthly_income"] = shock_data["monthly_income"] * 0.8

# Re-score the shocked borrowers with the clean model, using the columns it was trained on.
shock_probs = model.predict_proba(shock_data[X.columns])[:, 1]
shock_data["shock_prob"] = shock_probs
shock_data["shock_ev"] = shock_data["shock_prob"].map(expected_value)

# Total expected value of the non-rejected loans under the shock.
shock_data.loc[shock_data["decision"] != "reject", "shock_ev"].sum()


np.float64(46726.3248189415)

In [None]:
# Expected value lost in the recession scenario: normal-times total minus shocked total,
# both taken over the loans that were not rejected.

kept_under_shock = shock_data["decision"] != "reject"
shock_total_ev = shock_data.loc[kept_under_shock, "shock_ev"].sum()

difference_ev = total_ev - shock_total_ev
print(f"Difference in total expected value: {difference_ev}")

Difference in total expected value: 14028.769674086136
