In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Load the engineered dataset
df= pd.read_csv("localfilepath/featured_data.csv")

In [5]:
# Inspect the dataset
df.info

<bound method DataFrame.info of                              customerid   birthdate bank_account_type  \
0      8a858fd3552c75620155488cbd3f650d  1981-09-04             Other   
1      8a858f3455a0d8070155c5d69e9600d4  1982-03-29             Other   
2      8a858e885c87dee5015c884e7dc03958  1992-07-06           Savings   
3      8a858fd85b685607015b70cf46b509b2  1976-12-10           Savings   
4      8a858ff5562dc3b601562e96135108e3  1995-06-29             Other   
...                                 ...         ...               ...   
27341  8a858ff455a09f5a0155ba378c9b10bc  1974-06-04             Other   
27342  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27343  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27344  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27345  8a858faf5679a838015688de3028143d  1989-04-10             Other   

       longitude_gps  latitude_gps bank_name_clients  \
0           4.562290      8.480116 

In [None]:
# Logistic Regression
# Target: good_bad_flag_num
# Features: loanamount, loan_duration, repayment_delay, employment_status_clients

# Encoding Categiorical Varaiable 
df["employment_status_clients_encoded"] = df["employment_status_clients"].astype("category").cat.codes


In [7]:
#Inscpect data 
df.info

<bound method DataFrame.info of                              customerid   birthdate bank_account_type  \
0      8a858fd3552c75620155488cbd3f650d  1981-09-04             Other   
1      8a858f3455a0d8070155c5d69e9600d4  1982-03-29             Other   
2      8a858e885c87dee5015c884e7dc03958  1992-07-06           Savings   
3      8a858fd85b685607015b70cf46b509b2  1976-12-10           Savings   
4      8a858ff5562dc3b601562e96135108e3  1995-06-29             Other   
...                                 ...         ...               ...   
27341  8a858ff455a09f5a0155ba378c9b10bc  1974-06-04             Other   
27342  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27343  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27344  8a858899538ddb8e0153a2b555421fc5  1987-06-28           Savings   
27345  8a858faf5679a838015688de3028143d  1989-04-10             Other   

       longitude_gps  latitude_gps bank_name_clients  \
0           4.562290      8.480116 

In [8]:
# confirm the assign numeric for employment_status_clients 
print(dict(enumerate(df["employment_status_clients"].astype("category").cat.categories)))


{0: 'Contract', 1: 'Permanent', 2: 'Retired', 3: 'Self-Employed', 4: 'Student', 5: 'Unemployed'}


In [None]:
# Define features and target variable
X = df[["loanamount", "loan_duration", "repayment_delay", "employment_status_clients_encoded"]]
y = df["good_bad_flag_num"]


In [11]:
# Split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [14]:
# Make predictions 
y_pred = model.predict(X_test)

In [16]:
# Evaluate Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print("\nClassification Report:", classification_report(y_test, y_pred))

Accuracy: 0.8148080438756855
precision: 0.8175638434686754
Recall: 0.9955257270693513
F1 score: 0.8978109553112075

Classification Report:               precision    recall  f1-score   support

           0       0.26      0.01      0.01      1000
           1       0.82      1.00      0.90      4470

    accuracy                           0.81      5470
   macro avg       0.54      0.50      0.46      5470
weighted avg       0.72      0.81      0.74      5470



"""
## Model Results & Notes

**Linear Regression Model:**
Works really well - gets 99.7% accuracy predicting how much someone will owe. 
The relationship between loan amount, charges, and duration is pretty straightforward and the model nailed it.

**Logistic Regression Model:**  
This one's broken. It basically calls every loan "good" because there are way more good loans 
in the data (4,470 good vs 1,000 bad). So it misses almost all the risky loans, which is 
exactly what you don't want if you're actually lending money.

**What needs fixing:**
The logistic model needs class balancing, better features (like payment history or credit scores), 
and probably different algorithms before it's useful. The linear model is solid and ready to use.

**Bottom line:** 
One model works great for predicting loan amounts, the other needs serious work before you'd 
trust it with actual loan decisions. Real-world deployment would need more data and better 
features to catch risky borrowers.
"""


In [20]:
# Class weight balancing
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)


In [21]:
# Make predictions 
y_pred = log_reg.predict(X_test)

In [22]:
# Evaluate Performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print("\nClassification Report:", classification_report(y_test, y_pred))

Accuracy: 0.5468007312614259
precision: 0.8474694589877836
Recall: 0.5431767337807606
F1 score: 0.6620313565098841

Classification Report:               precision    recall  f1-score   support

           0       0.22      0.56      0.31      1000
           1       0.85      0.54      0.66      4470

    accuracy                           0.55      5470
   macro avg       0.53      0.55      0.49      5470
weighted avg       0.73      0.55      0.60      5470



## Model Results & Notes

**Linear Regression:** 99.7% accuracy predicting loan amounts - works great, ready to use.

**Logistic Regression:** 
- Original: 81% accuracy but missed 99% of bad loans (useless for lending)
- With class balancing: 55% accuracy but catches 56% of bad loans (much better)

**Way Forward:**
- Use linear model for amount predictions
- Improve logistic model with more features (payment history, income data)
- Try ensemble methods (Random Forest, XGBoost) for better classification
- Consider external credit data for production use

**Bottom line:** One model ready, one needs more work but heading in right direction.