In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import joblib


In [28]:
import os
os.listdir("../data")


['loan_data.csv', 'transactions.csv']

In [29]:
loan_df = pd.read_csv("../data/loan_data.csv")
print("Shape:", loan_df.shape)
loan_df.head()


Shape: (24000, 7)


Unnamed: 0,Text,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval
0,I need a loan to pay for an international vaca...,26556,581,8314,79.26,employed,Rejected
1,I want to make home improvements like installi...,197392,389,111604,22.14,employed,Rejected
2,"I need a loan for home renovation, including a...",44561,523,34118,45.44,employed,Rejected
3,I need funds to buy new furniture and applianc...,190363,729,118757,10.22,unemployed,Rejected
4,I need a loan to start a small business.,61853,732,19210,44.13,employed,Approved


In [30]:
# Drop Text column permanently (NOT used in UI)
if "Text" in loan_df.columns:
    loan_df = loan_df.drop("Text", axis=1)


In [31]:
loan_df.info()
loan_df.isnull().sum()


<class 'pandas.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Income             24000 non-null  int64  
 1   Credit_Score       24000 non-null  int64  
 2   Loan_Amount        24000 non-null  int64  
 3   DTI_Ratio          24000 non-null  float64
 4   Employment_Status  24000 non-null  str    
 5   Approval           24000 non-null  str    
dtypes: float64(1), int64(3), str(2)
memory usage: 1.1 MB


Income               0
Credit_Score         0
Loan_Amount          0
DTI_Ratio            0
Employment_Status    0
Approval             0
dtype: int64

In [32]:
loan_df.info()
loan_df.isnull().sum()


<class 'pandas.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Income             24000 non-null  int64  
 1   Credit_Score       24000 non-null  int64  
 2   Loan_Amount        24000 non-null  int64  
 3   DTI_Ratio          24000 non-null  float64
 4   Employment_Status  24000 non-null  str    
 5   Approval           24000 non-null  str    
dtypes: float64(1), int64(3), str(2)
memory usage: 1.1 MB


Income               0
Credit_Score         0
Loan_Amount          0
DTI_Ratio            0
Employment_Status    0
Approval             0
dtype: int64

In [33]:
# Numeric columns → median
num_cols = loan_df.select_dtypes(include=['int64','float64']).columns
for col in num_cols:
    loan_df[col].fillna(loan_df[col].median(), inplace=True)

# Categorical columns → mode
cat_cols = loan_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    loan_df[col].fillna(loan_df[col].mode()[0], inplace=True)

loan_df.isnull().sum()


C:\Users\Dell\AppData\Local\Temp\ipykernel_12008\3894894409.py:4: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  loan_df[col].fillna(loan_df[col].median(), inplace=True)
C:\Users\Dell\AppData\Local\Temp\ipykernel_12008\3894894409.py:4: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assi

Income               0
Credit_Score         0
Loan_Amount          0
DTI_Ratio            0
Employment_Status    0
Approval             0
dtype: int64

In [34]:
le = LabelEncoder()
for col in cat_cols:
    loan_df[col] = le.fit_transform(loan_df[col])

loan_df.head()


Unnamed: 0,Income,Credit_Score,Loan_Amount,DTI_Ratio,Employment_Status,Approval
0,26556,581,8314,79.26,0,1
1,197392,389,111604,22.14,0,1
2,44561,523,34118,45.44,0,1
3,190363,729,118757,10.22,1,1
4,61853,732,19210,44.13,0,0


In [35]:
X = loan_df.drop("Approval", axis=1)
y = loan_df["Approval"]

print("Features:", X.columns.tolist())
print("Target:", y.name)


Features: ['Income', 'Credit_Score', 'Loan_Amount', 'DTI_Ratio', 'Employment_Status']
Target: Approval


In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [37]:
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [38]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)


In [39]:
from sklearn.metrics import confusion_matrix


In [40]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

log_pred = log_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, log_pred))
print(confusion_matrix(y_test, log_pred))
print(classification_report(y_test, log_pred))


Logistic Regression Accuracy: 0.9341666666666667
[[ 621  166]
 [ 150 3863]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       787
           1       0.96      0.96      0.96      4013

    accuracy                           0.93      4800
   macro avg       0.88      0.88      0.88      4800
weighted avg       0.93      0.93      0.93      4800



In [41]:
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_model.fit(X_train, y_train)

dt_pred = dt_model.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))


Decision Tree Accuracy: 0.9558333333333333
[[ 703   84]
 [ 128 3885]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       787
           1       0.98      0.97      0.97      4013

    accuracy                           0.96      4800
   macro avg       0.91      0.93      0.92      4800
weighted avg       0.96      0.96      0.96      4800



In [42]:
joblib.dump(log_model, "../models/loan_classifier.pkl")
joblib.dump(scaler, "../models/loan_scaler.pkl")

print("Saved: loan_classifier.pkl and loan_scaler.pkl")


Saved: loan_classifier.pkl and loan_scaler.pkl


In [43]:
# Take one sample from original data
sample = X.iloc[[0]]

# Scale with saved scaler
sample_scaled = scaler.transform(sample)

prediction = log_model.predict(sample_scaled)
print("Loan Approval Prediction (1=Approved, 0=Rejected):", prediction[0])


Loan Approval Prediction (1=Approved, 0=Rejected): 1
