In [56]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn import set_config
set_config(transform_output='pandas')

In [57]:
df = pd.read_csv('Data/loan_approval.csv')
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [58]:
df['loan_status'] = df['loan_status'].str.strip()

In [59]:
df.describe()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [60]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [61]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_cols = X_train.select_dtypes(include='object').columns
processor = make_column_transformer((ohe, cat_cols), remainder='passthrough',
                                    verbose_feature_names_out=False)
processor.fit_transform(X_train)

Unnamed: 0,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
3977,1.0,0.0,0.0,1.0,5,7600000,18000000,12,452,7900000,4300000,23800000,4700000
2425,0.0,1.0,1.0,0.0,0,6000000,22900000,16,302,12200000,6500000,17700000,7100000
1545,0.0,1.0,0.0,1.0,2,8200000,27800000,16,827,11500000,8700000,26000000,10700000
173,0.0,1.0,0.0,1.0,2,7400000,28200000,8,376,21900000,5000000,17400000,5200000
3405,1.0,0.0,0.0,1.0,1,3400000,7000000,4,534,3100000,1600000,13000000,4000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444,1.0,0.0,0.0,1.0,0,1300000,4700000,16,530,3200000,1000000,3800000,800000
466,0.0,1.0,0.0,1.0,5,500000,1800000,18,411,1000000,500000,1300000,200000
3092,0.0,1.0,0.0,1.0,4,8600000,20600000,16,449,10800000,10600000,28700000,5400000
3772,0.0,1.0,0.0,1.0,2,7000000,21400000,12,541,3700000,1300000,19800000,9600000


In [62]:
rf = RandomForestClassifier(random_state=42)
rf_pipe = make_pipeline(processor, clf)

In [63]:
rf_pipe.fit(X_train, y_train)
print(classification_report(y_test, rf_pipe.predict(X_test)))

              precision    recall  f1-score   support

    Approved       0.98      0.99      0.99       678
    Rejected       0.99      0.97      0.98       390

    accuracy                           0.98      1068
   macro avg       0.98      0.98      0.98      1068
weighted avg       0.98      0.98      0.98      1068



In [64]:
logreg =  LogisticRegression()
logreg_pipe = make_pipeline(processor, logreg)
logreg_pipe.fit(X_train, y_train)
print(classification_report(y_test, logreg_pipe.predict(X_test)))

              precision    recall  f1-score   support

    Approved       0.64      0.99      0.78       678
    Rejected       0.50      0.01      0.03       390

    accuracy                           0.63      1068
   macro avg       0.57      0.50      0.40      1068
weighted avg       0.59      0.63      0.50      1068



In [65]:
import joblib

# rf_dump = {'RF_pipe': rf_pipe}
# logreg_dump = {'logreg_pipe': logreg_pipe}
joblib.dump(rf_pipe, 'Models/RF_pipe.joblib')
joblib.dump(logreg_pipe, 'Models/logreg_pipe.joblib')


['Models/logreg_pipe.joblib']

In [66]:
joblib.dump([X_train, y_train], 'Data/train.joblib')
joblib.dump([X_test, y_test], 'Data/test.joblib')

['Data/test.joblib']

In [67]:
pathlib = {"models": {'RF':'Models/RF_Pipe.joblib',
                      'logreg':'Models/logreg_pipe.joblib'},
           "data": {"train":'Data/train.joblib',
                    "test":'Data/test.joblib'}}

joblib.dump(pathlib, 'pathlib.joblib')



['pathlib.joblib']

In [68]:
df.loc[[0]]

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
