- Read & clean the dataset.
- Import pandas for data handling, scikit-learn tools for preprocessing and splitting.

In [224]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

- Loads the dataset, shows the first few rows and info about datatypes and nulls.

In [225]:
df = pd.read_csv('loan_approval_dataset.csv')

In [226]:
df.head()

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default,Loan_Approved
0,56,136748,584,38209,36 months,Employed,Owned,Yes,Yes
1,46,25287,815,27424,24 months,Self-Employed,Rented,No,Yes
2,32,146593,398,42396,12 months,Unemployed,Rented,Yes,Yes
3,60,54387,696,11370,24 months,Unemployed,Owned,No,No
4,25,28512,788,14528,12 months,Employed,Owned,No,No


In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                1000 non-null   int64 
 1   Salary             1000 non-null   int64 
 2   Credit_Score       1000 non-null   int64 
 3   Loan_Amount        1000 non-null   int64 
 4   Loan_Term          1000 non-null   object
 5   Employment_Status  1000 non-null   object
 6   Residence_Type     1000 non-null   object
 7   Previous_Default   1000 non-null   object
 8   Loan_Approved      1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


- Removes ' months' string from the Loan_Term column and converts it to int.

In [228]:
df['Loan_Term'] = df['Loan_Term'].str.removesuffix(' months').astype('int')

In [229]:
df['Loan_Approved'] = df['Loan_Approved'].map({'Yes': 1, 'No': 0})

- Splits columns into categorical and numerical.

In [230]:
X = df.drop('Loan_Approved',axis=1)
y = df['Loan_Approved']
cat_cols = X.select_dtypes(include="object").columns                # categorical columns
num_cols = X.select_dtypes(exclude="object").columns                # numerical columns

In [231]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.2,random_state=42)

- Initializes a scaler for numerical data and one-hot encoder for categorical data.

In [232]:
scaler = StandardScaler()
encode = OneHotEncoder(sparse_output=False)
#  To directly convert to array, we use sparse_output instead of converting matrix to array using 'to_array' function

- Fits and scales numerical features in training data.

In [233]:
xtrain[num_cols] = scaler.fit_transform(xtrain[num_cols])
xtrain

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default
29,0.217341,0.001268,-0.440664,-0.790774,-0.418173,Self-Employed,Rented,No
535,-0.222472,0.049597,-0.555050,-1.281818,0.473929,Employed,Mortgage,No
695,0.290643,-0.930762,1.179804,0.940788,0.473929,Employed,Mortgage,No
557,0.437247,-0.261695,1.478479,1.030343,-1.310275,Employed,Mortgage,Yes
836,0.363945,0.409350,-0.459728,0.360422,0.473929,Unemployed,Mortgage,No
...,...,...,...,...,...,...,...,...
106,0.803758,0.867453,0.715905,0.336618,0.473929,Employed,Rented,No
270,-1.102097,1.308093,-1.196882,-0.371685,1.366032,Self-Employed,Mortgage,Yes
860,1.683383,1.291202,0.677777,-1.447282,0.473929,Unemployed,Rented,No
435,-1.102097,-0.976906,1.681831,1.425861,0.473929,Unemployed,Mortgage,No


- Encodes categorical features and appends the encoded columns to xtrain. Then drops original categorical columns.

In [234]:
obj_df = encode.fit_transform(xtrain[cat_cols])

In [235]:
# obj_df = pd.DataFrame(obj_df,columns=encode.get_feature_names_out())
xtrain[encode.get_feature_names_out()] = obj_df

In [236]:
xtrain.drop(cat_cols,axis=1,inplace=True)

In [237]:
# xtrain.drop(cat_cols,axis=1,inplace=True)
# xtrain = pd.concat([xtrain,obj_df],axis=1)
xtrain

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes
29,0.217341,0.001268,-0.440664,-0.790774,-0.418173,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
535,-0.222472,0.049597,-0.555050,-1.281818,0.473929,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
695,0.290643,-0.930762,1.179804,0.940788,0.473929,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
557,0.437247,-0.261695,1.478479,1.030343,-1.310275,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
836,0.363945,0.409350,-0.459728,0.360422,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.803758,0.867453,0.715905,0.336618,0.473929,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
270,-1.102097,1.308093,-1.196882,-0.371685,1.366032,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
860,1.683383,1.291202,0.677777,-1.447282,0.473929,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
435,-1.102097,-0.976906,1.681831,1.425861,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


- complete pre-processing and model training, transform and update ytest also

In [238]:
xtest[num_cols] = scaler.transform(xtest[num_cols])
xtest

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default
521,-1.175400,-0.953144,0.061363,0.156188,0.473929,Self-Employed,Mortgage,No
737,-0.222472,0.625623,-1.686200,-0.144268,-1.310275,Unemployed,Mortgage,No
740,-0.515680,1.127891,0.893839,0.544960,-0.418173,Self-Employed,Rented,Yes
660,-0.222472,-0.634877,1.554736,-1.314461,-0.418173,Employed,Rented,No
411,1.463477,0.347825,0.658712,0.318939,-1.310275,Unemployed,Rented,No
...,...,...,...,...,...,...,...,...
408,0.950362,1.323422,-0.815596,1.292342,0.473929,Unemployed,Mortgage,No
332,-0.515680,1.601922,0.906549,1.537592,-1.310275,Employed,Mortgage,Yes
208,-0.075868,-1.659064,-1.635362,-1.584290,0.473929,Unemployed,Rented,Yes
613,-1.395306,0.128819,1.001870,1.161769,0.473929,Unemployed,Mortgage,Yes


In [239]:
objt_df = encode.transform(xtest[cat_cols])
# objt_df

In [240]:
xtest[encode.get_feature_names_out()] = objt_df
# xtest

In [241]:
xtest.drop(cat_cols,axis=1,inplace=True)
xtest

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes
521,-1.175400,-0.953144,0.061363,0.156188,0.473929,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
737,-0.222472,0.625623,-1.686200,-0.144268,-1.310275,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
740,-0.515680,1.127891,0.893839,0.544960,-0.418173,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
660,-0.222472,-0.634877,1.554736,-1.314461,-0.418173,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
411,1.463477,0.347825,0.658712,0.318939,-1.310275,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0.950362,1.323422,-0.815596,1.292342,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
332,-0.515680,1.601922,0.906549,1.537592,-1.310275,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
208,-0.075868,-1.659064,-1.635362,-1.584290,0.473929,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
613,-1.395306,0.128819,1.001870,1.161769,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [242]:
model = DecisionTreeClassifier()
model1 = DecisionTreeRegressor()

In [243]:
model.fit(xtrain, ytrain)
model1.fit(xtrain,ytrain)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [244]:
model.score(xtrain,ytrain)

1.0

In [245]:
model1.score(xtrain,ytrain)

1.0

In [246]:
model.score(xtest,ytest)

0.51

In [247]:
model1.score(xtest,ytest)

-1.0250626566416043

In [248]:
ypred = model.predict(xtest)

confusion_matrix(ytest,ypred)

array([[47, 48],
       [50, 55]])

In [249]:
ypred1 = model1.predict(xtest)

confusion_matrix(ytest,ypred1)

array([[42, 53],
       [48, 57]])

In [250]:
precision_score(ytest,ypred,pos_label=0)

0.4845360824742268

In [251]:
precision_score(ytest,ypred1,pos_label=0)

0.4666666666666667

In [252]:
precision_score(ytest,ypred,pos_label=1)

0.5339805825242718

In [253]:
precision_score(ytest,ypred1,pos_label=1)

0.5181818181818182

In [254]:
recall_score(ytest,ypred,pos_label=0)

0.49473684210526314

In [255]:
recall_score(ytest,ypred1,pos_label=0)

0.4421052631578947

In [256]:
recall_score(ytest,ypred,pos_label=1)

0.5238095238095238

In [257]:
recall_score(ytest,ypred1,pos_label=1)

0.5428571428571428

In [258]:
f1_score(ytest,ypred,pos_label=0)

0.4895833333333333

In [259]:
f1_score(ytest,ypred1,pos_label=0)

0.4540540540540541

In [260]:
f1_score(ytest,ypred,pos_label=1)

0.5288461538461539

In [261]:
f1_score(ytest,ypred1,pos_label=1)

0.5302325581395348

In [262]:
accuracy_score(ytest,ypred)

0.51

In [263]:
accuracy_score(ytest,ypred1)

0.495