- Read & clean the dataset.
- Import pandas for data handling, and the scikit-learn tools used below: train/test splitting, preprocessing (scaling, one-hot encoding), models, grid search, and evaluation metrics.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

- Loads the dataset, shows the first few rows and info about datatypes and nulls.

In [2]:
# Path of the raw loan-approval CSV (relative, so it is resolved against the
# kernel's current working directory).
DATA_PATH = 'loan_approval_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows to sanity-check the columns and raw value formats.
df.head(n=5)

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default,Loan_Approved
0,56,136748,584,38209,36 months,Employed,Owned,Yes,Yes
1,46,25287,815,27424,24 months,Self-Employed,Rented,No,Yes
2,32,146593,398,42396,12 months,Unemployed,Rented,Yes,Yes
3,60,54387,696,11370,24 months,Unemployed,Owned,No,No
4,25,28512,788,14528,12 months,Employed,Owned,No,No


In [4]:
# Summarise column dtypes and non-null counts; this is where missing values
# and text columns that still need conversion show up.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                1000 non-null   int64 
 1   Salary             1000 non-null   int64 
 2   Credit_Score       1000 non-null   int64 
 3   Loan_Amount        1000 non-null   int64 
 4   Loan_Term          1000 non-null   object
 5   Employment_Status  1000 non-null   object
 6   Residence_Type     1000 non-null   object
 7   Previous_Default   1000 non-null   object
 8   Loan_Approved      1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


- Removes the ' months' suffix from the Loan_Term column and converts it to int, then encodes the Loan_Approved target as 1 (Yes) / 0 (No).

In [5]:
# Strip the ' months' suffix and store Loan_Term as an integer number of months.
# The leading astype(str) keeps this cell re-runnable: once the column has been
# converted, the .str accessor alone would raise AttributeError on the integer
# column, whereas with the cast the suffix removal is simply a no-op.
df['Loan_Term'] = df['Loan_Term'].astype(str).str.removesuffix(' months').astype('int')

In [6]:
# Encode the target as 1 (approved) / 0 (not approved).
# Series.map turns every value missing from the dict into NaN, so:
#   * the identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run after
#     the column has already been encoded, instead of wiping it to NaN;
#   * the explicit check surfaces unexpected labels immediately rather than
#     letting NaNs reach the model-fitting step.
_approval_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
_approved = df['Loan_Approved'].map(_approval_codes)
if _approved.isna().any():
    _unexpected = df.loc[_approved.isna(), 'Loan_Approved'].unique().tolist()
    raise ValueError(f"Unexpected Loan_Approved values: {_unexpected}")
df['Loan_Approved'] = _approved.astype('int64')

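- Separates the features (X) from the target (y), groups the feature columns into categorical and numerical, and then splits the data into training and test sets (80/20).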

In [7]:
# Target vector and feature matrix.
y = df['Loan_Approved']
X = df.drop(columns=['Loan_Approved'])

# Feature names grouped by dtype: object columns are handled as categorical
# (one-hot encoded later), everything else as numerical (scaled later).
num_cols = X.select_dtypes(exclude="object").columns
cat_cols = X.select_dtypes(include="object").columns

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

- Initializes a scaler for numerical data and one-hot encoder for categorical data.

In [9]:
scaler = StandardScaler()
encode = OneHotEncoder(sparse_output=False)
# sparse_output=False makes the encoder return a dense NumPy array directly,
# so the sparse matrix does not have to be converted afterwards with .toarray().

- Fits and scales numerical features in training data.

In [10]:
# Standardise the numerical features, fitting the scaler on the training rows only.
# The raw values are read from X (which is never modified) via xtrain's index
# labels rather than from xtrain itself. On a first run this is the same data;
# on a re-run it prevents the scaler from being re-fitted on already-standardised
# values (which would leave it with mean ~0 / scale ~1 and silently stop it from
# scaling xtest). Relies on the index labels being unique, which holds for the
# RangeIndex reported by df.info().
xtrain[num_cols] = scaler.fit_transform(X.loc[xtrain.index, num_cols])
xtrain

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default
29,0.217341,0.001268,-0.440664,-0.790774,-0.418173,Self-Employed,Rented,No
535,-0.222472,0.049597,-0.555050,-1.281818,0.473929,Employed,Mortgage,No
695,0.290643,-0.930762,1.179804,0.940788,0.473929,Employed,Mortgage,No
557,0.437247,-0.261695,1.478479,1.030343,-1.310275,Employed,Mortgage,Yes
836,0.363945,0.409350,-0.459728,0.360422,0.473929,Unemployed,Mortgage,No
...,...,...,...,...,...,...,...,...
106,0.803758,0.867453,0.715905,0.336618,0.473929,Employed,Rented,No
270,-1.102097,1.308093,-1.196882,-0.371685,1.366032,Self-Employed,Mortgage,Yes
860,1.683383,1.291202,0.677777,-1.447282,0.473929,Unemployed,Rented,No
435,-1.102097,-0.976906,1.681831,1.425861,0.473929,Unemployed,Mortgage,No


- Encodes categorical features and appends the encoded columns to xtrain. Then drops original categorical columns.

In [11]:
# One-hot encode the categorical features, fitting the encoder on training rows only.
# The rows are taken from X (never modified) by xtrain's index labels, in xtrain's
# row order, so this cell still works when re-run after the original categorical
# columns have been dropped from xtrain (reading xtrain[cat_cols] would then raise
# KeyError).
obj_df = encode.fit_transform(X.loc[xtrain.index, cat_cols])

In [12]:
# Attach the one-hot columns to xtrain under the encoder's generated names
# (e.g. Employment_Status_Employed). obj_df is a dense array, so values are
# matched to xtrain by row position, not by index label.
xtrain[encode.get_feature_names_out()] = obj_df

In [13]:
# Remove the original text columns now that their one-hot versions are present.
# A non-inplace drop with errors='ignore' makes the cell re-runnable: a second
# execution is a no-op instead of raising KeyError for columns already removed.
xtrain = xtrain.drop(columns=cat_cols, errors='ignore')

In [14]:
# Preview the fully pre-processed training features: scaled numerical columns
# followed by the one-hot encoded categorical columns.
xtrain

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes
29,0.217341,0.001268,-0.440664,-0.790774,-0.418173,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
535,-0.222472,0.049597,-0.555050,-1.281818,0.473929,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
695,0.290643,-0.930762,1.179804,0.940788,0.473929,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
557,0.437247,-0.261695,1.478479,1.030343,-1.310275,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
836,0.363945,0.409350,-0.459728,0.360422,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.803758,0.867453,0.715905,0.336618,0.473929,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
270,-1.102097,1.308093,-1.196882,-0.371685,1.366032,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
860,1.683383,1.291202,0.677777,-1.447282,0.473929,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
435,-1.102097,-0.976906,1.681831,1.425861,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


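- Complete the pre-processing for the test set: transform xtest with the scaler and encoder that were already fitted on the training data (they are not re-fitted on test data), then train and evaluate the model.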

In [15]:
# Scale the test features with the statistics learned from the training set
# (transform only, no fitting, so no information leaks from the test rows).
# The raw values are read from X (never modified) via xtest's index labels, so
# re-running this cell does not scale already-scaled values a second time.
# Relies on the index labels being unique, which holds for the RangeIndex
# reported by df.info().
xtest[num_cols] = scaler.transform(X.loc[xtest.index, num_cols])
xtest

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default
521,-1.175400,-0.953144,0.061363,0.156188,0.473929,Self-Employed,Mortgage,No
737,-0.222472,0.625623,-1.686200,-0.144268,-1.310275,Unemployed,Mortgage,No
740,-0.515680,1.127891,0.893839,0.544960,-0.418173,Self-Employed,Rented,Yes
660,-0.222472,-0.634877,1.554736,-1.314461,-0.418173,Employed,Rented,No
411,1.463477,0.347825,0.658712,0.318939,-1.310275,Unemployed,Rented,No
...,...,...,...,...,...,...,...,...
408,0.950362,1.323422,-0.815596,1.292342,0.473929,Unemployed,Mortgage,No
332,-0.515680,1.601922,0.906549,1.537592,-1.310275,Employed,Mortgage,Yes
208,-0.075868,-1.659064,-1.635362,-1.584290,0.473929,Unemployed,Rented,Yes
613,-1.395306,0.128819,1.001870,1.161769,0.473929,Unemployed,Mortgage,Yes


In [16]:
# One-hot encode the test categoricals with the encoder fitted on the training data.
# The rows are taken from X (never modified) by xtest's index labels, in xtest's
# row order, so the cell still works when re-run after the original categorical
# columns have been dropped from xtest.
objt_df = encode.transform(X.loc[xtest.index, cat_cols])

In [17]:
# Attach the one-hot columns to xtest under the encoder's generated names, so
# the test set has the same columns as the training set. objt_df is a dense
# array, so values are matched to xtest by row position.
xtest[encode.get_feature_names_out()] = objt_df

In [18]:
# Remove the original text columns from the test set now that their one-hot
# versions are present. A non-inplace drop with errors='ignore' makes the cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
xtest = xtest.drop(columns=cat_cols, errors='ignore')
xtest

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status_Employed,Employment_Status_Self-Employed,Employment_Status_Unemployed,Residence_Type_Mortgage,Residence_Type_Owned,Residence_Type_Rented,Previous_Default_No,Previous_Default_Yes
521,-1.175400,-0.953144,0.061363,0.156188,0.473929,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
737,-0.222472,0.625623,-1.686200,-0.144268,-1.310275,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
740,-0.515680,1.127891,0.893839,0.544960,-0.418173,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
660,-0.222472,-0.634877,1.554736,-1.314461,-0.418173,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
411,1.463477,0.347825,0.658712,0.318939,-1.310275,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0.950362,1.323422,-0.815596,1.292342,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
332,-0.515680,1.601922,0.906549,1.537592,-1.310275,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
208,-0.075868,-1.659064,-1.635362,-1.584290,0.473929,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
613,-1.395306,0.128819,1.001870,1.161769,0.473929,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# Base estimator; the fixed seed keeps tree construction reproducible
# (this matters in particular for splitter='random').
model = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid to search exhaustively:
# 2 * 4 * 2 * 5 * 5 = 400 candidate combinations.
params = dict(
    criterion=['entropy', 'gini'],
    max_depth=[10, 30, 50, 100],
    splitter=['best', 'random'],
    min_samples_split=[2, 3, 5, 7, 10],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

In [20]:
# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy
# and run on all available cores.
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
gridsearch.fit(xtrain, ytrain)

# Winning hyper-parameter combination.
gridsearch.best_params_

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'splitter': 'random'}

In [21]:
# Cross-validation summary of the winning configuration.
best_params = gridsearch.best_params_
best_cv_accuracy = gridsearch.best_score_
print("Best Params:", best_params)
print("Best CV Accuracy:", best_cv_accuracy)

# Score the best estimator found by the search on the held-out test set.
bmodel = gridsearch.best_estimator_
test_accuracy = bmodel.score(xtest, ytest)
print("Accuracy:", test_accuracy)

Best Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'random'}
Best CV Accuracy: 0.53375
Accuracy: 0.53


In [22]:
# Predicted classes for the held-out test set.
ypred = bmodel.predict(xtest)

# Confusion matrix of true labels (ytest) against predictions (ypred).
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[56, 39],
       [55, 50]])

In [23]:
# Precision with class 0 (not approved) treated as the positive class.
precision_score(y_true=ytest, y_pred=ypred, pos_label=0)

0.5045045045045045

In [24]:
# Precision with class 1 (approved) treated as the positive class.
precision_score(y_true=ytest, y_pred=ypred, pos_label=1)

0.5617977528089888

In [25]:
# Recall with class 0 (not approved) treated as the positive class.
recall_score(y_true=ytest, y_pred=ypred, pos_label=0)

0.5894736842105263

In [26]:
# Recall with class 1 (approved) treated as the positive class.
recall_score(y_true=ytest, y_pred=ypred, pos_label=1)

0.47619047619047616

In [27]:
# F1 score with class 0 (not approved) treated as the positive class.
f1_score(y_true=ytest, y_pred=ypred, pos_label=0)

0.5436893203883495

In [28]:
# F1 score with class 1 (approved) treated as the positive class.
f1_score(y_true=ytest, y_pred=ypred, pos_label=1)

0.5154639175257731

In [29]:
# Overall fraction of test rows classified correctly.
accuracy_score(y_true=ytest, y_pred=ypred)

0.53