In [57]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,recall_score,precision_score,f1_score,accuracy_score

In [58]:
# Read the loan-approval records from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='loan_approval_dataset.csv')
df.head(n=5)

Unnamed: 0,Age,Salary,Credit_Score,Loan_Amount,Loan_Term,Employment_Status,Residence_Type,Previous_Default,Loan_Approved
0,56,136748,584,38209,36 months,Employed,Owned,Yes,Yes
1,46,25287,815,27424,24 months,Self-Employed,Rented,No,Yes
2,32,146593,398,42396,12 months,Unemployed,Rented,Yes,Yes
3,60,54387,696,11370,24 months,Unemployed,Owned,No,No
4,25,28512,788,14528,12 months,Employed,Owned,No,No


In [59]:
# Convert 'Loan_Term' from text such as "36 months" to the integer 36.
# Casting to str first keeps the cell idempotent: once the column is already
# integer, the `.str` accessor would otherwise raise when the cell is re-run.
df['Loan_Term'] = (
    df['Loan_Term']
    .astype(str)
    .str.removesuffix(' months')
    .astype('int')
)

In [60]:
# Optional: encode the target as 1/0. Left disabled, so 'Loan_Approved' keeps its
# 'Yes'/'No' string labels (the metric cells below pass pos_label="Yes" / "No").
# df['Loan_Approved'] = df['Loan_Approved'].map({'Yes': 1, 'No': 0})

In [61]:
# Target is the approval label; the features are every remaining column.
y = df['Loan_Approved']
X = df.drop(columns='Loan_Approved')

In [62]:
# Partition the feature names by dtype: object columns are handled as
# categorical, every other column as numeric.
num_cols = X.select_dtypes(exclude=["object"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

In [63]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Transformers shared by the train and test preprocessing cells.
# StandardScaler is applied to the numeric columns.
scaler = StandardScaler()
# Dense output so the encoded array can be placed straight into DataFrame columns.
# handle_unknown="ignore" encodes a category that was not present at fit time as
# all zeros, instead of raising when the test split is transformed.
encode = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [65]:
# Preprocess the training features: scale the numeric columns and one-hot encode
# the categorical ones, fitting both transformers on the training rows only.
#
# The raw rows are looked up in X (which is never modified) through the split's
# index labels. This keeps the cell re-runnable (the in-place version failed on a
# second run because cat_cols had already been dropped) and avoids assigning into
# the slice returned by train_test_split, which can trigger SettingWithCopyWarning.
# Assumes X has unique index labels (true for the default index from read_csv).
xtrain_raw = X.loc[xtrain.index]

train_num = pd.DataFrame(
    scaler.fit_transform(xtrain_raw[num_cols]),
    columns=num_cols,
    index=xtrain_raw.index,
)
obj_df = encode.fit_transform(xtrain_raw[cat_cols])
train_cat = pd.DataFrame(
    obj_df,
    columns=encode.get_feature_names_out(),
    index=xtrain_raw.index,
)

# Same column layout as before: scaled numeric columns, then the encoded columns.
xtrain = pd.concat([train_num, train_cat], axis=1)

In [66]:
# Apply the already-fitted transformers to the test features (transform only, so
# nothing is re-fitted on test data).
#
# The raw rows are looked up in X (which is never modified) through the split's
# index labels. This keeps the cell re-runnable (the in-place version failed on a
# second run because cat_cols had already been dropped) and avoids assigning into
# the slice returned by train_test_split, which can trigger SettingWithCopyWarning.
# Assumes X has unique index labels (true for the default index from read_csv).
xtest_raw = X.loc[xtest.index]

test_num = pd.DataFrame(
    scaler.transform(xtest_raw[num_cols]),
    columns=num_cols,
    index=xtest_raw.index,
)
objt_df = encode.transform(xtest_raw[cat_cols])
test_cat = pd.DataFrame(
    objt_df,
    columns=encode.get_feature_names_out(),
    index=xtest_raw.index,
)

# Same column layout as the training frame: scaled numeric, then encoded columns.
xtest = pd.concat([test_num, test_cat], axis=1)

In [67]:
# Base estimator handed to the grid search; seeded so repeated runs match.
model = DecisionTreeClassifier(
    random_state=42,
)

In [68]:
# Exhaustive search over 2 * 4 * 2 * 5 * 5 = 400 decision-tree configurations,
# each scored by 5-fold cross-validated accuracy (2000 fits in total).
params = dict(
    criterion=['entropy', 'gini'],
    max_depth=[10, 30, 50, 100],
    splitter=['best', 'random'],
    min_samples_split=[2, 3, 5, 7, 10],
    min_samples_leaf=[1, 2, 3, 4, 5],
)
gridsearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    # Train-fold scores are kept so they can be compared with the validation scores.
    return_train_score=True,
)
gridsearch.fit(xtrain, ytrain)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'criterion': ['entropy', 'gini'], 'max_depth': [10, 30, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 3, ...], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,criterion,'gini'
,splitter,'random'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [69]:
# Take the winning tree from the search. The search ran with refit=True, so
# best_estimator_ has already been refit with the best parameters on the data
# passed to gridsearch.fit -- a further model.fit(xtrain, ytrain) is not needed.
model = gridsearch.best_estimator_
model

0,1,2
,criterion,'gini'
,splitter,'random'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [85]:
# best_params_ is only a dict of the winning hyper-parameter values. Unlike
# best_estimator_, it is not a fitted model, so it cannot be used to predict.
gridsearch.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'splitter': 'random'}

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'splitter': 'random'}

In [71]:
# Mean cross-validated score of the best candidate (scoring='accuracy', cv=5).
gridsearch.best_score_

np.float64(0.53375)

In [72]:
# Raw per-candidate results as a dict of arrays. The output is very long; the
# same data is tabulated as a DataFrame in the next cell.
gridsearch.cv_results_

{'mean_fit_time': array([0.02525458, 0.01456299, 0.01968169, 0.01316624, 0.02406206,
        0.01423545, 0.0186923 , 0.00949225, 0.01536107, 0.00876369,
        0.01429195, 0.01004829, 0.01471486, 0.00936489, 0.01657033,
        0.00985475, 0.01762972, 0.00906782, 0.01562476, 0.00937219,
        0.01934376, 0.01300302, 0.02155824, 0.01042619, 0.0169529 ,
        0.0091084 , 0.01529007, 0.01002135, 0.0211586 , 0.00853796,
        0.01508293, 0.01124558, 0.01371431, 0.00933294, 0.01481209,
        0.00865293, 0.04778786, 0.00808887, 0.01611862, 0.03102121,
        0.01886687, 0.00865698, 0.02519288, 0.02111602, 0.02831268,
        0.03340178, 0.02519326, 0.00975838, 0.04414077, 0.00768957,
        0.01615672, 0.00952106, 0.02530293, 0.0120904 , 0.0226512 ,
        0.00780993, 0.02486644, 0.01831675, 0.01866474, 0.00854993,
        0.02438402, 0.00965033, 0.01974444, 0.00896974, 0.02075286,
        0.01151786, 0.02398314, 0.01179566, 0.02342253, 0.00805655,
        0.02088304, 0.00833211,

In [73]:
# Tabulate the search results: one row per candidate, one column per metric/param.
cv_results = gridsearch.cv_results_
result = pd.DataFrame(cv_results)

In [74]:
# Distinct rank values given to the candidates (tied candidates share a rank).
result.loc[:, 'rank_test_score'].unique()

array([377,  53, 381, 164, 392,   6, 372,  67, 366,  50, 373, 172, 385,
        85, 312, 102, 341, 145, 378, 161,  57, 328, 175, 305,  72, 265,
         7, 248,  93, 342,  61, 266, 179, 206, 215,  89, 191, 253,  79,
       272, 118, 221, 142, 194, 176, 212, 293, 188,  20, 369, 393,  38,
       386, 398, 101, 397,  68,   1, 371,  52, 367,  49, 187, 165, 275,
         3,  69, 203, 124, 262, 382], dtype=int32)

In [75]:
# Candidates ordered from best (rank 1) to worst by mean validation score.
result.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_splitter,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
213,0.007663,0.001129,0.004880,0.000867,gini,10,2,3,random,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",...,0.53375,0.019605,1,0.826562,0.850000,0.839063,0.807813,0.821875,0.829063,0.014477
211,0.012634,0.007942,0.004498,0.000491,gini,10,2,2,random,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",...,0.53375,0.019605,1,0.826562,0.850000,0.839063,0.807813,0.821875,0.829063,0.014477
353,0.017382,0.006724,0.007222,0.000138,gini,100,1,3,random,"{'criterion': 'gini', 'max_depth': 100, 'min_s...",...,0.52000,0.038406,3,0.954688,0.942187,0.962500,0.950000,0.953125,0.952500,0.006600
303,0.016713,0.000953,0.008710,0.000505,gini,50,1,3,random,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",...,0.52000,0.038406,3,0.954688,0.942187,0.962500,0.950000,0.953125,0.952500,0.006600
253,0.008675,0.000714,0.004783,0.000844,gini,30,1,3,random,"{'criterion': 'gini', 'max_depth': 30, 'min_sa...",...,0.52000,0.038406,3,0.954688,0.942187,0.962500,0.950000,0.953125,0.952500,0.006600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,0.013221,0.002681,0.005111,0.001205,gini,100,2,5,best,"{'criterion': 'gini', 'max_depth': 100, 'min_s...",...,0.43125,0.042939,393,0.929688,0.925000,0.923438,0.925000,0.918750,0.924375,0.003508
208,0.012179,0.002278,0.005786,0.003073,gini,10,1,10,best,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",...,0.42750,0.016105,397,0.835938,0.776563,0.681250,0.795312,0.812500,0.780313,0.053259
212,0.012085,0.001073,0.004256,0.000393,gini,10,2,3,best,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",...,0.42750,0.022913,398,0.870313,0.787500,0.700000,0.810937,0.837500,0.801250,0.057651
206,0.011357,0.001107,0.004163,0.000919,gini,10,1,7,best,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",...,0.42750,0.027839,398,0.857812,0.795312,0.700000,0.820312,0.843750,0.803438,0.055914


In [76]:
# Predict the held-out rows with the tuned tree.
ypred = model.predict(xtest)

# Rows are the actual labels and columns the predicted labels, in the order "No", "Yes".
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[56, 39],
       [55, 50]])

In [78]:
# Precision with "No" as the class of interest: share of predicted "No" that are truly "No".
precision_score(y_true=ytest, y_pred=ypred, pos_label="No")

0.5045045045045045

- Out of all samples predicted as negative ("No"), about 50% are actually negative.

In [79]:
# Precision with "Yes" as the class of interest: share of predicted "Yes" that are truly "Yes".
precision_score(y_true=ytest, y_pred=ypred, pos_label="Yes")

0.5617977528089888

- Out of all samples predicted as positive ("Yes"), about 56% are actually positive.

In [80]:
# Recall with "No" as the class of interest: share of actual "No" rows predicted as "No".
recall_score(y_true=ytest, y_pred=ypred, pos_label="No")

0.5894736842105263

- out of all actual negatives, 59% are correctly predicted as negatives.

In [81]:
# Recall with "Yes" as the class of interest: share of actual "Yes" rows predicted as "Yes".
recall_score(y_true=ytest, y_pred=ypred, pos_label="Yes")

0.47619047619047616

- Out of all actual positives, about 48% are correctly predicted as positive.

In [82]:
# F1 (harmonic mean of precision and recall) with "No" as the class of interest.
f1_score(y_true=ytest, y_pred=ypred, pos_label="No")

0.5436893203883495

In [83]:
# F1 (harmonic mean of precision and recall) with "Yes" as the class of interest.
f1_score(y_true=ytest, y_pred=ypred, pos_label="Yes")

0.5154639175257731

In [84]:
# Overall share of test rows whose predicted label matches the actual label.
accuracy_score(y_true=ytest, y_pred=ypred)

0.53

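- On the held-out test set (not the entire dataset), 53% of samples are correctly predicted. This is barely above the 52.5% reached by always predicting the majority class (105 of the 200 test samples are "Yes").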

In [90]:
# Disabled check: frequency of each Loan_Term value.
# NOTE(review): the name `class_counts` suggests a class-balance check, but this
# counts the Loan_Term feature; the target column is 'Loan_Approved' -- confirm
# which one was intended.
# class_counts = df['Loan_Term'].value_counts()
# print(class_counts)