In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,precision_score,recall_score

In [8]:
# Load the bank-loan dataset from CSV into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file location exists.
csv_path = r'C:\Users\vchan\OneDrive\Desktop\Machine Learning\bank_loan_data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,income,credit_score,loan_amount,existing_loans,default
0,59,45426,527,51418,3,1
1,49,37772,401,460433,1,1
2,35,123754,716,105699,0,0
3,63,23712,515,95140,3,1
4,28,21367,706,479062,0,0
...,...,...,...,...,...,...
1995,54,145074,742,222995,4,0
1996,38,123065,629,193262,3,0
1997,33,80011,550,445977,3,0
1998,61,58635,422,298313,4,1


## Step 2
### Splitting the data into training, validation, and test sets

In [10]:
# Target vector: the `default` column.
y = df['default']
# Feature matrix: every remaining column of the loaded frame.
X = df.drop(columns=['default'])

In [12]:
# Hold out 30% of the rows as the test set. Stratifying on y keeps the
# class proportions of `default` the same in both parts.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [14]:
# Split the remaining (non-test) pool into training and validation sets:
# 20% of the pool becomes validation, again stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    stratify=y_temp,
    random_state=42,
)

In [15]:
# Class proportions in the combined train+validation pool (y_temp).
# NOTE(review): y_train itself is never inspected; confirm the pool is the
# split that was meant to be checked here.
temp_class_share = y_temp.value_counts(normalize=True)
print(temp_class_share)

default
0    0.642143
1    0.357857
Name: proportion, dtype: float64


In [16]:
# Class proportions in the validation labels, to confirm stratification held.
val_class_share = y_val.value_counts(normalize=True)
print(val_class_share)

default
0    0.642857
1    0.357143
Name: proportion, dtype: float64


In [17]:
# Class proportions in the test labels, to confirm stratification held.
test_class_share = y_test.value_counts(normalize=True)
print(test_class_share)

default
0    0.641667
1    0.358333
Name: proportion, dtype: float64


## Step 3
### Baseline assumption: reject every customer, i.e. predict '1' (default) for all of them

In [19]:
# Naive baseline: predict class 1 for every validation row
# (a float array of ones, one entry per validation label).
y_val_baseline = np.full(len(y_val), 1.0)
y_val_baseline

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [22]:
# Score the predict-all-ones baseline on the validation labels:
# confusion matrix, precision and recall, printed one per line.
cn_base, pr_base, rc_base = (
    confusion_matrix(y_val, y_val_baseline),
    precision_score(y_val, y_val_baseline),
    recall_score(y_val, y_val_baseline),
)
print(cn_base, pr_base, rc_base, sep="\n")

[[  0 180]
 [  0 100]]
0.35714285714285715
1.0


## Step 4
### Building a logistic regression model to compare against the baseline

In [24]:
# Monetary columns that get a log(1 + x) transform before scaling.
LOG_COLS = ['income', 'loan_amount']


def log1p_columns(frame, cols):
    """Return a new DataFrame with ``np.log1p`` applied to the given columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as ``frame``; the listed columns hold
        ``log1p`` of their original values. ``frame`` itself is not modified.
    """
    return frame.assign(**{col: np.log1p(frame[col]) for col in cols})


# Rebind each split to a transformed copy instead of assigning columns in
# place: the frames come out of train_test_split, and writing into them
# directly is the pattern pandas flags as chained assignment.
# Run this cell once per fresh split; re-running it would apply log1p again
# to already-transformed values.
X_train = log1p_columns(X_train, LOG_COLS)
X_val = log1p_columns(X_val, LOG_COLS)
X_test = log1p_columns(X_test, LOG_COLS)

In [25]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

In [26]:
# Logistic regression with library-default settings, trained on the scaled
# training split. The fit call is the last expression so the fitted estimator
# is displayed as the cell output.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [27]:
# Hard class predictions on the scaled validation split, using the model's
# built-in decision rule.
y_val_pred = model.predict(X=X_val_scaled)

In [28]:
# Score the model's validation predictions with the same three metrics used
# for the baseline: confusion matrix, precision and recall, one per line.
cm, pc, rc = (
    confusion_matrix(y_val, y_val_pred),
    precision_score(y_val, y_val_pred),
    recall_score(y_val, y_val_pred),
)
print(cm, pc, rc, sep="\n")

[[154  26]
 [ 61  39]]
0.6
0.39


## Step 5
### Threshold tuning to find the best decision threshold

In [30]:
# Predicted class probabilities for the validation split; keep only the
# second column (index 1), which is the one thresholded in the cells below.
val_class_probs = model.predict_proba(X_val_scaled)
y_val_proba = val_class_probs[:, 1]
y_val_proba

array([0.74733559, 0.40038989, 0.3449716 , 0.48039053, 0.15396108,
       0.53210677, 0.38434424, 0.40870551, 0.73437815, 0.65535917,
       0.41145493, 0.48248918, 0.55117199, 0.45945006, 0.34712243,
       0.15629841, 0.58053244, 0.20518195, 0.31916041, 0.35087979,
       0.33346681, 0.1502884 , 0.42069841, 0.16239554, 0.47415659,
       0.3197913 , 0.15564086, 0.32481431, 0.1201262 , 0.78977209,
       0.56245067, 0.45618658, 0.42258478, 0.29649046, 0.12577308,
       0.55459221, 0.624602  , 0.62120615, 0.41719236, 0.21030689,
       0.37668442, 0.42502809, 0.15023019, 0.64137603, 0.3109935 ,
       0.12793265, 0.57640545, 0.41421858, 0.4000773 , 0.39303507,
       0.52686539, 0.19294479, 0.56062176, 0.11756196, 0.30202055,
       0.54084145, 0.75601127, 0.41411986, 0.64226587, 0.26862257,
       0.52278631, 0.28724349, 0.31371439, 0.20183312, 0.56340408,
       0.16719259, 0.530878  , 0.52313288, 0.63020341, 0.3703936 ,
       0.38227957, 0.22057988, 0.20067804, 0.22049085, 0.25606

### Trying with threshold 0.7

In [31]:
# Predict class 1 only when the probability is strictly above 0.70, then
# build the validation confusion matrix for that stricter rule.
y_val_70 = np.greater(y_val_proba, 0.70).astype(int)
cm_70 = confusion_matrix(y_true=y_val, y_pred=y_val_70)
cm_70

array([[177,   3],
       [ 94,   6]])

#### Raising the threshold to 0.7 increased the false negatives from 61 to 94, so we will now lower the threshold instead.

In [32]:
# Predict class 1 whenever the probability is strictly above 0.30, then
# build the validation confusion matrix for that looser rule.
y_val_30 = np.greater(y_val_proba, 0.30).astype(int)
cm_30 = confusion_matrix(y_true=y_val, y_pred=y_val_30)
cm_30

array([[89, 91],
       [13, 87]])

#### This is better for limiting risk: false negatives drop to 13, although false positives rise to 91.

## Step 6
### Comparing thresholds (a practice step)

In [34]:
# Candidate decision thresholds to compare on the validation set.
thresholds = [0.2, 0.3, 0.4, 0.5]
results = []

for t in thresholds:
    # Strict ">" matches the comparison used by the other threshold cells in
    # this notebook (0.70, 0.30 and the final test-set evaluation), so the
    # threshold is selected with the same rule that is later applied.
    preds = (y_val_proba > t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 regardless of which classes happen
    # to appear, so ravel() always unpacks into exactly four counts.
    tn, fp, fn, tp = confusion_matrix(y_val, preds, labels=[0, 1]).ravel()
    # Store plain Python ints so the displayed list is not cluttered with
    # np.int64(...) wrappers.
    results.append({
        "threshold": t,
        "FN (bad approved)": int(fn),
        "FP (good rejected)": int(fp),
    })

results

[{'threshold': 0.2,
  'FN (bad approved)': np.int64(5),
  'FP (good rejected)': np.int64(134)},
 {'threshold': 0.3,
  'FN (bad approved)': np.int64(13),
  'FP (good rejected)': np.int64(91)},
 {'threshold': 0.4,
  'FN (bad approved)': np.int64(35),
  'FP (good rejected)': np.int64(54)},
 {'threshold': 0.5,
  'FN (bad approved)': np.int64(61),
  'FP (good rejected)': np.int64(26)}]

## Step 7
### Evaluating the model on the test set

In [37]:
# Predicted class probabilities for the test split; keep only the second
# column (index 1), matching what was done for the validation split.
test_class_probs = model.predict_proba(X_test_scaled)
y_test_proba = test_class_probs[:, 1]
y_test_proba

array([0.18853372, 0.22906526, 0.30852963, 0.28101027, 0.50036693,
       0.44628465, 0.72966502, 0.27468006, 0.30499732, 0.27132404,
       0.13228127, 0.1870501 , 0.11320291, 0.39531911, 0.37583707,
       0.07058338, 0.56869471, 0.50215431, 0.51476337, 0.70461871,
       0.52683809, 0.5676175 , 0.49384599, 0.19574282, 0.47481329,
       0.25191851, 0.32726082, 0.57705351, 0.40017853, 0.16855737,
       0.42259723, 0.15325205, 0.16524008, 0.62982471, 0.11311448,
       0.23817845, 0.35051366, 0.15182221, 0.71928498, 0.37265894,
       0.38895838, 0.24522192, 0.18110502, 0.15557849, 0.46336581,
       0.61067212, 0.28510456, 0.6788181 , 0.1924039 , 0.27387028,
       0.20999224, 0.4423148 , 0.55516245, 0.62569821, 0.21177543,
       0.53741478, 0.45416162, 0.52536132, 0.66652862, 0.51841326,
       0.48665921, 0.39024784, 0.27562189, 0.76699138, 0.34503758,
       0.34355786, 0.41741021, 0.60767075, 0.17892087, 0.40361577,
       0.44896292, 0.1071568 , 0.29642587, 0.49389605, 0.47539

In [39]:
# Apply the 0.3 cut-off to the test probabilities (strictly above 0.3 means
# class 1) and show the resulting test-set confusion matrix.
y_test_pred = np.greater(y_test_proba, 0.3).astype(int)
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[212, 173],
       [ 48, 167]])

## “At threshold 0.3, the model reduces bad approvals significantly compared to the default logistic regression threshold of 0.5, while still allowing the business to approve loans. Risk is controlled and acceptable.”