In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score

In [3]:
df = pd.read_csv(r'C:\Users\vchan\OneDrive\Desktop\45-days-of-ml\datasets\Cleaned_finance.csv')
df

Unnamed: 0,Dependents,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,ApplicantIncome_log,CoapplicantIncome_log,LoanAmount_log
0,-0.737806,0.273231,1.0,1,1,0,0,0,0,1,0.516186,-1.107783,-0.012803
1,0.253470,0.273231,1.0,0,1,1,0,0,0,0,0.137806,0.782158,-0.012803
2,-0.737806,0.273231,1.0,1,1,1,0,1,0,1,-0.519479,-1.107783,-1.348663
3,-0.737806,0.273231,1.0,1,1,1,1,0,0,1,-0.751605,0.897526,-0.143351
4,-0.737806,0.273231,1.0,1,1,0,0,0,0,1,0.555727,-1.107783,0.182981
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.737806,0.273231,1.0,1,0,0,0,0,0,0,-0.572062,-1.107783,-1.201901
610,2.236021,-2.522836,1.0,1,1,1,0,0,0,0,-0.032679,-1.107783,-2.350110
611,0.253470,0.273231,1.0,1,1,1,0,0,0,1,1.015921,0.308483,1.368737
612,1.244745,0.273231,1.0,1,1,1,0,0,0,1,0.918972,-1.107783,0.755185


In [4]:
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,stratify=y) #Keeps approval/rejection ratio same, Prevents biased evaluation

In [5]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)

In [7]:
print('The accuracy is:',accuracy)

The accuracy is: 0.7922077922077922


In [8]:
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient',ascending=False)

coef_df

Unnamed: 0,Feature,Coefficient
2,Credit_History,1.86986
7,Property_Area_Semiurban,0.691969
4,Married_Yes,0.403688
10,CoapplicantIncome_log,0.193834
8,Property_Area_Urban,0.143344
0,Dependents,0.083542
9,ApplicantIncome_log,0.009155
1,Loan_Amount_Term,0.0003
11,LoanAmount_log,-0.029651
6,Self_Employed_Yes,-0.062379


In [9]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[24, 24],
       [ 8, 98]])

In [10]:
print('Precision defination: Of all loans the model approved, how many were actually correct?')
precision = precision_score(y_test, y_pred)
print('Of all loans the model approved, how many were actually correct?',precision)

Precision defination: Of all loans the model approved, how many were actually correct?
Of all loans the model approved, how many were actually correct? 0.8032786885245902


In [11]:
print('Recall defination: Out of all people who SHOULD be approved, how many did we approve?')
recall = recall_score(y_test, y_pred)
print('Out of all people who SHOULD be approved, how many did we approve?',recall)


Recall defination: Out of all people who SHOULD be approved, how many did we approve?
Out of all people who SHOULD be approved, how many did we approve? 0.9245283018867925


In [12]:
print('F1 score defination: Balance between precision and recall')
f1 = f1_score(y_test, y_pred)
f1

F1 score defination: Balance between precision and recall


0.8596491228070176

## Day 4: Thresholds

In [13]:
y_prob = model.predict_proba(X_test)[:, 1]
y_prob

array([0.6295    , 0.74838938, 0.37772372, 0.75690522, 0.75319769,
       0.69906085, 0.27946173, 0.39595986, 0.66979577, 0.77380108,
       0.62975141, 0.73995145, 0.80967301, 0.63634696, 0.80977941,
       0.31218183, 0.75775737, 0.81249413, 0.71361217, 0.81310279,
       0.74715352, 0.26134299, 0.77175182, 0.35739649, 0.34659119,
       0.82218823, 0.45805497, 0.71116059, 0.80098258, 0.88161166,
       0.84513579, 0.8348526 , 0.86837783, 0.71284411, 0.2263777 ,
       0.40553967, 0.59264706, 0.39984094, 0.83247199, 0.83083025,
       0.79018602, 0.88035035, 0.71594717, 0.74014709, 0.7633762 ,
       0.6386062 , 0.2086565 , 0.7160803 , 0.71330057, 0.83363024,
       0.88186677, 0.7170876 , 0.62208705, 0.84445188, 0.78731373,
       0.89431466, 0.78758753, 0.81018121, 0.72226097, 0.71328808,
       0.88187371, 0.23664671, 0.77189921, 0.81491796, 0.83444234,
       0.43201211, 0.68072331, 0.8645383 , 0.79037106, 0.71028983,
       0.66719572, 0.74931713, 0.83492679, 0.72162806, 0.75863

## Now when Threshold is 0.5 

In [14]:
y_pred_05 = (y_prob >= 0.5).astype(int)

In [28]:
print(confusion_matrix(y_test,y_pred_05))
print(precision_score(y_test,y_pred_05))
print(recall_score(y_test,y_pred_05))

[[24 24]
 [ 8 98]]
0.8032786885245902
0.9245283018867925


#### Explaination

1. You approve a LOT of people
2. You catch most good customers
3. BUT you approve 24 bad customers

This is risky lending behavior.

## Now when Threshold is 0.7

In [19]:
y_pred_07 = (y_prob >= 0.7).astype(int)

In [27]:
print(confusion_matrix(y_test,y_pred_07))
print(precision_score(y_test,y_pred_07))
print(recall_score(y_test,y_pred_07))

[[33 15]
 [22 84]]
0.8484848484848485
0.7924528301886793


#### Explaination:
1. Fewer bad loans approved
2. You miss more good customers
3. Bank becomes more conservative

This is realistic bank behavior.

## Now when Threshold is 0.8

In [None]:
y_pred_08 = (y_prob >=0.8).astype(int)

In [26]:
print(confusion_matrix(y_test,y_pred_08))
print(precision_score(y_test,y_pred_08))
print(recall_score(y_test,y_pred_08))

[[44  4]
 [56 50]]
0.9259259259259259
0.4716981132075472


#### Explaination
1. Almost no bad loans approved
2. You reject a LOT of good customers
3. Bank is ultra-strict

This is suitable only if:
- Economy is bad
- Bank is in survival mode