## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading Dataset to perform an (EDA)

In [2]:
## loading the cleaned loan status dataset
df = pd.read_csv("cleaned_loan_status_dataset.csv")

## Exploratory Data Analysis
- Perform target variable analysis

In [3]:
## target variable 
df.loan_status.value_counts()

loan_status
0    271
1    110
Name: count, dtype: int64

In [4]:
numeric_cols = df.select_dtypes(exclude=[object])

corr_matrix = numeric_cols.corr()

corr_matrix['loan_status']

Unnamed: 0           0.036862
gender              -0.054757
married             -0.092473
dependents          -0.013701
self_employed        0.019105
applicantincome      0.010167
coapplicantincome   -0.009017
loanamount          -0.041220
loan_amount_term     0.046672
credit_history      -0.453699
loan_status          1.000000
Name: loan_status, dtype: float64

## Build a Validation Framework
- Devide the dataset into:
      - Training set (60%)
      - Validation set (20%)
      - Test set (20%)

In [5]:
## 
df_train_full , df_test = train_test_split(df, test_size=0.2, random_state=11) 
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


print(f'Training dataset: {len(df_train)}')
print(f'Validation dataset: {len(df_valid)}')
print(f'Test dataset: {len(df_test)}')

Training dataset: 228
Validation dataset: 76
Test dataset: 77


In [6]:
y_train = df_train['loan_status'].values
y_valid = df_valid['loan_status'].values
y_test = df_test['loan_status'].values

In [7]:
## Deleting all the loan_status columns from our traning, validation and the test dataset
del df_train['loan_status']
del df_valid['loan_status']
del df_test['loan_status']

## Feature Engineering 
- Dividing our data into numerical and categorical
- perform the one-hot encoding

In [8]:
numerical_features = ['dependents','applicantincome','coapplicantincome','loanamount','loan_amount_term','credit_history']

categorical_features = ['gender','married','education','self_employed','property_area']

In [9]:
## convert the dataframe into dict
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')

valid_dict = df_valid[categorical_features + numerical_features].to_dict(orient='records')

In [10]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [11]:
X_train = dv.transform(train_dict)

X_valid = dv.transform(valid_dict)

## Training The Model

In [12]:
## fitting the training dataset to train the model
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

## Computing predicted truth against ground truth

In [13]:
y_valid_pred = model.predict_proba(X_valid)

In [14]:
y_valid_pred

array([[0.71107195, 0.28892805],
       [0.92313314, 0.07686686],
       [0.88376136, 0.11623864],
       [0.93878074, 0.06121926],
       [0.80230834, 0.19769166],
       [0.76260103, 0.23739897],
       [0.3494753 , 0.6505247 ],
       [0.95464795, 0.04535205],
       [0.26514758, 0.73485242],
       [0.16671078, 0.83328922],
       [0.75899369, 0.24100631],
       [0.66559324, 0.33440676],
       [0.92067664, 0.07932336],
       [0.71005293, 0.28994707],
       [0.85609394, 0.14390606],
       [0.4647433 , 0.5352567 ],
       [0.50844573, 0.49155427],
       [0.7497022 , 0.2502978 ],
       [0.93402048, 0.06597952],
       [0.94367929, 0.05632071],
       [0.61128842, 0.38871158],
       [0.44645578, 0.55354422],
       [0.88001365, 0.11998635],
       [0.85172355, 0.14827645],
       [0.63643667, 0.36356333],
       [0.64994498, 0.35005502],
       [0.15640786, 0.84359214],
       [0.78181322, 0.21818678],
       [0.70878074, 0.29121926],
       [0.93817819, 0.06182181],
       [0.

In [15]:
y_valid_pred = model.predict_proba(X_valid)[:, 1]

In [16]:
loan_status = y_valid_pred >= 0.5

In [17]:
(y_valid == loan_status).mean()

0.8026315789473685

In [18]:
acc_score = accuracy_score(y_valid, loan_status)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 80.3%


## Saving The Model

In [19]:
import pickle

In [20]:
## specifyging where to save the file
with open('loan-model.bin', 'wb') as f_out:
    ## save the model
    pickle.dump((dv,model), f_out)

## Loading The Model

In [21]:
with open('loan-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

## Testing our model against applicant information

In [22]:
## applicant Data
applicant = {
 'customerid': 0,
 'gender': 'Female',
 'married': 'No',
 'dependents': 2,
 'education': 'Graduate',
 'self_employed': 'Yes',    
 'applicantincome': 50083.0,
 'coapplicantincome': 10.0,
 'loanamount': 100.0,
 'loan_amount_term': 24,
 'credit_history': 0,
 'property_area': 'Rural', 
}

In [23]:
def predict_single(df, dv, model):
    X = dv.transform([applicant])
    y_pred = model.predict_proba(X)[:,1]
    return y_pred[0]

In [24]:
prediction = predict_single(applicant, dv, model)

In [25]:
## Applicant data prediction score
print(f'{prediction}')

0.9995145002243854


In [26]:
## Models's verdict
if prediction >= 0.5:
    print('verdict: Good standing - "Approved"')
else:
    print('verdict: Bad standing - "Rejected"')

verdict: Good standing - "Approved"
