<h1>Modelling for Loan Eligibility Prediction</h1>

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, recall_score, precision_score
from sklearn.utils import resample

<h3>Importing Data</h3>

In [6]:
# Loading the dataset into the notebook
data = pd.read_csv('../data/selected_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Married_Encoded,Education_Encoded,Credit_History_Encoded,Property_Area_Encoded,graduatedMarriedMale_Encoded,GME_Encoded,selfemp_Proparea_Encoded,Loan_Status
0,359,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1
1,127,0.622754,0.71875,0.794271,0.615942,0.610656,0.607843,0.610619,1
2,228,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1
3,0,0.622754,0.71875,0.794271,0.671533,0.610656,0.607843,0.69863,1
4,482,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1


In [7]:
test_data = pd.read_csv('../data/test_data.csv')

In [8]:
test_data

Unnamed: 0.1,Unnamed: 0,Married_Encoded,Education_Encoded,Credit_History_Encoded,Property_Area_Encoded,graduatedMarriedMale_Encoded,GME_Encoded,selfemp_Proparea_Encoded,Loan_Status
0,586,0.728873,0.718750,0.794271,0.671533,0.782609,0.782609,0.698630,1
1,570,0.728873,0.718750,0.794271,0.671533,0.782609,0.782609,0.698630,1
2,253,0.728873,0.585859,0.794271,0.761364,0.610656,0.607843,0.738636,1
3,571,0.728873,0.718750,0.089552,0.671533,0.782609,0.782609,0.698630,0
4,528,0.622754,0.585859,0.794271,0.761364,0.610656,0.594937,0.738636,1
...,...,...,...,...,...,...,...,...,...
108,208,0.622754,0.718750,0.794271,0.671533,0.610656,0.607843,0.698630,1
109,96,0.728873,0.718750,0.794271,0.761364,0.610656,0.607843,0.738636,1
110,569,0.728873,0.718750,0.089552,0.671533,0.782609,0.782609,0.698630,0
111,402,0.622754,0.718750,0.794271,0.761364,0.610656,0.607843,0.738636,1


In [9]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
test_data.drop('Unnamed: 0', axis=1, inplace=True)

In [11]:
data

Unnamed: 0,Married_Encoded,Education_Encoded,Credit_History_Encoded,Property_Area_Encoded,graduatedMarriedMale_Encoded,GME_Encoded,selfemp_Proparea_Encoded,Loan_Status
0,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1
1,0.622754,0.71875,0.794271,0.615942,0.610656,0.607843,0.610619,1
2,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1
3,0.622754,0.71875,0.794271,0.671533,0.610656,0.607843,0.698630,1
4,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1
...,...,...,...,...,...,...,...,...
446,0.622754,0.71875,0.794271,0.671533,0.610656,0.607843,0.698630,0
447,0.622754,0.71875,0.794271,0.761364,0.610656,0.607843,0.738636,1
448,0.728873,0.71875,0.794271,0.671533,0.782609,0.782609,0.698630,1
449,0.728873,0.71875,0.794271,0.761364,0.782609,0.782609,0.738636,1


In [12]:
test_data

Unnamed: 0,Married_Encoded,Education_Encoded,Credit_History_Encoded,Property_Area_Encoded,graduatedMarriedMale_Encoded,GME_Encoded,selfemp_Proparea_Encoded,Loan_Status
0,0.728873,0.718750,0.794271,0.671533,0.782609,0.782609,0.698630,1
1,0.728873,0.718750,0.794271,0.671533,0.782609,0.782609,0.698630,1
2,0.728873,0.585859,0.794271,0.761364,0.610656,0.607843,0.738636,1
3,0.728873,0.718750,0.089552,0.671533,0.782609,0.782609,0.698630,0
4,0.622754,0.585859,0.794271,0.761364,0.610656,0.594937,0.738636,1
...,...,...,...,...,...,...,...,...
108,0.622754,0.718750,0.794271,0.671533,0.610656,0.607843,0.698630,1
109,0.728873,0.718750,0.794271,0.761364,0.610656,0.607843,0.738636,1
110,0.728873,0.718750,0.089552,0.671533,0.782609,0.782609,0.698630,0
111,0.622754,0.718750,0.794271,0.761364,0.610656,0.607843,0.738636,1


In [13]:
def splitData(data):
    x_train = data.drop('Loan_Status',axis=1)
    y_train = data[['Loan_Status']]

    x_test = test_data.drop('Loan_Status',axis=1)
    y_test = test_data[['Loan_Status']]
    return [x_train, x_test, y_train, y_test]

In [14]:
def standardizeTransform(x_train, x_test, y_train, y_test):
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    y_train_log = np.log(1+y_train)
    y_test_log = np.log(1+y_test)
    return [x_train_scaled, x_test_scaled, y_train_log, y_test_log]

In [15]:
x_train, x_test, y_train, y_test = splitData(data)

In [16]:
# Fit Model and Evaluate
def fitPredict(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)

    y_hat = model.predict(x_test)

    #print("Train R2 score: "+ str(model.score(x_train,y_train)))
    print("Test F1 Score: "+ str(f1_score(y_test,y_hat)))
    print("Test Precision: "+ str(precision_score(y_test,y_hat)))
    print("Test Recall: "+ str(recall_score(y_test,y_hat)))

    return model

In [17]:
# Logistic Model Test set prediction and Evaluation
print('Count Vectorizer Test Set Results:')
lm = fitPredict(x_train, y_train, x_test, y_test, LogisticRegression())

Count Vectorizer Test Set Results:
Test F1 Score: 0.8848484848484848
Test Precision: 0.8021978021978022
Test Recall: 0.9864864864864865


  y = column_or_1d(y, warn=True)


<h3>Upsampling</h3>

In [18]:
# Upsampling the minority dataset
minority = data[data['Loan_Status'] == 0]
majority = data[data['Loan_Status'] == 1]

In [19]:
minority.shape

(140, 8)

In [20]:
majority.shape

(311, 8)

In [21]:
upsampled = resample(minority, replace=True, n_samples=300, random_state=42)

In [28]:
updata = pd.concat([majority, upsampled])

In [29]:
x_train_up, x_test_up, y_train_up, y_test_up = splitData(updata)

In [32]:
updata['Loan_Status'].value_counts()

1    311
0    300
Name: Loan_Status, dtype: int64

In [30]:
# Logistic Model Test set prediction and Evaluation
print('Count Vectorizer Test Set Results:')
lm = fitPredict(x_train_up, y_train_up, x_test_up, y_test_up, LogisticRegression())

Count Vectorizer Test Set Results:
Test F1 Score: 0.8848484848484848
Test Precision: 0.8021978021978022
Test Recall: 0.9864864864864865


  y = column_or_1d(y, warn=True)


<h3>Downsampling</h3>

In [33]:
downsampled = resample(majority, replace=True, n_samples=140, random_state=42)

In [34]:
downData = pd.concat([downsampled, minority])

In [35]:
downData['Loan_Status'].value_counts()

1    140
0    140
Name: Loan_Status, dtype: int64

In [36]:
x_train_down, x_test_down, y_train_down, y_test_down = splitData(downData)

In [37]:
# Logistic Model Test set prediction and Evaluation
print('Count Vectorizer Test Set Results:')
down_lm = fitPredict(x_train_down, y_train_down, x_test_down, y_test_down, LogisticRegression())

Count Vectorizer Test Set Results:
Test F1 Score: 0.8848484848484848
Test Precision: 0.8021978021978022
Test Recall: 0.9864864864864865


  y = column_or_1d(y, warn=True)


It would seem upsampling or downsampling does not make a difference for this dataset.