# Import data

Data source:
https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')

# Examine data

In [3]:
df.shape

(614, 13)

In [4]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
df.describe(include='all')

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,614,601,611,599.0,614,582,614.0,614.0,592.0,600.0,564.0,614,614
unique,614,2,2,4.0,2,2,,,,,,3,2
top,LP001711,Male,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,1,489,398,345.0,480,500,,,,,,233,422
mean,,,,,,,5403.459283,1621.245798,146.412162,342.0,0.842199,,
std,,,,,,,6109.041673,2926.248369,85.587325,65.12041,0.364878,,
min,,,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,,,2877.5,0.0,100.0,360.0,1.0,,
50%,,,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,,,,,5795.0,2297.25,168.0,360.0,1.0,,


In [6]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
print(df['Loan_Amount_Term'].unique())

[360. 120. 240.  nan 180.  60. 300. 480.  36.  84.  12.]


In [9]:
print(df['Gender'].unique())
print(df['Married'].unique())
print(df['Education'].unique())
print(df['Self_Employed'].unique())
print(df['Property_Area'].unique())

['Male' 'Female' nan]
['No' 'Yes' nan]
['Graduate' 'Not Graduate']
['No' 'Yes' nan]
['Urban' 'Rural' 'Semiurban']


**Observations**
 - There is some missing data: Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, Credit_History
 - No irregular categorical values except for nulls.
 - Loan_ID is not needed for modeling since it is a unique id.

In [3]:
df_interpolate = df

In [4]:
# Interpolate data for categorical variables - greatest frequency
df_interpolate['Gender'].\
fillna(df_interpolate['Gender'].mode().iloc[0], inplace=True)

df_interpolate['Married'].\
fillna(df_interpolate['Married'].mode().iloc[0], inplace=True)

df_interpolate['Self_Employed'].\
fillna(df_interpolate['Self_Employed'].mode().iloc[0], inplace=True)

df_interpolate['Dependents'].\
fillna(df_interpolate['Dependents'].mode().iloc[0], inplace=True)

In [5]:
# Interpolate data for numeric variables - median
df_interpolate['LoanAmount'].\
fillna((df_interpolate['LoanAmount'].median()), inplace=True)

## Special case: Terms are fixed amounts (categorical)
df_interpolate['Loan_Amount_Term'].\
fillna((df_interpolate['Loan_Amount_Term'].mode().iloc[0]), 
       inplace=True)

df_interpolate['Credit_History'].\
fillna((df_interpolate['Credit_History'].median()), inplace=True)

In [6]:
df_interpolate.describe(include='all')

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,614,614,614,614.0,614,614,614.0,614.0,614.0,614.0,614.0,614,614
unique,614,2,2,4.0,2,2,,,,,,3,2
top,LP001639,Male,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,1,502,401,360.0,480,532,,,,,,233,422
mean,,,,,,,5403.459283,1621.245798,145.752443,342.410423,0.855049,,
std,,,,,,,6109.041673,2926.248369,84.107233,64.428629,0.352339,,
min,,,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,,,2877.5,0.0,100.25,360.0,1.0,,
50%,,,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,,,,,5795.0,2297.25,164.75,360.0,1.0,,


 - Missing values have been imputed using median (for continuous values) and the most frequent value (for categorical values)

# Feature Engineering

 - List of categorical features: Gender, Married, Education, Self_Employed, Property_Area

In [7]:
# Get dummies dataframe on categorical data
gender_dummies = pd.get_dummies(df.Gender)
print(gender_dummies.columns)
gender_dummies = gender_dummies.add_prefix('Gender_')
gender_dummies.drop('Gender_Male', axis=1, inplace=True)
gender_dummies.head()

Index(['Female', 'Male'], dtype='object')


Unnamed: 0,Gender_Female
0,0
1,0
2,0
3,0
4,0


In [8]:
marital_dummies = pd.get_dummies(df.Married)
print(marital_dummies.columns)
marital_dummies = marital_dummies.add_prefix('Married_')
marital_dummies.drop('Married_No', axis=1, inplace=True)
marital_dummies.head()

Index(['No', 'Yes'], dtype='object')


Unnamed: 0,Married_Yes
0,0
1,1
2,1
3,1
4,0


In [9]:
education_dummies = pd.get_dummies(df.Education)
print(education_dummies.columns)
education_dummies = education_dummies.add_prefix('Education_')
education_dummies.drop('Education_Not Graduate', axis=1, inplace=True)
education_dummies.head()

Index(['Graduate', 'Not Graduate'], dtype='object')


Unnamed: 0,Education_Graduate
0,1
1,1
2,1
3,0
4,1


In [10]:
self_employed_dummies = pd.get_dummies(df.Self_Employed)
print(self_employed_dummies.columns)
self_employed_dummies = self_employed_dummies.\
add_prefix('Self_Employed_')
self_employed_dummies.drop('Self_Employed_No', axis=1, inplace=True)
self_employed_dummies.head()

Index(['No', 'Yes'], dtype='object')


Unnamed: 0,Self_Employed_Yes
0,0
1,0
2,1
3,0
4,0


In [11]:
property_area_dummies = pd.get_dummies(df.Property_Area)
print(property_area_dummies.columns)
property_area_dummies = property_area_dummies\
.add_prefix('Property_Area_')
property_area_dummies.drop('Property_Area_Semiurban', 
                           axis=1, inplace=True)
property_area_dummies.head()

Index(['Rural', 'Semiurban', 'Urban'], dtype='object')


Unnamed: 0,Property_Area_Rural,Property_Area_Urban
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1


In [12]:
dependents_dummies = pd.get_dummies(df.Dependents)
print(dependents_dummies.columns)
dependents_dummies = dependents_dummies\
.add_prefix('Dependents_')
dependents_dummies.drop('Dependents_3+', 
                        axis=1, inplace=True)
dependents_dummies.head()

Index(['0', '1', '2', '3+'], dtype='object')


Unnamed: 0,Dependents_0,Dependents_1,Dependents_2
0,1,0,0
1,0,1,0
2,1,0,0
3,1,0,0
4,1,0,0


In [13]:
# Combined DF of all categorical cols
categorical_cols = pd.concat([gender_dummies, marital_dummies,
                             education_dummies, self_employed_dummies,
                             property_area_dummies, dependents_dummies],
                            axis=1)
print(categorical_cols.shape)
categorical_cols.head()

(614, 9)


Unnamed: 0,Gender_Female,Married_Yes,Education_Graduate,Self_Employed_Yes,Property_Area_Rural,Property_Area_Urban,Dependents_0,Dependents_1,Dependents_2
0,0,0,1,0,0,1,1,0,0
1,0,1,1,0,1,0,0,1,0
2,0,1,1,1,0,1,1,0,0
3,0,1,0,0,0,1,1,0,0
4,0,0,1,0,0,1,1,0,0


In [14]:
df_fnl = pd.concat([df_interpolate, categorical_cols], axis=1)
print(df_fnl.shape)
df_fnl.head()

(614, 22)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Loan_Status,Gender_Female,Married_Yes,Education_Graduate,Self_Employed_Yes,Property_Area_Rural,Property_Area_Urban,Dependents_0,Dependents_1,Dependents_2
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,...,Y,0,0,1,0,0,1,1,0,0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,...,N,0,1,1,0,1,0,0,1,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,...,Y,0,1,1,1,0,1,1,0,0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,...,Y,0,1,0,0,0,1,1,0,0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,...,Y,0,0,1,0,0,1,1,0,0


In [15]:
print(df_fnl.dtypes)
print(df_fnl.columns)

Loan_ID                 object
Gender                  object
Married                 object
Dependents              object
Education               object
Self_Employed           object
ApplicantIncome          int64
CoapplicantIncome      float64
LoanAmount             float64
Loan_Amount_Term       float64
Credit_History         float64
Property_Area           object
Loan_Status             object
Gender_Female            uint8
Married_Yes              uint8
Education_Graduate       uint8
Self_Employed_Yes        uint8
Property_Area_Rural      uint8
Property_Area_Urban      uint8
Dependents_0             uint8
Dependents_1             uint8
Dependents_2             uint8
dtype: object
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Gender_Female', 'Married_Yes', 'Education_Graduate',
       'Self_Employed_

In [16]:
X = df_fnl[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History',
       'Gender_Female', 'Married_Yes', 'Education_Graduate',
       'Self_Employed_Yes', 'Property_Area_Rural', 'Property_Area_Urban',
       'Dependents_0', 'Dependents_1', 'Dependents_2' ]]
X.dtypes

ApplicantIncome          int64
CoapplicantIncome      float64
LoanAmount             float64
Loan_Amount_Term       float64
Credit_History         float64
Gender_Female            uint8
Married_Yes              uint8
Education_Graduate       uint8
Self_Employed_Yes        uint8
Property_Area_Rural      uint8
Property_Area_Urban      uint8
Dependents_0             uint8
Dependents_1             uint8
Dependents_2             uint8
dtype: object

In [17]:
# Numpy format for kera models

X = X.astype('float').as_matrix()
y = df['Loan_Status'].apply(lambda row: 1 if row == 'Y' else 0).as_matrix()

In [24]:
X.shape

(614, 14)

# Model creation

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.15)

In [19]:
# import panda, keras and tensorflow
import tensorflow as tf
import keras
from keras import models, layers

Using TensorFlow backend.


In [20]:
# Use a custom metricfunction
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc

In [39]:
import keras
from keras.models import Sequential
from keras import models, layers
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(layers.Dense(4, activation='relu', input_shape=(14,)))
#model.add(layers.Dropout(0.02))
model.add(layers.Dense(2, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', auc])


callbacks = [EarlyStopping(monitor='acc', patience=100)]
model.fit(X_train, y_train, epochs=1000,
          callbacks = callbacks,
          validation_split = 0.1, batch_size=250)


Train on 468 samples, validate on 53 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/100

Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000


<keras.callbacks.History at 0x1735fcbb908>

# Save model

In [41]:
# Save the model in h5 format
model.save("../../bank-loan-keras-nn.h5")

In [42]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_37 (Dense)             (None, 4)                 60        
_________________________________________________________________
dense_38 (Dense)             (None, 2)                 10        
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 3         
Total params: 73
Trainable params: 73
Non-trainable params: 0
_________________________________________________________________


# Evaluation

In [30]:
y_pred = model.predict(X_test)

In [31]:
y_pred[y_pred <= 0.5] = 0.
y_pred[y_pred > 0.5] = 1.

In [32]:
y_pred.tolist()

[[1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0],
 [1.0]]

# Outcome data

In [None]:
#https://towardsdatascience.com/deploying-keras-deep-learning-models-with-flask-5da4181436a2

In [46]:
(df.head(1))

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y


In [47]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [48]:
input_df = pd.DataFrame()

col_names = ['Gender', 'Married', 'Dependents', 'Education',
             'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 
             'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 
             'Property_Area']
input_vec = ['Male', 'No', '0', 'Graduate', 
             'No', 5849, 0.0, 
             128.0, 360.0, 1.0, 
             'Urban']

input_df = pd.DataFrame(input_vec, col_names)

input_df = input_df.transpose()

input_df.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0,128,360,1,Urban


In [49]:
gender_list = ['Gender_Female'] 
for gender in gender_list:
    input_df[gender] = 0

In [50]:
marital_list = ['Married_Yes']
for marital in marital_list:
    input_df[marital] = 0

In [51]:
edu_list = ['Education_Graduate']
for edu in edu_list:
    input_df[edu] = 0

In [52]:
self_employed_list = ['Self_Employed_Yes']
for self_employed in self_employed_list:
    input_df[self_employed] = 0

In [53]:
property_area_list = ['Property_Area_Rural', 'Property_Area_Urban']
for property_area in property_area_list:
    input_df[property_area] = 0

In [54]:
dep_list = [ 'Dependents_0', 'Dependents_1', 'Dependents_2']
for dep in dep_list:
    input_df[dep] = 0

In [55]:
input_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Female',
       'Married_Yes', 'Education_Graduate', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Urban', 'Dependents_0',
       'Dependents_1', 'Dependents_2'],
      dtype='object')

In [56]:
def determine_gender(input_df):
    if input_df['Gender'][0]=="Female":
        input_df['Gender_Female'] = 1


In [57]:
def determine_married(input_df):
    if input_df['Married'][0]=="Yes":
        input_df['Married_Yes'] = 1 


In [58]:
def determine_edu_cat(input_df):
    if input_df['Education'][0]=='Graduate':
        input_df['Education_Graduate'] = 1


In [59]:
def determine_self_emp(input_df):
    if input_df['Self_Employed'][0]=="Yes":
        input_df['Self_Employed_Yes'] = 1 


In [60]:
#'Property_Area_Rural', 'Property_Area_Urban'
def determine_prop_area(input_df):
    if input_df['Property_Area'][0]=="Rural":
        input_df['Property_Area_Rural'] = 1 
    elif input_df['Property_Area'][0]=="Urban":
        input_df['Property_Area_Urban'] = 1 


In [61]:
# 'Dependents_0', 'Dependents_1', 'Dependents_2'
def determine_dependent(input_df):
    if input_df['Dependents'][0]=="0":
        input_df['Dependents_0'] = 1 
    elif input_df['Dependents'][0]=="1":
        input_df['Dependents_1'] = 1 
    elif input_df['Dependents'][0]=="2":
        input_df['Dependents_2'] = 1 

In [62]:
determine_gender(input_df)
determine_married(input_df)
determine_edu_cat(input_df)
determine_self_emp(input_df)
determine_prop_area(input_df)
determine_dependent(input_df)

In [63]:
input_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Gender_Female',
       'Married_Yes', 'Education_Graduate', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Urban', 'Dependents_0',
       'Dependents_1', 'Dependents_2'],
      dtype='object')

In [64]:
input_df = input_df[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Married_Yes',
       'Education_Graduate', 'Self_Employed_Yes', 'Property_Area_Rural',
       'Property_Area_Urban', 'Dependents_0', 'Dependents_1', 'Dependents_2']]

In [65]:
input_df = input_df.as_matrix()

In [66]:
y_pred.tolist()[0][0]

1.0

In [67]:
def convert_to_yn(input_char):
    if input_char == 0:
        return 'N'
    else:
        return 'Y'

In [68]:
convert_to_yn(y_pred.tolist()[0][0])

'Y'