# Deployment

In [1]:
#Standard libraries for data analysis:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm, skew
from scipy import stats
import statsmodels.api as sm

# sklearn modules for data preprocessing:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#sklearn modules for Model Selection:
from sklearn import svm, tree, linear_model, neighbors
from sklearn import naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

#sklearn modules for Model Evaluation & Improvement:    
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import make_scorer, recall_score, log_loss
from sklearn.metrics import average_precision_score

#Standard libraries for data visualization:
import seaborn as sn
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib 
%matplotlib inline
color = sn.color_palette()
import matplotlib.ticker as mtick
from IPython.display import display
pd.options.display.max_columns = None
from pandas.plotting import scatter_matrix
from sklearn.metrics import roc_curve

#Miscellaneous Utility Libraries:
import random
import os
import re
import sys
import timeit
import string
import time
from datetime import datetime
from time import time
from dateutil.parser import parse
import joblib

### Data Cleaning

Finally, we applied the best model obtained to make predictions, using the attached predict.csv as the test set and the information from loan.csv as the training set. We created a new Jupyter Notebook that cleans the predict.csv dataset by imputing missing values, label-encoding and one-hot encoding the categorical data, rescaling the continuous values to the [0, 1] range, and selecting the essential features.

#### Download Data

In [2]:
# Load the applicants to score. Loan_ID is removed from the working frame, and
# Credit_History is cast to object so that it lands with the categorical
# columns when the frame is later split by dtype.
data = (
    pd.read_csv('predict.csv')
    .drop(columns='Loan_ID')
    .astype({'Credit_History': object})
)


#### Missing data

In [3]:
# Split the frame into object-typed (categorical) and all remaining (numeric)
# columns so each group can be imputed with its own strategy.
data_cat, data_num = (
    data.select_dtypes(include='object'),
    data.select_dtypes(exclude='object'),
)

In [4]:
# Percentage of missing values in each categorical column.
data_cat.isnull().sum().div(len(data_cat)).mul(100).to_frame("NA")

Unnamed: 0,NA
Gender,2.997275
Married,0.0
Dependents,2.724796
Education,0.0
Self_Employed,6.26703
Credit_History,7.901907
Property_Area,0.0


*From here we have 4 categorical variables with missing data.*

In [5]:
# Percentage of missing values in each numeric column.
data_num.isnull().sum().div(len(data_num)).mul(100).to_frame("NA")

Unnamed: 0,NA
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,1.362398
Loan_Amount_Term,1.634877


*From here we have 2 numerical variables with missing data.*

In [6]:
# Fill the gaps in the categorical columns with each column's most frequent
# value. SimpleImputer is already imported in the notebook's import cell.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data_cat_new = pd.DataFrame(imp.fit_transform(data_cat), columns=data_cat.columns)

# Keep a separate copy of the imputed categorical values.
df_cat = data_cat_new.copy()

In [7]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Fill missing numeric values from the 2 nearest rows, weighted uniformly.
imp = KNNImputer(n_neighbors=2, weights="uniform")
data_num_new_1 = pd.DataFrame(imp.fit_transform(data_num), columns=data_num.columns)

# Rescale every numeric column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the prediction data itself. If the
# training features were scaled with a scaler fitted on the training data, the
# two sets are not on the same scale -- confirm, and if so reuse the persisted
# training scaler here instead of refitting.
sc_X = MinMaxScaler()
data_num_new = pd.DataFrame(
    sc_X.fit_transform(data_num_new_1),
    columns=data_num_new_1.columns,
)

# Keep a separate copy of the imputed and scaled numeric values.
df_num = data_num_new.copy()

In [8]:
# Re-check the categorical columns after imputation: every share should be 0.
data_cat_new.isnull().sum().div(len(data_cat_new)).mul(100).to_frame("NA")

Unnamed: 0,NA
Gender,0.0
Married,0.0
Dependents,0.0
Education,0.0
Self_Employed,0.0
Credit_History,0.0
Property_Area,0.0


In [9]:
# Re-check the numeric columns after imputation: every share should be 0.
data_num_new.isnull().sum().div(len(data_num_new)).mul(100).to_frame("NA")

Unnamed: 0,NA
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,0.0
Loan_Amount_Term,0.0


#### Label encoding and One hot encoding

In [10]:
# Count the distinct values in each imputed categorical column. The columns
# with more than two levels (Dependents, Property_Area) are the ones that get
# one-hot encoded further down.
data_cat_new.nunique()

Gender            2
Married           2
Dependents        4
Education         2
Self_Employed     2
Credit_History    2
Property_Area     3
dtype: int64

In [11]:
# Replace every categorical column with integer codes. The encoder is refitted
# per column, so codes follow the sorted order of the values in that column.
# Assigning by column name replaces the column outright, so the result has an
# integer dtype no matter how the installed pandas handles `.loc[:, col] = ...`.
# NOTE(review): the codes are derived from predict.csv alone; they only line up
# with the training data if both files contain the same set of categories --
# confirm against the training notebook.
le = LabelEncoder()
for col in data_cat_new.columns:
    data_cat_new[col] = le.fit_transform(data_cat_new[col])

In [12]:
# One-hot encode the two categorical columns that have more than two levels.
enc = OneHotEncoder(handle_unknown='ignore')

# Indicator columns are named <prefix>_<position>, whatever the number of
# categories present. NOTE: the 'Dependendents_' spelling is kept on purpose so
# the generated column names stay exactly as they were.
enc_dep = pd.DataFrame(enc.fit_transform(data_cat_new[['Dependents']]).toarray())
enc_dep = enc_dep.rename(columns='Dependendents_{}'.format)

enc_prop = pd.DataFrame(enc.fit_transform(data_cat_new[['Property_Area']]).toarray())
enc_prop = enc_prop.rename(columns='Property_Area_{}'.format)

# Attach the indicator columns by index, then drop the two source columns they
# replace. (The unused StandardScaler that used to be created here has been
# removed; it overwrote the fitted MinMaxScaler held in sc_X.)
data_cat_new = data_cat_new.join([enc_dep, enc_prop])
data_cat_new = data_cat_new.drop(columns=['Dependents', 'Property_Area'])

In [13]:
# Combine the encoded categorical features with the scaled numeric ones, then
# keep only the six columns that are passed to the model.
final_data = data_cat_new.join([data_num_new])
modeling_dataset = final_data.loc[
    :,
    [
        'Married', 'Credit_History', 'Property_Area_1',
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
    ],
]

X_test = modeling_dataset


#### Train and test dataset

In [14]:
# Load the prepared training set and drop the sheet's first column by position.
# NOTE(review): presumably that column is a saved row index -- confirm.
train = pd.read_excel('Training Data.xlsx').iloc[:, 1:]

In [15]:
# Features are every column except the target; the target is binarised at 0.5
# into a 0/1 NumPy array.
X_train = train.drop(columns="Loan_Status")
y_train = np.where(train["Loan_Status"] > 0.5, 1, 0)

In [16]:
# Sanity-check the shapes before fitting: the training features and labels must
# have the same number of rows, and the data to score must have the same number
# of feature columns as the training data.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train row counts differ"
assert X_train.shape[1] == X_test.shape[1], "X_train and X_test feature counts differ"

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)

Number transactions X_train dataset:  (844, 6)
Number transactions y_train dataset:  (844,)
Number transactions X_test dataset:  (367, 6)


# Applying Random Forest

The predictions for each user are:

In [17]:
# Helper that fits the random forest classifier

def train_using_RF(X_train, X_test, y_train):
    """Fit a random forest on the training data and return the fitted model.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    X_test : not used by this function; kept so existing calls keep working.
    y_train : training labels passed to ``fit``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` (45 trees, max depth 13,
    ``min_samples_leaf=1``, ``random_state=0``).
    """
    forest = RandomForestClassifier(
        n_estimators=45,
        max_depth=13,
        min_samples_leaf=1,
        random_state=0,
    )
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return forest.fit(X_train, y_train)

def prediction(X_test, clf_object):
    """Predict labels for ``X_test`` with a fitted classifier.

    The predictions are printed and then returned unchanged.

    Parameters
    ----------
    X_test : feature matrix handed to ``clf_object.predict``.
    clf_object : any fitted object exposing a ``predict`` method.

    Returns
    -------
    Whatever ``clf_object.predict(X_test)`` returns.
    """
    predicted_labels = clf_object.predict(X_test)
    print(predicted_labels)
    return predicted_labels

# Fit the forest on the training data, then score the prepared applicants.
clf = train_using_RF(X_train=X_train, X_test=X_test, y_train=y_train)

print("##################### Results Using Random Forest #####################")
y_pred = prediction(X_test=X_test, clf_object=clf)


##################### Results Using Random Forest #####################
[1 1 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0
 1 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0
 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0
 1 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0
 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0]


#### Download results

In [18]:
# Pair every applicant's Loan_ID with its predicted status. predict.csv is read
# again because Loan_ID was removed from the working frame at the start.
data = pd.read_csv('predict.csv')
model_results = pd.DataFrame({'ID': data['Loan_ID'], 'Loan_Status': y_pred})

In [19]:
# Write the predictions to an Excel workbook. to_excel is called with its
# defaults, so the DataFrame index is saved as an extra leading column.
model_results.to_excel('Model Predictions.xlsx')
# Show the predictions table as the cell output.
model_results

Unnamed: 0,ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,0
...,...,...
362,LP002971,1
363,LP002975,1
364,LP002980,1
365,LP002986,1


## Conclusions

In conclusion, we used the Dream Housing Finance (DHF) loan dataset to build a machine learning classifier that automates the loan eligibility process. This model attained a ROC/AUC score of **[TODO: insert value — the score is missing from this sentence]**. Additionally, this analysis suggests that the customer segments DHF should target are applicants who are married and are looking for a property in a suburban area. This could mean that they are planning to grow a family and are therefore more likely to be responsible about avoiding debt. Furthermore, these applicants and their co-applicants should have a high income. If DHF targets people with these characteristics, it can be more confident that customers will be capable of paying back their loans; thus, DHF will be more secure in lending them larger amounts of money. Finally, and most importantly, DHF should ensure that the applicant has a credit history, because applicants who have repaid their previous debts have a significantly higher probability of repaying this one.