# Authors

Addison Byers
Cameron Colaneri
Collin Campbell
Ethan Brown

# Home Loan Dataset

We chose this dataset because of its high usability score and small amount of missing data, and because we were interested in seeing what makes an applicant likely to receive a home loan.

In [2]:
# code block
# read data set
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Getting the data imported into pandas

# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("./data/loan_sanction_train.csv")

# Preview the first rows to sanity-check the columns that were read.
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# normalize/codify data set.
# Scale the data, but maybe make it a copy rather than overwrite the normalized data. KNN depends on the scaled features.

# Add a column saying whether or not the row had any missing value
# (1.0 = at least one NaN, 0.0 = complete row). This has to run before the
# fillna calls below, otherwise the information is gone.
# Computed in a single vectorized pass so it works for any number of rows
# instead of assuming the file has exactly 614 of them.
data["Missing_Data"] = data.isna().any(axis=1).astype(float)


# Cleaning the data #
# Fill the gaps column by column: categorical columns get a fixed default
# category, numeric loan columns get their median.
data = data.fillna({
    "Gender": "Male",
    "Married": "No",
    "Dependents": "0",
    "Self_Employed": "No",
    "LoanAmount": data["LoanAmount"].median(),
    "Loan_Amount_Term": data["Loan_Amount_Term"].median(),
    # Credit_History is numeric (0.0 / 1.0), so fill with a number. Filling
    # with the string "1" would leave a mixed str/float column.
    "Credit_History": 1.0,
})


# Remove the Loan_ID Column
data = data.drop(columns="Loan_ID")

# Frequency-encode the categorical columns: every category is replaced by the
# fraction of rows that belong to it.

# Two-level columns, given as column -> (reference level, other level).
# The reference level's share is measured; the other level gets 1 - share.
binary_levels = {
    "Gender": ("Male", "Female"),
    "Education": ("Graduate", "Not Graduate"),
    "Self_Employed": ("Yes", "No"),
    "Married": ("Yes", "No"),
}

for column, (reference, other) in binary_levels.items():
  share = data[column].value_counts(normalize=True)[reference]
  # No entry for missing values is needed: a string key such as "NaN" never
  # matches a real NaN, and the gaps in these columns are filled during
  # cleaning before this point.
  data[column] = data[column].map({reference: share, other: 1 - share})

# Property_Area has more than two levels, so map each level to its own share.
area_property = data["Property_Area"].value_counts(normalize=True)
data["Property_Area"] = data["Property_Area"].map(area_property)

# Scale the numeric columns by dividing each one by its largest absolute
# value, which puts every value in the range [-1, 1].
for numeric_column in ("ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"):
  peak = data[numeric_column].abs().max()
  data[numeric_column] = data[numeric_column] / peak

# Switching loan status from "Y"/"N" to 1/0
status_codes = {"Y": 1, "N": 0}
data["Loan_Status"] = data["Loan_Status"].map(status_codes)

# Cleaning the dependents: the column holds text labels ("3+" included),
# so convert them to numbers first and then scale like the columns above.
dependents_numeric = data["Dependents"].map({"0":0, "1":1, "2":2, "3+": 3})
data["Dependents"] = dependents_numeric / dependents_numeric.abs().max()

def ratio(a, b):
  """Return ``b`` divided by ``a``.

  Note the argument order: the first argument is the denominator and the
  second is the numerator.
  """
  quotient = b / a
  return quotient

# Engineered feature. ratio(a, b) returns b / a, so this is
# LoanAmount / ApplicantIncome for every row. Passing whole columns lets
# pandas do the division in one vectorized step instead of a per-row apply.
# NOTE(review): the column name reads as income-to-loan, but the value is
# loan-to-income -- confirm which one is intended before renaming or flipping.
# NOTE(review): a row with ApplicantIncome == 0 would produce inf here and
# break the scaling on the next line -- confirm the data has no such rows.
data["WealthToLoanRatio"] = ratio(data["ApplicantIncome"], data["LoanAmount"])

# Scale the new feature the same way as the other numeric columns.
data["WealthToLoanRatio"] /= data["WealthToLoanRatio"].abs().max()

# Split into model inputs (every column except the label) and the label.
data_train = data.drop(columns="Loan_Status")
data_target = data["Loan_Status"]

data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Missing_Data,WealthToLoanRatio
0,0.81759,0.351792,0.000000,0.781759,0.86645,0.072210,0.000000,0.182857,0.750,1.0,0.328990,1,1.0,0.024316
1,0.81759,0.648208,0.333333,0.781759,0.86645,0.056580,0.036192,0.182857,0.750,1.0,0.291531,0,0.0,0.031033
2,0.81759,0.648208,0.000000,0.781759,0.13355,0.037037,0.000000,0.094286,0.750,1.0,0.328990,1,0.0,0.024444
3,0.81759,0.648208,0.000000,0.218241,0.86645,0.031889,0.056592,0.171429,0.750,1.0,0.328990,1,0.0,0.051620
4,0.81759,0.351792,0.000000,0.781759,0.86645,0.074074,0.000000,0.201429,0.750,1.0,0.328990,1,0.0,0.026111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.18241,0.351792,0.000000,0.781759,0.86645,0.035802,0.000000,0.101429,0.750,1.0,0.291531,1,0.0,0.027203
610,0.81759,0.648208,1.000000,0.781759,0.86645,0.050691,0.000000,0.057143,0.375,1.0,0.291531,1,0.0,0.010824
611,0.81759,0.648208,0.333333,0.781759,0.86645,0.099654,0.005760,0.361429,0.750,1.0,0.328990,1,0.0,0.034825
612,0.81759,0.648208,0.666667,0.781759,0.86645,0.093617,0.000000,0.267143,0.750,1.0,0.328990,1,0.0,0.027400


# Why Scale the Data

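K-nearest neighbors (KNN) classifies an applicant by measuring the distance to other applicants, so a feature with a large numeric range (for example, ApplicantIncome in the thousands) would dominate a feature with a small range (for example, Credit_History, which is 0 or 1). Scaling every feature to a comparable range keeps any single column from dominating the distance calculation.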

In [None]:
# First Model

In [None]:
# Graphs

# Observations



This model uses logistic regression with a 10%/90% split.

In [None]:
# Second Model

In [None]:
# Graphs

# Observations



In [None]:
# Third Model

In [None]:
# Graphs

# Observations



In [None]:
# Fourth Model

In [None]:
# Graphs

# Observations

