In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./data/loan_sanction_train.csv")

print(type(data))

data.head()

# Cleaning the data #
data["Gender"] = data["Gender"].fillna("Male")
data["Married"] = data["Married"].fillna("No")
data["Dependents"] = data["Dependents"].fillna("0")
data["Self_Employed"] = data["Self_Employed"].fillna("No")
data["LoanAmount"] = data["LoanAmount"].fillna(data["LoanAmount"].mean())
data["Loan_Amount_Term"] = data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"].mean())
data["Credit_History"] = data["Credit_History"].fillna("1")


# Remove the Loan_ID Column
data = data.drop(columns="Loan_ID")

# making gender a normalization
male = data['Gender'].value_counts(normalize=True)['Male']

data_gender_normalized = data['Gender'].map({'Male': male, 'Female': 1 - male, "NaN": male})

data['Gender'] = data_gender_normalized

# normalizing education
education = data['Education'].value_counts(normalize=True)["Graduate"]

data_education_normalized = data['Education'].map({'Graduate': education, 'Not Graduate': 1 - education})

data['Education'] = data_education_normalized

# normalizing self_employed
self_employed = data['Self_Employed'].value_counts(normalize=True)["Yes"]

data_selfEmp_normalized = data['Self_Employed'].map({"Yes": self_employed, "No": 1-self_employed})

data["Self_Employed"] = data_selfEmp_normalized

# normalzing property area
area_property = data["Property_Area"].value_counts(normalize=True)

print(area_property)

data_area_normalized = data["Property_Area"].map({"Urban": area_property["Urban"], "Semiurban": area_property['Semiurban'], "Rural": area_property["Rural"]})

data["Property_Area"] = data_area_normalized

# normalzing married 
married = data["Married"].value_counts(normalize=True)["Yes"]

print(married)

married_normalized = data["Married"].map({"Yes": married, "No": 1-married})

data["Married"] = married_normalized

# Switchin loan status to 0 or 1
data["Loan_Status"] = data["Loan_Status"].map({"Y": 1, "N": 0})

# Cleaning the dependents
data["Dependents"] = data["Dependents"].map({"0":0, "1":1, "2":2, "3+": 3})

print(data.isna().sum())

data_train = data.loc[:, data.columns != "Loan_Status"]
data_target = data["Loan_Status"]


<class 'pandas.core.frame.DataFrame'>
Semiurban    0.379479
Urban        0.328990
Rural        0.291531
Name: Property_Area, dtype: float64
0.6482084690553745
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [2]:
def ratio(a, b):
  return a/b

data['WealthToLoanRatio'] = data.apply(lambda row : ratio(row['ApplicantIncome'], row['LoanAmount']), axis = 1)

In [3]:
X = data_train.to_numpy()
y = data_target.to_numpy()

X

array([[0.8175895765472313, 0.35179153094462545, 0, ..., 360.0, 1.0,
        0.3289902280130293],
       [0.8175895765472313, 0.6482084690553745, 1, ..., 360.0, 1.0,
        0.2915309446254072],
       [0.8175895765472313, 0.6482084690553745, 0, ..., 360.0, 1.0,
        0.3289902280130293],
       ...,
       [0.8175895765472313, 0.6482084690553745, 1, ..., 360.0, 1.0,
        0.3289902280130293],
       [0.8175895765472313, 0.6482084690553745, 2, ..., 360.0, 1.0,
        0.3289902280130293],
       [0.1824104234527687, 0.35179153094462545, 0, ..., 360.0, 0.0,
        0.3794788273615635]], dtype=object)

In [9]:
data_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.81759,0.351792,0,0.781759,0.86645,5849,0.0,146.412162,360.0,1.0,0.32899
1,0.81759,0.648208,1,0.781759,0.86645,4583,1508.0,128.0,360.0,1.0,0.291531
2,0.81759,0.648208,0,0.781759,0.13355,3000,0.0,66.0,360.0,1.0,0.32899
3,0.81759,0.648208,0,0.218241,0.86645,2583,2358.0,120.0,360.0,1.0,0.32899
4,0.81759,0.351792,0,0.781759,0.86645,6000,0.0,141.0,360.0,1.0,0.32899


In [16]:
data.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,WealthToLoanRatio
0,0.81759,0.351792,0,0.781759,0.86645,5849,0.0,146.412162,360.0,1.0,0.32899,1,39.948867
1,0.81759,0.648208,1,0.781759,0.86645,4583,1508.0,128.0,360.0,1.0,0.291531,0,35.804688
2,0.81759,0.648208,0,0.781759,0.13355,3000,0.0,66.0,360.0,1.0,0.32899,1,45.454545
3,0.81759,0.648208,0,0.218241,0.86645,2583,2358.0,120.0,360.0,1.0,0.32899,1,21.525
4,0.81759,0.351792,0,0.781759,0.86645,6000,0.0,141.0,360.0,1.0,0.32899,1,42.553191
5,0.81759,0.648208,2,0.781759,0.13355,5417,4196.0,267.0,360.0,1.0,0.32899,1,20.28839
6,0.81759,0.648208,0,0.218241,0.86645,2333,1516.0,95.0,360.0,1.0,0.32899,1,24.557895
7,0.81759,0.648208,3,0.781759,0.86645,3036,2504.0,158.0,360.0,0.0,0.379479,0,19.21519
8,0.81759,0.648208,2,0.781759,0.86645,4006,1526.0,168.0,360.0,1.0,0.32899,1,23.845238
9,0.81759,0.648208,1,0.781759,0.86645,12841,10968.0,349.0,360.0,1.0,0.379479,0,36.793696


In [4]:
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X, y)

# This returns R^2, or, how well this model fits to the data
r_sq = model.score(X, y)

print(r_sq)



0.3226836027339237


In [6]:
model.predict(np.array([[0.81, 0.35, 0, 0.781, 0.86, 5849, 0, 146, 360, 1.0, .32]]))

array([0.73297432])

In [8]:
model.predict(np.array([[0.81, 0.64, 1, 0.78, 0.86, 4583, 1508, 128, 360, 1, 0.29]]))

array([0.76397377])

In [22]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_predict

from numpy import mean
from numpy import absolute
from numpy import sqrt

cv = LeaveOneOut()

scores = cross_val_predict(model, data_train, data_target, cv=cv, n_jobs=-1)

predict_true = 0
for i in range(len(scores)):
    if scores[i] > 0.5 and data.iloc[i]["Loan_Status"] == 1:
        predict_true += 1


print(f"Accuracy was: {predict_true/len(scores)}")

Accuracy was: 0.6758957654723127
