In [1]:
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("loan_train.csv")

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
df.shape

(614, 13)

In [7]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
# check percentage of missing values
df.isnull().sum()*100/len (df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [9]:
#drop missing values that are less than 5%
columns =["Gender", "Dependents","LoanAmount", "Loan_Amount_Term"]
df = df.dropna (subset = columns)
df.isnull().sum()*100/len (df)


Loan_ID              0.000000
Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [10]:
#Fill missing values that are above 5%
df["Self_Employed"]=df["Self_Employed"].fillna (df["Self_Employed"].mode()[0])
df["Credit_History"]=df["Credit_History"].fillna (df["Credit_History"].mode()[0])

In [11]:
df.isnull().sum()*100/len (df)

Loan_ID              0.0
Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

In [12]:
df["Dependents"]

1       1
2       0
3       0
4       0
5       2
       ..
609     0
610    3+
611     1
612     2
613     0
Name: Dependents, Length: 553, dtype: object

In [13]:
# we cannot work with 3+ so we will change it to 4
df["Dependents"] = df ["Dependents"]. replace (to_replace = "3+",value = 4)
df["Dependents"].unique

<bound method Series.unique of 1      1
2      0
3      0
4      0
5      2
      ..
609    0
610    4
611    1
612    2
613    0
Name: Dependents, Length: 553, dtype: object>

# Mapping (Manual Encoding)

In [14]:
df["Loan_Status"].unique()

array(['N', 'Y'], dtype=object)

In [15]:
df["Gender"] = df ["Gender"].map({"Male": 1, "Female": 0}).astype ("int")
df["Married"] = df ["Married"].map({"Yes": 1, "No": 0}).astype ("int")
df["Education"] = df ["Education"].map({"Graduate": 1, "Not Graduate": 0}).astype ("int")
df["Self_Employed"] = df ["Self_Employed"].map({"Yes": 1, "No": 0}).astype ("int")
df["Property_Area"] = df ["Property_Area"].map({"Rural": 0, "Urban": 1, "Semiurban": 2}).astype ("int")
df["Loan_Status"] = df ["Loan_Status"].map({"Y": 1, "N": 0}).astype ("int")

In [16]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


# Scalling

In [17]:
x = df.drop (columns = ["Loan_ID", "Loan_Status"])
y = df ["Loan_Status"]

In [18]:
# List the columns we want to scale
cols = ["ApplicantIncome", "CoapplicantIncome","LoanAmount","Loan_Amount_Term"]

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler ()
x [cols] = scaler.fit_transform (x[cols])

In [20]:
x.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,1
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,1
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,1
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,1


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [22]:
model_df = {}
def model_val (model, x, y):
    x_train, x_test, y_train, y_test = train_test_split (x,y,test_size =0.20, stratify = y, random_state = 42 )
    model.fit(x_train, y_train)
    y_pred = model.predict (x_test)
    print (f"{model} accuracy is {accuracy_score (y_test, y_pred)}")
    score = cross_val_score (model,x,y,cv = 5)
    print (f"{model}Average Cross Val Score is {np.mean(score)}")
    model_df [model] = round (np.mean (score)* 100,2)

## LOGISTIC REGRESSION

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val (model, x, y)

LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression()Average Cross Val Score is 0.8047829647829647


## SVC

In [24]:
from sklearn import svm
model = svm.SVC ()
model_val (model, x, y)

SVC() accuracy is 0.7927927927927928
SVC()Average Cross Val Score is 0.7938902538902539


## Descision Tree Classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier ()
model_val (model, x, y)

DecisionTreeClassifier() accuracy is 0.7027027027027027
DecisionTreeClassifier()Average Cross Val Score is 0.7053398853398853


## Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier ()
model_val (model, x, y)

RandomForestClassifier() accuracy is 0.8018018018018018
RandomForestClassifier()Average Cross Val Score is 0.7830630630630631


## Gradient Boosting Classifier

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier ()
model_val (model, x, y)

GradientBoostingClassifier() accuracy is 0.8018018018018018
GradientBoostingClassifier()Average Cross Val Score is 0.7685503685503685


In [28]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 70.53,
 RandomForestClassifier(): 78.31,
 GradientBoostingClassifier(): 76.86}

# Saving The Model

In [29]:
x_train, x_test, y_train, y_test = train_test_split (x,y,test_size =0.20, stratify = y, random_state = 42 )


In [30]:
model = LogisticRegression ()
model.fit (x_train, y_train)

In [31]:
import joblib
joblib.dump (model,"loan_status")

['loan_status']

In [32]:
model = joblib.load("loan_status")
model

In [33]:
df = pd.DataFrame ({"Gender" : 1, 
           "Married" : 1, 
           "Dependents" : 2,
           "Education": 0,
           "Self_Employed" : 0,
           "ApplicantIncome" : 2889,
           "CoapplicantIncome" : 0.0,
           "LoanAmount" : 45,
           "Loan_Amount_Term" : 180,
           "Credit_History" : 0,
           "Property_Area" : 1}, index = [0])

In [34]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,2889,0.0,45,180,0,1


In [35]:
result = model.predict (df)
print (result)
if result == 1:
    print("Loan Approved")
else:
    ("Loan Not Approved")

[1]
Loan Approved


## GUI (Graphical User Interface ) 

In [36]:
from tkinter import *
import joblib

In [37]:
def show_entry():
    
    p1 = float(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = float(e8.get())
    p9 = float(e9.get())
    p10 = float(e10.get())
    p11 = float(e11.get())
    
    
    model = joblib.load ("loan_status")
    df = pd.DataFrame({
    "Gender":p1,
    "Married":p2,
    "Dependents":p3,
    "Education":p4,
    "Self_Employed":p5,
    "ApplicantIncome":p6,
    "CoapplicantIncome":p7,
    "LoanAmount":p8,
    "Loan_Amount_Term":p9,
    "Credit_History":p10,
    "Property_Area":p11},
index = [0])
    result = model.predict(df)
    
    if result == 1:
        Label(master, text = "Loan Approved").grid(row=31)
    else:
        Label(master, text = "Loan Not Approved").grid(row=31)
        
master =Tk()
master.title("Loan Status Prediction Using Machine Learning")
label = Label(master, text = "Loan Status Prediction" , bg = "black",
        fg = "white").grid(row= 0, columnspan =2)

Label(master,text = "Gender [1:Male , 0:Female]").grid(row=1)
Label(master,text = "Married[1:Yes,0:No]").grid(row=2)
Label(master,text = "Dependents [1,2,3,4]").grid(row=3)
Label(master,text = "Education").grid(row=4)
Label(master,text = "Self_Employed").grid(row=5)
Label(master,text = "ApplicantIncome").grid(row=6)
Label(master,text = "CoapplicantIncome").grid(row=7)
Label(master,text = "LoanAmount").grid(row=8)
Label(master,text = "Loan_Amount_Term").grid(row=9)
Label(master,text = "Credit_History").grid(row=10)
Label(master,text = "Property_Area").grid(row=11)


e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11= Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)

Button(master,text="Predict",command = show_entry).grid()

mainloop()