# Loan Eligibility Prediction

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

### Importing Datasets

In [2]:
df = pd.read_csv('Loan_Eligibility.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Dropped the column 'Loan_ID' as the column doesnt have any impact on loan status

In [3]:
df.drop('Loan_ID', axis = 'columns', inplace=True)

In [4]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Check the number of NaN in the dataset and drop the rows containing NaN

In [5]:
df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
df.shape

(614, 12)

In [7]:
df1 = df.dropna()

In [8]:
df1.shape

(480, 12)

In [9]:
df1.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
df1.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


### Checking categorical values

In [11]:
df1['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [12]:
df1['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [13]:
df1['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [14]:
df1['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [15]:
df1['Credit_History'].unique()

array([1., 0.])

In [16]:
df1['Property_Area'].unique()

array(['Rural', 'Urban', 'Semiurban'], dtype=object)

In [17]:
df1['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [18]:
# Making a copy of df1 to enable replacement without hampering original df

df2 = df1

### Encoding Catagorical Data

In [19]:
df2['Gender'].replace({'Male' : 1, 'Female' : 0}, inplace=True)   #Replace Male with 1 and Female with 0

df2['Married'].replace({'Yes' : 1, 'No' : 0}, inplace=True)   #If Married is Yes replace with 1 else 0 

df2['Education'].replace({'Graduate' : 1, 'Not Graduate' : 0}, inplace=True)  #If Education is Graduate replace with 1 else 0

df2['Self_Employed'].replace({'Yes' : 1, 'No' : 0}, inplace=True)   #If Self_Employed is Yes replace with 1 else 0

df2['Property_Area'].replace({'Rural' : 0, 'Semiurban' : 1, 'Urban' : 2}, inplace=True) #If Property_Area is rural replace with 0, else if semiurban replace with 1, else 2

df2['Loan_Status'].replace({'Y' : 1, 'N' : 0}, inplace=True) #If Loan_Status is Y replace with 1 else 0

df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [20]:
df2['Dependents'].replace({'3+' : 3}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [21]:
df2['Dependents'].unique()

array(['1', '0', '2', 3], dtype=object)

In [22]:
df2['Gender'].astype(float)

df2['Married'].astype(float)

df2['Dependents'].astype(float)

df2['Education'].astype(float)

df2['Self_Employed'].astype(float)

df2['ApplicantIncome'].astype(float)

df2['Property_Area'].astype(float)

df2['Loan_Status'].astype(float)

1      0.0
2      1.0
3      1.0
4      1.0
5      1.0
      ... 
609    1.0
610    1.0
611    1.0
612    1.0
613    0.0
Name: Loan_Status, Length: 480, dtype: float64

### Slicing Dependent and Independent Variables

In [23]:
x = df2.drop('Loan_Status', axis = 1)

In [24]:
y = df2['Loan_Status']

### Spliting Data for Test and Train Set

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20)

### Creating Logistic Regression Model

In [28]:
from sklearn.linear_model import LogisticRegression
m = LogisticRegression()
m.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Testing Train set data on Model

In [29]:
m.score(x_test, y_test)

0.78125