
## Loan-Approval-Prediction-Dataset



### About Dataset


The loan approval dataset is a collection of financial records and associated information used to determine the eligibility of individuals or organizations for obtaining loans from a lending institution. It includes various factors such as cibil score, income, employment status, loan term, loan amount, assets value, and loan status. This dataset is commonly used in machine learning and data analysis to develop models and algorithms that predict the likelihood of loan approval based on the given features.

In [34]:
# import all libraries for the model creation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# convert data into different types for train and testing data 
from sklearn.model_selection import train_test_split

# convert data into StandardScaler data 
from sklearn.preprocessing import StandardScaler

# Now, we are going to use Logistic Regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# export model through pickle
import pickle
# check accuracy score
from sklearn.metrics import accuracy_score

In [35]:
# imoport dataset for data preprocessing

df = pd.read_csv(r"loan_approval_dataset.csv")

# check dataset values
df.head(10)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,6,0,Graduate,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,7,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,8,2,Graduate,Yes,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,9,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,10,5,Not Graduate,No,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


In [36]:
# describe dataset
print(df.info())
# or
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB
None
          

In [37]:
# delete loan_id column loan_id 
df = df.drop(['loan_id'],axis=1)

In [47]:
df.shape

(4269, 11)

In [38]:
print(df.columns)

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [39]:
# delete loan_id column eductaion
df = df.drop([' education'],axis=1)

In [40]:
df.head(10)

Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,0,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,5,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,2,Yes,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,0,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,5,No,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


In [41]:
# change the self_employed column values into 0 and 1
df[' self_employed'] = df[' self_employed'].map({' No': 0, ' Yes': 1})

In [42]:
df.head(10)

Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,0,1,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,5,0,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,2,1,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,0,1,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,5,0,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


In [43]:
# change the loan_status column values into 0 and 1
df[' loan_status'] = df[' loan_status'].map({' Rejected': 0, ' Approved': 1})

In [44]:
df.head(10)

Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0
5,0,1,4800000,13500000,10,319,6800000,8300000,13700000,5100000,0
6,5,0,8700000,33000000,4,678,22500000,14800000,29200000,4300000,1
7,2,1,5700000,15000000,20,382,13200000,5700000,11800000,6000000,0
8,0,1,800000,2200000,20,782,1300000,800000,2800000,600000,1
9,5,0,1100000,4300000,10,388,3200000,1400000,3300000,1600000,0


In [45]:
# change into X and y
X = df.drop([' loan_status'],axis=1)
y = df[' loan_status']

In [46]:
# appling StandardScaler
ss = StandardScaler()
X = ss.fit_transform(X)
X

array([[-0.2941017 , -1.00728821,  1.61797904, ...,  2.877289  ,
         0.83202837,  0.93030441],
       [-1.4735476 ,  0.99276452, -0.34174956, ..., -0.63192107,
        -0.69499321, -0.51593638],
       [ 0.29562125, -1.00728821,  1.4398219 , ..., -0.10781827,
         1.99651964,  2.40731629],
       ...,
       [-0.2941017 , -1.00728821,  0.51340474, ...,  1.69236092,
         0.3266831 ,  0.71490685],
       [-0.88382465, -1.00728821, -0.34174956, ..., -0.97372725,
        -0.11274758,  0.25334064],
       [-0.88382465, -1.00728821,  1.47545332, ...,  1.55563845,
         2.26017804,  2.16114764]])

In [48]:
# apply train test split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
X_train.shape

(2860, 10)

In [51]:
# Appling Logistic Regression
lg = LogisticRegression()
lg.fit(X_train,y_train)

In [53]:
# check prediction on data  
pred = lg.predict(X_test)

pred[0:5]

array([0, 1, 0, 1, 1])

In [54]:
y_test[0:5]

1703    0
1173    1
308     0
1322    1
3271    1
Name:  loan_status, dtype: int64

In [55]:
# check accuracy score on test data
print(accuracy_score(y_test,pred))

0.9034776437189496


In [59]:
X_test[3]

array([-0.2941017 , -1.00728821,  0.40651045,  0.91420835, -0.50809068,
        0.02937152,  0.38865633,  1.32776767,  0.73315647,  1.29955738])

In [71]:
# check prediction on single input data  
pred = lg.predict([X_test[2]])

pred

array([0])

In [66]:
y_test

1703    0
1173    1
308     0
1322    1
3271    1
       ..
1097    1
1860    0
48      1
2691    0
2392    1
Name:  loan_status, Length: 1409, dtype: int64

In [72]:
pickle.dump(lg,open('Model.pkl','wb'))