# Importing Libraries for Cleaning the Data

In [2]:
# pandas is used to create the dataframe
import pandas as pd

# Data Collection and Processing

In [4]:
# load the loan-approval records from the CSV file (path is relative to the
# notebook's working directory) into a pandas DataFrame
data = pd.read_csv('loan_approval_dataset.csv')

In [5]:
# preview the first rows of the dataframe; head() shows five rows when no count is given
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


### Cleaning data in entire dataframe

In [7]:
# summarise the dataframe: number of rows and columns,
# non-null count per column, and each column's dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [8]:
# dropping the unnecessary column from the dataframe
# (loan_id looks like a plain row identifier in the preview above)
# reassign instead of using inplace=True so the step is an explicit transformation
data = data.drop(columns=['loan_id'])

In [9]:
# confirm loan_id is gone by viewing the first 20 rows
data.head(20)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,0,Graduate,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,2,Graduate,Yes,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,5,Not Graduate,No,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


In [10]:
# trim leading and trailing whitespace from every column label
data.columns = [label.strip() for label in data.columns]
data.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [11]:
# residential, commercial, luxury and bank assets all describe what a person owns,
# so they do not need four separate columns;
# add them up into one 'assets' column holding the person's total asset worth
data['assets'] = (
    data['residential_assets_value']
    + data['commercial_assets_value']
    + data['luxury_assets_value']
    + data['bank_asset_value']
)
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,assets
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,17000000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected,57700000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected,55000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected,7400000
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved,20000000
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected,39000000
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved,28800000


In [12]:
# remove the four component columns now that 'assets' holds their total
# reassign instead of using inplace=True so the step is an explicit transformation
asset_columns = ['residential_assets_value', 'commercial_assets_value',
                 'luxury_assets_value', 'bank_asset_value']
data = data.drop(columns=asset_columns)
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,assets
0,2,Graduate,No,9600000,29900000,12,778,Approved,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,Graduate,No,9100000,29700000,20,506,Rejected,57700000
3,3,Graduate,No,8200000,30700000,8,467,Rejected,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,Rejected,7400000
4265,0,Not Graduate,Yes,3300000,11300000,20,559,Approved,20000000
4266,2,Not Graduate,No,6500000,23900000,18,457,Rejected,39000000
4267,1,Not Graduate,No,4100000,12800000,8,780,Approved,28800000


In [13]:
# count the missing values in each column
data.isna().sum()

no_of_dependents    0
education           0
self_employed       0
income_annum        0
loan_amount         0
loan_term           0
cibil_score         0
loan_status         0
assets              0
dtype: int64

### Cleaning 'education' data

In [15]:
# list the distinct values found in the 'education' column
data['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [16]:
# remove the extra spaces in a column
def clean_data(cl):
    """Strip leading and trailing whitespace from a single cell value.

    Parameters
    ----------
    cl : str or any
        One value taken from a column. Strings are stripped; any other
        value (for example a missing value) is returned unchanged instead
        of raising AttributeError.

    Returns
    -------
    The stripped string, or ``cl`` itself when it is not a string.
    """
    # non-string cells have no .strip(); pass them through untouched
    if not isinstance(cl, str):
        return cl
    return cl.strip()

In [17]:
# quick check of the function: the leading space should be removed
clean_data(' Graduate')

'Graduate'

In [18]:
# run clean_data over every value of the education column,
# then list the distinct values to confirm the spaces are gone
data['education'] = data['education'].apply(clean_data)
data['education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [19]:
# convert the strings in education column to numerical values
education_mapping = {'Graduate': 1, 'Not Graduate': 0}
education_encoded = data['education'].map(education_mapping)
# map() turns any value missing from the mapping into NaN without raising, which
# also happens if this cell is run a second time on already-encoded values;
# validate before overwriting the column so the problem is caught immediately
assert education_encoded.notna().all(), "unmapped value found in 'education'"
data['education'] = education_encoded
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,assets
0,2,1,No,9600000,29900000,12,778,Approved,50700000
1,0,0,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,1,No,9100000,29700000,20,506,Rejected,57700000
3,3,1,No,8200000,30700000,8,467,Rejected,52700000
4,5,0,Yes,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,Yes,1000000,2300000,12,317,Rejected,7400000
4265,0,0,Yes,3300000,11300000,20,559,Approved,20000000
4266,2,0,No,6500000,23900000,18,457,Rejected,39000000
4267,1,0,No,4100000,12800000,8,780,Approved,28800000


### Cleaning 'self_employed' data

In [21]:
# list the distinct values found in the 'self_employed' column
data['self_employed'].unique()

array([' No', ' Yes'], dtype=object)

In [22]:
# run clean_data over every value of the self_employed column,
# then list the distinct values to confirm the spaces are gone
data['self_employed'] = data['self_employed'].apply(clean_data)
data['self_employed'].unique()

array(['No', 'Yes'], dtype=object)

In [23]:
# convert the strings in self_employed column to numerical values
self_employed_mapping = {'Yes': 1, 'No': 0}
self_employed_encoded = data['self_employed'].map(self_employed_mapping)
# map() turns any value missing from the mapping into NaN without raising, which
# also happens if this cell is run a second time on already-encoded values;
# validate before overwriting the column so the problem is caught immediately
assert self_employed_encoded.notna().all(), "unmapped value found in 'self_employed'"
data['self_employed'] = self_employed_encoded
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,assets
0,2,1,0,9600000,29900000,12,778,Approved,50700000
1,0,0,1,4100000,12200000,8,417,Rejected,17000000
2,3,1,0,9100000,29700000,20,506,Rejected,57700000
3,3,1,0,8200000,30700000,8,467,Rejected,52700000
4,5,0,1,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,Rejected,7400000
4265,0,0,1,3300000,11300000,20,559,Approved,20000000
4266,2,0,0,6500000,23900000,18,457,Rejected,39000000
4267,1,0,0,4100000,12800000,8,780,Approved,28800000


### Cleaning 'loan_status' data

In [25]:
# list the distinct values found in the 'loan_status' column
data['loan_status'].unique()

array([' Approved', ' Rejected'], dtype=object)

In [26]:
# run clean_data over every value of the loan_status column,
# then list the distinct values to confirm the spaces are gone
data['loan_status'] = data['loan_status'].apply(clean_data)
data['loan_status'].unique()

array(['Approved', 'Rejected'], dtype=object)

In [27]:
# convert the strings in loan_status column to numerical values
loan_status_mapping = {'Approved': 1, 'Rejected': 0}
loan_status_encoded = data['loan_status'].map(loan_status_mapping)
# map() turns any value missing from the mapping into NaN without raising, which
# also happens if this cell is run a second time on already-encoded values;
# validate before overwriting the column so the problem is caught immediately
assert loan_status_encoded.notna().all(), "unmapped value found in 'loan_status'"
data['loan_status'] = loan_status_encoded
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,assets
0,2,1,0,9600000,29900000,12,778,1,50700000
1,0,0,1,4100000,12200000,8,417,0,17000000
2,3,1,0,9100000,29700000,20,506,0,57700000
3,3,1,0,8200000,30700000,8,467,0,52700000
4,5,0,1,9800000,24200000,20,382,0,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,0,7400000
4265,0,0,1,3300000,11300000,20,559,1,20000000
4266,2,0,0,6500000,23900000,18,457,0,39000000
4267,1,0,0,4100000,12800000,8,780,1,28800000


# Creation of Model

In [29]:
# 'train_test_split' splits the dataset into training and testing sets
from sklearn.model_selection import train_test_split

In [30]:
# separate the target column (loan_status) from the feature columns:
# output_data is what the model should predict, input_data is what it is given
output_data = data['loan_status']
input_data = data.drop(columns=['loan_status'])

In [31]:
# display the feature columns (everything except loan_status)
input_data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,assets
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [32]:
# display the target column (loan_status)
output_data

0       1
1       0
2       0
3       0
4       0
       ..
4264    0
4265    1
4266    0
4267    1
4268    1
Name: loan_status, Length: 4269, dtype: int64

In [33]:
# split the input and output data into training and testing data this time
# test_size=0.2 holds out 20% of the rows for testing
# random_state pins the shuffle so the same split (and therefore the same
# score) is reproduced every time the notebook is run from a fresh kernel
x_train, x_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.2, random_state=42
)

In [34]:
# report the (rows, columns) shape of each piece of the split
# y_train and y_test are one-dimensional vectors, so their shapes have a single entry
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3415, 8), (854, 8), (3415,), (854,))

In [35]:
# 'StandardScaler' standardizes features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
# create a scaler object with default settings
scaler = StandardScaler()

In [36]:
# learn the scaling parameters from the training rows only,
# then apply that same fitted scaler to both the training and the test rows
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [37]:
# 'LogisticRegression' implements a logistic regression model
from sklearn.linear_model import LogisticRegression

In [38]:
# create an object 'model' that represents a logistic regression model
# no arguments are passed, so the class's default settings are used
# use help(LogisticRegression) to read the class description and its parameters
model = LogisticRegression()

In [39]:
# train the model on the scaled training features and their loan_status labels
model.fit(x_train_scaled, y_train)

In [40]:
# check the model's accuracy score on the scaled test data it was not trained on
model.score(x_test_scaled, y_test)

0.9355971896955504

# Testing the Model

In [42]:
# entering input data: one applicant, with columns in the same order as the training features
# values are given as numbers (not strings) so their dtypes match the data the scaler was fitted on
feature_columns = ['no_of_dependents', 'education', 'self_employed', 'income_annum',
                   'loan_amount', 'loan_term', 'cibil_score', 'assets']
pred_data = pd.DataFrame([[2, 1, 0, 9600000, 29900000, 12, 778, 50700000]],
                         columns=feature_columns)

In [43]:
# scale this data with the scaler that was fitted on the training set
# NOTE(review): pred_data is overwritten with its scaled version, so re-running
# this cell on its own would scale the already-scaled values a second time
pred_data = scaler.transform(pred_data)

In [44]:
# predict loan approval for the scaled input row
# per loan_status_mapping: 1 = Approved, 0 = Rejected
model.predict(pred_data)

array([1], dtype=int64)

# Saving the Trained Model

In [46]:
# used for serializing and deserializing Python objects
# serialization converts a Python object into a byte stream
# deserialization converts a byte stream back into a Python object
import pickle as pk

In [47]:
# pickle.dump() function takes the model object
# and converts it into a binary format suitable for storage
# the 'with' block closes the file handle even if dump() raises,
# so the written bytes are flushed and the handle is not leaked
with open('Loan_Approval_Prediction_Model.pkl', 'wb') as model_file:
    pk.dump(model, model_file)

In [48]:
# serialize the fitted 'scaler' object, converting it into a byte stream on disk
# the 'with' block closes the file handle even if dump() raises,
# so the written bytes are flushed and the handle is not leaked
with open('loan_scalar.pkl', 'wb') as scaler_file:
    pk.dump(scaler, scaler_file)