In [1]:
import numpy as np
import pandas as pd
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn.svm import SVR
import pickle
import os
import joblib
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import power_transform
from pandas.api.types import CategoricalDtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import r2_score
%matplotlib inline

In [2]:
#Importing dataset
df=pd.read_csv('https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Loan%20Application%20Status/loan_prediction.csv')

In [3]:
df.head()

Unnamed: 0,LP001002,Male,No,0,Graduate,No.1,5849,0.1,Unnamed: 8,360,1,Urban,Y
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [4]:
original_columns = df.columns.tolist()
original_columns

['LP001002',
 'Male',
 'No',
 '0',
 'Graduate',
 'No.1',
 '5849',
 '0.1',
 'Unnamed: 8',
 '360',
 '1',
 'Urban',
 'Y']

In [5]:
original_columns[5]='No'
original_columns[8]='8'
original_columns

['LP001002',
 'Male',
 'No',
 '0',
 'Graduate',
 'No',
 '5849',
 '0.1',
 '8',
 '360',
 '1',
 'Urban',
 'Y']

In [6]:
new_columns = ['loan_id', 'gender', 'married','dependants','education','self_employed','applicant_income','coapplicant_income','loan_amount','loan_amount_term','credit_history','property_area','loan_status']
df.columns = new_columns

In [7]:
df.loc[-1]=original_columns
df.index+=1

In [8]:
df.sort_index(inplace=True)

In [9]:
df

Unnamed: 0,loan_id,gender,married,dependants,education,self_employed,applicant_income,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.1,8,360,1,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [10]:
df.shape

(614, 13)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   loan_id             614 non-null    object
 1   gender              601 non-null    object
 2   married             611 non-null    object
 3   dependants          599 non-null    object
 4   education           614 non-null    object
 5   self_employed       582 non-null    object
 6   applicant_income    614 non-null    object
 7   coapplicant_income  614 non-null    object
 8   loan_amount         593 non-null    object
 9   loan_amount_term    600 non-null    object
 10  credit_history      564 non-null    object
 11  property_area       614 non-null    object
 12  loan_status         614 non-null    object
dtypes: object(13)
memory usage: 67.2+ KB


In [12]:
df.isnull().sum()

loan_id                0
gender                13
married                3
dependants            15
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loan_amount           21
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [13]:
df.describe() 

Unnamed: 0,loan_id,gender,married,dependants,education,self_employed,applicant_income,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area,loan_status
count,614,601,611,599,614,582,614,614.0,593.0,600.0,564.0,614,614
unique,614,2,2,4,2,2,505,288.0,204.0,11.0,3.0,3,2
top,LP001002,Male,Yes,0,Graduate,No,2500,0.0,120.0,360.0,1.0,Semiurban,Y
freq,1,489,398,345,480,500,9,272.0,20.0,511.0,474.0,233,422


In [15]:
df['applicant_income']

0      5849
1      4583
2      3000
3      2583
4      6000
       ... 
609    2900
610    4106
611    8072
612    7583
613    4583
Name: applicant_income, Length: 614, dtype: object

In [16]:
df[['applicant_income', 'loan_amount']]

Unnamed: 0,applicant_income,loan_amount
0,5849,8
1,4583,128.0
2,3000,66.0
3,2583,120.0
4,6000,141.0
...,...,...
609,2900,71.0
610,4106,40.0
611,8072,253.0
612,7583,187.0


In [17]:
df.columns

Index(['loan_id', 'gender', 'married', 'dependants', 'education',
       'self_employed', 'applicant_income', 'coapplicant_income',
       'loan_amount', 'loan_amount_term', 'credit_history', 'property_area',
       'loan_status'],
      dtype='object')

In [18]:
#Data Processing

In [19]:
df.isnull().sum()

loan_id                0
gender                13
married                3
dependants            15
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loan_amount           21
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   loan_id             614 non-null    object
 1   gender              601 non-null    object
 2   married             611 non-null    object
 3   dependants          599 non-null    object
 4   education           614 non-null    object
 5   self_employed       582 non-null    object
 6   applicant_income    614 non-null    object
 7   coapplicant_income  614 non-null    object
 8   loan_amount         593 non-null    object
 9   loan_amount_term    600 non-null    object
 10  credit_history      564 non-null    object
 11  property_area       614 non-null    object
 12  loan_status         614 non-null    object
dtypes: object(13)
memory usage: 67.2+ KB


In [25]:
df.loan_amount=df.loan_amount.astype(float)
df.loan_amount_term=df.loan_amount_term.astype(float)
df.credit_history=df.credit_history.astype(float)

In [30]:
# handle numerical missing data
df['loan_amount'] = df['loan_amount'].fillna(df['loan_amount'].mean())
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mean())
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mean())

In [31]:
df.isnull().sum()

loan_id                0
gender                13
married                3
dependants            15
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loan_amount            0
loan_amount_term       0
credit_history         0
property_area          0
loan_status            0
dtype: int64

In [32]:
# handle categorical missing data
df['gender'].mode()[0]

'Male'

In [34]:
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['married'] = df['married'].fillna(df['married'].mode()[0])
df['dependants'] = df['dependants'].fillna(df['dependants'].mode()[0])
df['self_employed'] = df['self_employed'].fillna(df['self_employed'].mode()[0])

In [35]:
df.isnull().sum()

loan_id               0
gender                0
married               0
dependants            0
education             0
self_employed         0
applicant_income      0
coapplicant_income    0
loan_amount           0
loan_amount_term      0
credit_history        0
property_area         0
loan_status           0
dtype: int64

In [36]:
#Exploratory data analysis