### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('loan_approval_dataset.csv')
df

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


### Data Preprocessing

In [3]:
df.shape

(4269, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [8]:
df.isna().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [10]:
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loan_id,4269.0,2135.0,1232.498,1.0,1068.0,2135.0,3202.0,4269.0
no_of_dependents,4269.0,2.498712,1.69591,0.0,1.0,3.0,4.0,5.0
income_annum,4269.0,5059124.0,2806840.0,200000.0,2700000.0,5100000.0,7500000.0,9900000.0
loan_amount,4269.0,15133450.0,9043363.0,300000.0,7700000.0,14500000.0,21500000.0,39500000.0
loan_term,4269.0,10.90045,5.709187,2.0,6.0,10.0,16.0,20.0
cibil_score,4269.0,599.9361,172.4304,300.0,453.0,600.0,748.0,900.0
residential_assets_value,4269.0,7472617.0,6503637.0,-100000.0,2200000.0,5600000.0,11300000.0,29100000.0
commercial_assets_value,4269.0,4973155.0,4388966.0,0.0,1300000.0,3700000.0,7600000.0,19400000.0
luxury_assets_value,4269.0,15126310.0,9103754.0,300000.0,7500000.0,14600000.0,21700000.0,39200000.0
bank_asset_value,4269.0,4976692.0,3250185.0,0.0,2300000.0,4600000.0,7100000.0,14700000.0


As the residential_assets_value column contains negative value so we have to replace it with mean value.

In [11]:
mean = df[' residential_assets_value'].mean()
round(mean,2)

7472616.54

In [12]:
df_n = df.mask(df[' residential_assets_value'] < 0,mean)
df_n[:2]

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1.0,2.0,Graduate,No,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0,Approved
1,2.0,0.0,Not Graduate,Yes,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0,Rejected


Now, education & self_employmed column is categorical, so we have to convert it into numeric one by using any encoding technique.

In [13]:
df_n[' education'] = df_n[' education'].map({' Graduate': 1, ' Not Graduate':0})

In [14]:
df_n[' self_employed'] = df_n[' self_employed'].map({' Yes': 1, ' No': 0})

In [15]:
df_n

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1.0,2.0,1.0,0.0,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0,Approved
1,2.0,0.0,0.0,1.0,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0,Rejected
2,3.0,3.0,1.0,0.0,9100000.0,29700000.0,20.0,506.0,7100000.0,4500000.0,33300000.0,12800000.0,Rejected
3,4.0,3.0,1.0,0.0,8200000.0,30700000.0,8.0,467.0,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5.0,5.0,0.0,1.0,9800000.0,24200000.0,20.0,382.0,12400000.0,8200000.0,29400000.0,5000000.0,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265.0,5.0,1.0,1.0,1000000.0,2300000.0,12.0,317.0,2800000.0,500000.0,3300000.0,800000.0,Rejected
4265,4266.0,0.0,0.0,1.0,3300000.0,11300000.0,20.0,559.0,4200000.0,2900000.0,11000000.0,1900000.0,Approved
4266,4267.0,2.0,0.0,0.0,6500000.0,23900000.0,18.0,457.0,1200000.0,12400000.0,18100000.0,7300000.0,Rejected
4267,4268.0,1.0,0.0,0.0,4100000.0,12800000.0,8.0,780.0,8200000.0,700000.0,14100000.0,5800000.0,Approved


In [16]:
df_n.isna().sum()

loan_id                       0
 no_of_dependents             0
 education                   28
 self_employed               28
 income_annum                 0
 loan_amount                  0
 loan_term                    0
 cibil_score                  0
 residential_assets_value     0
 commercial_assets_value      0
 luxury_assets_value          0
 bank_asset_value             0
 loan_status                  0
dtype: int64

In [17]:
df_n.dropna(inplace=True)

In [18]:
df_n.isna().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

### Model Preperation

In [22]:
X = df_n.iloc[:,:12]
X[:2]

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,1.0,2.0,1.0,0.0,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0
1,2.0,0.0,0.0,1.0,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0


In [23]:
y = df_n[' loan_status']
y[:2]

0     Approved
1     Rejected
Name:  loan_status, dtype: object

In [24]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB

### MultinomialNB

In [25]:
model = MultinomialNB()
model

MultinomialNB()

In [26]:
model.fit(X,y)

MultinomialNB()

In [27]:
training = df_n[:3]
inp = training.iloc[:,:12]
inp

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,1.0,2.0,1.0,0.0,9600000.0,29900000.0,12.0,778.0,2400000.0,17600000.0,22700000.0,8000000.0
1,2.0,0.0,0.0,1.0,4100000.0,12200000.0,8.0,417.0,2700000.0,2200000.0,8800000.0,3300000.0
2,3.0,3.0,1.0,0.0,9100000.0,29700000.0,20.0,506.0,7100000.0,4500000.0,33300000.0,12800000.0


In [28]:
model.predict(inp)

array([' Approved', ' Approved', ' Approved'], dtype='<U9')

### check goodness of data

In [29]:
model.score(X,y)*100

52.53477953312898

### BernoulliNB

In [30]:
model_1 = BernoulliNB()
model_1

BernoulliNB()

In [31]:
model_1.fit(X,y)

BernoulliNB()

In [32]:
model_1.predict(inp)

array([' Approved', ' Approved', ' Approved'], dtype='<U9')

In [33]:
model_1.score(X,y)*100

62.24946946474888

### GaussianNB

In [34]:
model_2 = GaussianNB()
model_2

GaussianNB()

In [35]:
model_2.fit(X,y)

GaussianNB()

In [36]:
model_2.predict(inp)

array([' Approved', ' Rejected', ' Approved'], dtype='<U9')

In [37]:
model_2.score(X,y)*100

76.32633812780004

### Conclusion: Out of three models of Naive Bayes algorithm, for GaussianNB model data accuracy is good i.e 76.32%. 