# Setting all Dependencies and Path

In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime (Colab-only; prompts for authorisation when run).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/DSCNITP/Loan Prediction

/content/drive/MyDrive/DSCNITP/Loan Prediction


# Importing all required Libraries

In [69]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

# Loading Dataset

In [29]:
# Read the loan dataset into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
data=pd.read_csv('loan_data.csv')

In [30]:
# Preview the first five rows to check the column names and value formats.
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Preparation

## Dropping Unwanted Columns

In [31]:
# Loan_ID is an identifier string (e.g. LP001002), not a model input, so drop it.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
data.drop(columns=['Loan_ID'], inplace=True, errors='ignore')

## Dropping Rows with Null Values

In [32]:
# Per-column flag: True where the column contains at least one missing value.
data.isnull().any()

Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
dtype: bool

In [33]:
# (rows, columns) before rows with missing values are removed.
data.shape

(614, 12)

In [34]:
# Remove every row that has a missing value in any column (axis=0, how='any'
# are the pandas defaults, spelled out here for clarity). The frame is
# modified in place, so the original row index labels are kept, with gaps.
data.dropna(axis=0, how='any', inplace=True)

In [35]:
# (rows, columns) after rows with missing values were removed.
data.shape

(480, 12)

## Encoding the Feature Columns that have string categories

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
# Fit an encoder on Gender, then replace the strings with their integer codes.
# A code is the position of the category in le.classes_ (the cell's output).
le = LabelEncoder()
le.fit(data['Gender'])
data['Gender'] = le.transform(data['Gender'])
le.classes_

array(['Female', 'Male'], dtype=object)

In [39]:
# Fit an encoder on Married, then replace the strings with their integer codes.
# A code is the position of the category in le.classes_ (the cell's output).
le = LabelEncoder()
le.fit(data['Married'])
data['Married'] = le.transform(data['Married'])
le.classes_

array(['No', 'Yes'], dtype=object)

In [40]:
# Fit an encoder on Education, then replace the strings with their integer codes.
# A code is the position of the category in le.classes_ (the cell's output).
le = LabelEncoder()
le.fit(data['Education'])
data['Education'] = le.transform(data['Education'])
le.classes_

array(['Graduate', 'Not Graduate'], dtype=object)

In [41]:
# Fit an encoder on Self_Employed, then replace the strings with their integer codes.
# A code is the position of the category in le.classes_ (the cell's output).
le = LabelEncoder()
le.fit(data['Self_Employed'])
data['Self_Employed'] = le.transform(data['Self_Employed'])
le.classes_

array(['No', 'Yes'], dtype=object)

In [42]:
# Fit an encoder on Property_Area, then replace the strings with their integer codes.
# A code is the position of the category in le.classes_ (the cell's output).
# NOTE(review): this gives the three areas an implicit numeric order that a
# linear model will treat as meaningful - confirm that is intended.
le = LabelEncoder()
le.fit(data['Property_Area'])
data['Property_Area'] = le.transform(data['Property_Area'])
le.classes_

array(['Rural', 'Semiurban', 'Urban'], dtype=object)

In [55]:
# Dependents is stored as strings (it includes the category '3+'), so it is
# encoded like the other categorical columns. A code is the position of the
# category in le.classes_ (the cell's output).
le = LabelEncoder()
le.fit(data['Dependents'])
data['Dependents'] = le.transform(data['Dependents'])
le.classes_

array(['0', '1', '2', '3+'], dtype=object)

In [56]:
# Display the frame after encoding: the encoded columns now hold integer codes,
# while Loan_Status still holds the original strings.
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
610,1,1,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y
611,1,1,1,0,0,8072,240.0,253.0,360.0,1.0,2,Y
612,1,1,2,0,0,7583,0.0,187.0,360.0,1.0,2,Y


## Dividing the Data into Features and Labels

In [57]:
# Feature matrix: every column except the target.
# The target is excluded by name rather than by position, so the selection
# stays correct even if the column order changes or columns are added later.
features = data.drop(columns=['Loan_Status'])

In [58]:
# Preview the feature matrix; Loan_Status should not be among its columns.
features.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2


In [59]:
# Target vector: the loan decision ('Y' / 'N').
# Selected by column name rather than "last column", so it does not silently
# pick up a different column if the frame's column order ever changes.
labels = data['Loan_Status']

In [60]:
# Inspect the target Series (Loan_Status values 'Y' / 'N').
labels

1      N
2      Y
3      Y
4      Y
5      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 480, dtype: object

## Splitting the features and labels to train and test sets

In [61]:
from sklearn.model_selection import train_test_split


In [62]:
# Hold out part of the rows for evaluation; the model never sees them in training.
feature_train, feature_test, label_train, label_test = train_test_split(
    features,
    labels,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=7,   # fixed seed so the same split is produced on every run
)

# Model

In [63]:
# Logistic regression classifier. The arguments below are the library defaults
# (they match the repr printed when the model is fitted), spelled out so the
# configuration is visible at a glance.
# NOTE(review): the features are not scaled before fitting - confirm that
# lbfgs converges within max_iter on this data.
model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=100,
)

In [64]:
# Train on the training split only; fit() returns the estimator itself, which
# is why its repr is displayed as the cell output.
model.fit(X=feature_train, y=label_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Prediction with Test Data

In [66]:
# Predict a class ('Y' / 'N') for every row of the held-out test features.
label_pred = model.predict(X=feature_test)

In [80]:
# Show the predicted class for every test row.
label_pred

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y'], dtype=object)

# Metrics

## Accuracy

In [67]:
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_true=label_test, y_pred=label_pred)

0.78125

## Confusion Matrix

In [68]:
# Rows are the true classes and columns the predicted classes, both in sorted
# label order ('N' first, then 'Y').
confusion_matrix(y_true=label_test, y_pred=label_pred)

array([[15, 20],
       [ 1, 60]])

## Classification Report

In [71]:
# Per-class precision / recall / F1 and support. classification_report returns
# a pre-formatted string, so it is printed to keep the column alignment.
print(classification_report(y_true=label_test, y_pred=label_pred))

              precision    recall  f1-score   support

           N       0.94      0.43      0.59        35
           Y       0.75      0.98      0.85        61

    accuracy                           0.78        96
   macro avg       0.84      0.71      0.72        96
weighted avg       0.82      0.78      0.76        96



# Saving the model

In [74]:
import pickle

In [76]:
# Serialise the fitted model to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the original left the handle open. Plain "wb" is enough
# because the file is only written, never read back here.
with open("model_logisticR.pkl", "wb") as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# Loading the model

In [78]:
# Restore the model from disk.
# Opened read-only ("rb") inside a context manager so the handle is always
# closed and no write permission on the file is required.
# SECURITY: unpickling executes arbitrary code - only load files you created
# yourself or otherwise trust.
with open('model_logisticR.pkl', 'rb') as model_file:
    model_pickle = pickle.load(model_file)

In [79]:
# Display the reloaded estimator to confirm it was restored from the file.
model_pickle

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)