# Setting all Dependencies and Path

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project folder under MyDrive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/DSCNITP/Loan Prediction

/content/drive/MyDrive/DSCNITP/Loan Prediction


# Importing all required Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Loading Dataset

In [4]:
# Load the loan applications from the CSV in the current working directory.
data = pd.read_csv(filepath_or_buffer='loan_data.csv')

In [5]:
# Preview the first five rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Preparation

## Dropping Unwanted Columns

In [6]:
# Loan_ID looks like a per-row identifier rather than a predictive feature, so
# it is removed. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
data = data.drop(columns=['Loan_ID'], errors='ignore')

## Dropping Null Value Rows

In [7]:
# Flag every column that contains at least one missing value.
data.isna().any(axis=0)

Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
dtype: bool

In [8]:
# (rows, columns) before the rows with missing values are removed.
data.shape

(614, 12)

In [9]:
# Remove, in place, every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)

In [10]:
# (rows, columns) after the drop, for comparison with the count taken before it.
data.shape

(480, 12)

## Encoding the String-Valued Categorical Columns

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Replace the Gender strings with integer codes; each code is the value's
# position in le.classes_, which is displayed as the cell output.
le = LabelEncoder().fit(data['Gender'])
data['Gender'] = le.transform(data['Gender'])
le.classes_

array(['Female', 'Male'], dtype=object)

In [13]:
# Replace the Married strings with integer codes; each code is the value's
# position in le.classes_, which is displayed as the cell output.
le = LabelEncoder().fit(data['Married'])
data['Married'] = le.transform(data['Married'])
le.classes_

array(['No', 'Yes'], dtype=object)

In [14]:
# Replace the Education strings with integer codes; each code is the value's
# position in le.classes_, which is displayed as the cell output.
le = LabelEncoder().fit(data['Education'])
data['Education'] = le.transform(data['Education'])
le.classes_

array(['Graduate', 'Not Graduate'], dtype=object)

In [15]:
# Replace the Self_Employed strings with integer codes; each code is the
# value's position in le.classes_, which is displayed as the cell output.
le = LabelEncoder().fit(data['Self_Employed'])
data['Self_Employed'] = le.transform(data['Self_Employed'])
le.classes_

array(['No', 'Yes'], dtype=object)

In [16]:
# Replace the Property_Area strings with integer codes; each code is the
# value's position in le.classes_, which is displayed as the cell output.
le = LabelEncoder().fit(data['Property_Area'])
data['Property_Area'] = le.transform(data['Property_Area'])
le.classes_

array(['Rural', 'Semiurban', 'Urban'], dtype=object)

In [17]:
# Replace the Dependents strings (including the open-ended '3+') with integer
# codes; each code is the value's position in le.classes_, displayed below.
le = LabelEncoder().fit(data['Dependents'])
data['Dependents'] = le.transform(data['Dependents'])
le.classes_

array(['0', '1', '2', '3+'], dtype=object)

In [18]:
# Inspect the frame after encoding: the encoded feature columns now hold
# integer codes, while Loan_Status is still the original 'Y'/'N' string.
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
610,1,1,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y
611,1,1,1,0,0,8072,240.0,253.0,360.0,1.0,2,Y
612,1,1,2,0,0,7583,0.0,187.0,360.0,1.0,2,Y


## Dividing the Data into features and Labels

In [19]:
# Feature matrix: every column except the target. Selecting by name instead of
# by position keeps this correct even if the column order of the CSV changes.
features = data.drop(columns=['Loan_Status'])

In [20]:
# Preview the feature matrix; Loan_Status must not appear among the columns.
features.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2


In [21]:
# Target vector: the loan decision ('Y'/'N'), selected by name rather than by
# position so it does not depend on Loan_Status being the last column.
labels = data['Loan_Status']

In [22]:
# Display the target Series to confirm its name, length and dtype.
labels

1      N
2      Y
3      Y
4      Y
5      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 480, dtype: object

## Splitting the features and labels to train and test sets

In [23]:
from sklearn.model_selection import train_test_split


In [24]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
feature_train, feature_test, label_train, label_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=7,
)

# Model

In [25]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Base estimator with default hyperparameters; it is handed to the grid search
# further down, which tunes it.
# NOTE(review): no feature scaling is visible before this distance-based model,
# so large-valued columns such as ApplicantIncome probably dominate the
# distances — consider standardising the features and confirm the effect.
KNN = KNeighborsClassifier()

In [28]:
# Display the estimator's repr to see its default hyperparameters.
KNN

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [29]:
# Hyperparameter grid explored by the grid search.
# NOTE(review): 'minkowski' with the estimator's default p=2 should be the same
# distance as 'euclidean', and 'algorithm' only selects the neighbour-search
# strategy, so part of this grid is likely redundant — confirm before pruning.
params = dict(
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    n_neighbors=np.arange(5, 20),  # candidate k values 5..19
    metric=['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
)


For more distance metrics, see the scikit-learn `DistanceMetric` documentation:

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html#sklearn.neighbors.DistanceMetric

In [30]:
# Exhaustive search over `params`, using GridSearchCV's default
# cross-validation splitting and the estimator's default scoring.
cv = GridSearchCV(estimator=KNN, param_grid=params)

In [31]:
# Cross-validate every grid combination on the training split only; the test
# split is not touched here.
cv.fit(X=feature_train, y=label_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'metric': ['euclidean', 'manhattan', 'chebyshev',
                                    'minkowski'],
                         'n_neighbors': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [32]:
# Keep the best configuration found by the search; with refit=True (shown in
# the GridSearchCV repr) it has already been refit on the whole training split.
model=cv.best_estimator_

In [33]:
# Mean cross-validated score of the best parameter combination.
cv.best_score_

0.7083732057416269

In [34]:
# Hyperparameter values of the best-scoring combination.
cv.best_params_

{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 9}

# Prediction with Test Data

In [35]:
# Predict the loan status for the held-out test features.
label_pred = model.predict(feature_test)

In [36]:
# Display the predicted classes for the test rows.
label_pred

array(['N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N'], dtype=object)

# Metrics

## Accuracy

In [37]:
# Fraction of test rows whose predicted status matches the true status.
accuracy_score(y_true=label_test, y_pred=label_pred)

0.6354166666666666

## Confusion_Matrix

In [38]:
# Rows are the true classes and columns the predicted classes, both in sorted
# label order ('N' first, then 'Y').
confusion_matrix(y_true=label_test, y_pred=label_pred)

array([[ 7, 28],
       [ 7, 54]])

## Classification Report

In [39]:
# Per-class precision, recall, F1 and support; printed because the report is a
# pre-formatted multi-line string.
print(classification_report(y_true=label_test, y_pred=label_pred))

              precision    recall  f1-score   support

           N       0.50      0.20      0.29        35
           Y       0.66      0.89      0.76        61

    accuracy                           0.64        96
   macro avg       0.58      0.54      0.52        96
weighted avg       0.60      0.64      0.58        96



# Saving the model

In [40]:
import pickle

In [41]:
# Persist the tuned classifier. The context manager flushes and closes the file
# deterministically (the previous form never closed the handle), and plain 'wb'
# is sufficient because the file is only written.
# NOTE(review): the LabelEncoder instances used during data preparation are not
# saved, so anything loading this model must reproduce the same encoding.
with open("model_KNN.pkl", "wb") as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# Loading the model

In [42]:
# Reload the classifier to confirm the save/load round trip. Read-only 'rb'
# avoids requiring write permission on the file, and the context manager closes
# the handle once loading finishes.
# Only unpickle files from a trusted source: pickle.load can run arbitrary code.
with open('model_KNN.pkl', 'rb') as model_file:
    model_pickle = pickle.load(model_file)

In [43]:
# Display the reloaded estimator; its hyperparameters should match the best
# parameters reported by the grid search.
model_pickle

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')