**Aim:** Write a program to exhibit the decision tree based KNN Algorithm <br>
**Theory:** In pattern recognition, the k-nearest neighbors algorithm (k-NN) is a non-parametric method proposed by Thomas Cover used for classification and regression.In both cases, the input consists of the k closest training examples in the feature space. The output depends on whether k-NN is used for classification or regression:

* In k-NN classification, the output is a class membership. An object is classified by a plurality vote of its neighbors, with the object being assigned to the class most common among its k nearest neighbors (k is a positive integer, typically small). If k = 1, then the object is simply assigned to the class of that single nearest neighbor.
* In k-NN regression, the output is the property value for the object. This value is the average of the values of k nearest neighbors.<br>




**Code:**<br>
Dataset used is **Banking Dataset**

In [1]:
import pandas as pd
import numpy as np    
import math
from random import seed
from random import randrange
from random import randint
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
df = pd.read_csv('bank-full.csv',sep=";")


In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df.shape


(45211, 17)

In [5]:
df['month'].value_counts()

may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: month, dtype: int64

In [6]:
col=['job','marital','education','default','housing','loan','contact','poutcome']

In [7]:
look_up = { 'jan':1,  'feb':2,  'mar':3,  'apr':4, 'may':5,
             'jun':6,  'jul':7,  'aug':8,  'sep':9, 'oct':10, 'nov':11, 'dec':12}

df['month'] = df['month'].apply(lambda x: look_up[x])

In [8]:
le=LabelEncoder()
for c in col:
    df[c]=le.fit_transform(df[c])
df    

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,5,261,1,-1,0,3,no
1,44,9,2,1,0,29,1,0,2,5,5,151,1,-1,0,3,no
2,33,2,1,1,0,2,1,1,2,5,5,76,1,-1,0,3,no
3,47,1,1,3,0,1506,1,0,2,5,5,92,1,-1,0,3,no
4,33,11,2,3,0,1,0,0,2,5,5,198,1,-1,0,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,11,977,3,-1,0,3,yes
45207,71,5,0,0,0,1729,0,0,0,17,11,456,2,-1,0,3,yes
45208,72,5,1,1,0,5715,0,0,0,17,11,1127,5,184,3,2,yes
45209,57,1,1,1,0,668,0,0,1,17,11,508,4,-1,0,3,no


In [9]:
df['y']=le.fit_transform(df['y'])
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,5,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,5,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,5,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,5,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,5,198,1,-1,0,3,0


##  KNN Implementation

In [10]:
class KNearestNeighbors(object):
    
    def __init__(self,k):
        self.k=k
    @staticmethod
    def _euclidean_distance(row1,row2):
        v1,v2 = np.array(row1),np.array(row2)
        d=0 
        for i in range(len(v1)-1):
            d += (v1[i]-v2[2])**2

        return np.sqrt(d)

    def predict(self,train_set,test_instance):
        distances=[]

        for i in range(len(train_set)):
            dist=self._euclidean_distance(train_set[i][:-1],test_instance)
            distances.append((train_set[i],dist))

        distances.sort(key=lambda x:x[1])

        neighbours=[]

        for i in range(self.k):

            neighbours.append(distances[i][0])

        classes={}

        for i in range(len(neighbours)):
            response = neighbours[i][-1]

            if response in classes:
                classes[response] += 1
            else:
                classes[response] = 1

        sorted_classes =sorted(classes.items(),key=lambda x:x[1],reverse=True)

        return sorted_classes[0][0]

    @staticmethod
    def evaluate(y_true,y_pred):
        n_correct=0
        for act,pred in zip(y_true,y_pred):
            if act==pred:
                n_correct +=1
        return n_correct / len(y_true)    

     

In [11]:
df.rename(columns={'y':'class'},inplace=True)
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,class
0,58,4,1,2,0,2143,1,0,2,5,5,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,5,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,5,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,5,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,5,198,1,-1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,11,977,3,-1,0,3,1
45207,71,5,0,0,0,1729,0,0,0,17,11,456,2,-1,0,3,1
45208,72,5,1,1,0,5715,0,0,0,17,11,1127,5,184,3,2,1
45209,57,1,1,1,0,668,0,0,1,17,11,508,4,-1,0,3,0


In [12]:
df=df[:600]
df.shape

(600, 17)

In [13]:
def train_test_split(dataset,test_size=0.25):
    n_test=int(len(dataset)*test_size)
    test_set=dataset.sample(n_test)
    train_set=[]
    
    for ind in dataset.index:
        if ind in test_set.index:
            continue
        train_set.append(dataset.iloc[ind])
        
    train_set = pd.DataFrame(train_set).astype(float).values.tolist()
    test_set=test_set.astype(float).values.tolist()
    return train_set,test_set

train_set,test_set=train_test_split(df)
len(train_set),len(test_set)

(450, 150)

In [14]:
knn=KNearestNeighbors(k=5)
preds=[]

for row in test_set:
    predictor_only = row[:-1]
    prediction = knn.predict(train_set,predictor_only)
    preds.append(prediction)

actual = np.array(test_set)[:,-1]
knn.evaluate(actual,preds)


0.9733333333333334

In [15]:
print(preds)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [16]:
from sklearn import metrics
# Print the confusion matrix

print(metrics.confusion_matrix(actual,preds))

# Print the precision and recall, among other metrics
print(metrics.classification_report(actual,preds, digits=3))

[[146   0]
 [  4   0]]
              precision    recall  f1-score   support

         0.0      0.973     1.000     0.986       146
         1.0      0.000     0.000     0.000         4

    accuracy                          0.973       150
   macro avg      0.487     0.500     0.493       150
weighted avg      0.947     0.973     0.960       150



  'precision', 'predicted', average, warn_for)
