In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Importing custom KNN
from knn import KNearestNeighborsClf


In [3]:
# Load the iris table from disk and discard its 'Id' column in one expression,
# so `data` is bound exactly once per run of this cell.
data = pd.read_csv('iris.csv').drop(columns=['Id'])

In [4]:
"""
class 1 = 'Iris-setosa'
class 2 = 'Iris-versicolor'
class 3 = 'Iris-virginica'
"""
def encode_y(d):
    """Map an iris species name to its integer class id.

    Parameters
    ----------
    d : str
        One of 'Iris-setosa', 'Iris-versicolor' or 'Iris-virginica'.

    Returns
    -------
    int
        1 for 'Iris-setosa', 2 for 'Iris-versicolor', 3 for 'Iris-virginica'.

    Raises
    ------
    ValueError
        If ``d`` is not one of the three known species names. Failing loudly
        here prevents an unexpected or missing label from silently becoming
        ``None`` in the target column.
    """
    species_to_class = {
        'Iris-setosa': 1,
        'Iris-versicolor': 2,
        'Iris-virginica': 3,
    }
    try:
        return species_to_class[d]
    except (KeyError, TypeError):
        # KeyError: unknown label (including NaN); TypeError: unhashable input.
        raise ValueError(f"Unknown iris species label: {d!r}") from None


# Add the integer-encoded target as 'species', then remove the original
# text column 'Species' from the frame.
data['species'] = data['Species'].apply(encode_y)
data.drop(columns=['Species'], inplace=True)

In [5]:
# Features: the four measurement columns. Target: the encoded 'species' column.
x = data.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = data['species']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    np.asarray(x),
    np.asarray(y),
    test_size=0.33,
    random_state=42,
)

<h3><b>Implementing scikit-learn's KNN</b></h3>

In [6]:
"""
p int, default=2
Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and
euclidean_distance (l2) for p = 2. 
For arbitrary p, minkowski_distance (l_p) is used.
"""

# k = 5 neighbours, Minkowski metric with p = 2.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [7]:
# Predicted class labels from the scikit-learn KNN for the held-out test rows.
knn.predict(x_test)

array([2, 1, 3, 2, 2, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 3,
       1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 3, 2, 2, 1,
       1, 2, 2, 3, 2, 3], dtype=int64)

In [11]:
# Per-class probability estimates from the scikit-learn KNN,
# limited to the first 50 test rows.
knn.predict_proba(x_test)[:50]

array([[0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0.6, 0.4],
       [0. , 1. , 0. ],
       [0. , 0.2, 0.8],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.6, 0.4],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0.2, 0.8],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.4, 0.6],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.2, 0.8],
       [0. , 0.8

In [12]:
# Confusion matrix of the scikit-learn KNN predictions against the true test labels.
confusion_matrix(y_test,knn.predict(x_test))

array([[19,  0,  0],
       [ 0, 15,  0],
       [ 0,  1, 15]], dtype=int64)

In [13]:
# Accuracy of the scikit-learn KNN predictions on the test split.
accuracy_score(y_test,knn.predict(x_test))

0.98

<h4><b>Custom KNN implementation with distance_metric='eucledian'</b></h4>

In [14]:
# Custom classifier imported from the local `knn` module, with the same
# neighbour count (5) as the scikit-learn model.
# NOTE(review): 'eucledian' is the spelling passed throughout this notebook;
# presumably it selects Euclidean distance — confirm against knn.
clf = KNearestNeighborsClf(n_neighbors=5,distance_metric='eucledian')

In [15]:
# Fit the custom classifier on the training split.
clf.fit(x_train,y_train)

KNearestNeighborsClf(n_neighbors=5,distance_metric=eucledian)


In [16]:
# Store the custom classifier's test predictions; the metric cells below read y_pr.
y_pr=clf.predict(x_test)

In [17]:
# Per-class probabilities from the custom classifier for the first 50 test rows.
# The recorded output is a plain nested list rather than an ndarray.
clf.predict_proba(x_test)[:50]

[[0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.6, 0.4],
 [0.0, 1.0, 0.0],
 [0.0, 0.2, 0.8],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.6, 0.4],
 [0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.2, 0.8],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.4, 0.6],
 [0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.2, 0.8],
 [0.0, 0.8, 0.2],
 [0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0],
 [0.0, 0.6, 0.4],
 [0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0]]

In [18]:
# True test labels, displayed for visual comparison with the predictions.
y_test

array([2, 1, 3, 2, 2, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 3,
       1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 3, 2, 2, 1,
       1, 2, 3, 3, 2, 3], dtype=int64)

In [19]:
# Custom-classifier predictions for the same test rows.
y_pr

array([2, 1, 3, 2, 2, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 3,
       1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 3, 2, 2, 1,
       1, 2, 2, 3, 2, 3], dtype=int64)

In [20]:
# Accuracy of the predictions currently stored in y_pr.
accuracy_score(y_test,y_pr)

0.98

In [21]:
# Confusion matrix of the predictions currently stored in y_pr against the true test labels.
confusion_matrix(y_test,y_pr)

array([[19,  0,  0],
       [ 0, 15,  0],
       [ 0,  1, 15]], dtype=int64)

<h3><b>Creating a Synthetic Dataset</b></h3>

In [22]:
# Synthetic binary classification problem: 3000 samples, 5 features
# (3 informative + 2 redundant), with weights=[0.7] skewing the class balance.
#
# random_state is fixed for both the generator and the split so that
# Restart & Run All reproduces the same data, the same stratified split and
# therefore the same metrics. Without a seed every run gives different numbers.
# NOTE: outputs saved in this notebook came from an unseeded run, so the
# recorded values will change the first time these cells are re-executed.
#
# These assignments rebind x, y and the train/test arrays used for iris above.
x, y = make_classification(
    n_samples=3000,
    n_features=5,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    weights=[0.7],
    random_state=42,
)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

<h3><b>Implementing scikit-learn's KNN</b></h3>

In [23]:
"""
p int, default=2
Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and
euclidean_distance (l2) for p = 2. 
For arbitrary p, minkowski_distance (l_p) is used.
"""

# Same settings as the iris model (k = 5, Minkowski with p = 2),
# now fitted on the synthetic training split.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(x_train, y_train)

# Last expression of the cell: predicted labels for the synthetic test rows.
knn.predict(x_test)


array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,

In [24]:
# Per-class probability estimates from the scikit-learn KNN for the first 50 synthetic test rows.
knn.predict_proba(x_test)[:50]


array([[0. , 1. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.2, 0.8],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.8, 0.2],
       [1. , 0. ],
       [0. , 1. ],
       [0. , 1. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0.4, 0.6],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [1. , 0. ],
       [0.8, 0.2]])

In [25]:
# Confusion matrix of the scikit-learn KNN predictions on the synthetic test split.
confusion_matrix(y_test,knn.predict(x_test))

array([[604,  23],
       [ 38, 235]], dtype=int64)

In [26]:
# Accuracy of the scikit-learn KNN predictions on the synthetic test split.
accuracy_score(y_test,knn.predict(x_test))

0.9322222222222222

<h3><b>Implementing Custom KNN with distance_metric: 'eucledian'</b></h3>

In [27]:
# Custom KNN, 5 neighbours, 'eucledian' metric, on the synthetic split.
clf1 = KNearestNeighborsClf(
    n_neighbors=5,
    distance_metric='eucledian',
)
clf1.fit(x_train, y_train)

# Predictions are stored in y_pr for the metric cells that follow.
y_pr = clf1.predict(x_test)


KNearestNeighborsClf(n_neighbors=5,distance_metric=eucledian)


In [28]:
# Per-class probabilities from the custom 'eucledian' model for the first 50 test rows.
clf1.predict_proba(x_test)[:50]

[[0.0, 1.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [0.4, 0.6],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.8, 0.2]]

In [29]:
# Accuracy of the predictions currently stored in y_pr (set by the fit/predict cell above).
accuracy_score(y_test,y_pr)


0.9322222222222222

In [30]:
# Confusion matrix of the predictions currently stored in y_pr against the true test labels.
confusion_matrix(y_test,y_pr)

array([[604,  23],
       [ 38, 235]], dtype=int64)

<h3><b>Implementing Custom KNN with distance_metric: 'manhattan'</b></h3>

In [31]:
# Custom KNN, 5 neighbours, 'manhattan' metric, on the synthetic split.
# Rebinds `clf`, which previously held the iris model.
clf = KNearestNeighborsClf(
    n_neighbors=5,
    distance_metric='manhattan',
)
clf.fit(x_train, y_train)

# Overwrites y_pr with this model's test predictions.
y_pr = clf.predict(x_test)


KNearestNeighborsClf(n_neighbors=5,distance_metric=manhattan)


In [32]:
# Per-class probabilities from the custom 'manhattan' model for the first 50 test rows.
clf.predict_proba(x_test)[:50]

[[0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.6, 0.4],
 [1.0, 0.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [0.6, 0.4],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [0.8, 0.2]]

In [33]:
# Accuracy of the predictions currently stored in y_pr (set by the fit/predict cell above).
accuracy_score(y_test,y_pr)


0.9344444444444444

In [34]:
# Confusion matrix of the predictions currently stored in y_pr against the true test labels.
confusion_matrix(y_test,y_pr)

array([[605,  22],
       [ 37, 236]], dtype=int64)

<h3><b>Implementing Custom KNN with distance_metric: 'cosine_similarity'</b></h3>

In [35]:
# Custom KNN, 5 neighbours, 'cosine_similarity' metric, on the synthetic split.
# Rebinds `clf`, replacing the 'manhattan' model.
clf = KNearestNeighborsClf(
    n_neighbors=5,
    distance_metric='cosine_similarity',
)
clf.fit(x_train, y_train)

# Overwrites y_pr with this model's test predictions.
y_pr = clf.predict(x_test)


KNearestNeighborsClf(n_neighbors=5,distance_metric=cosine_similarity)


In [36]:
# Per-class probabilities from the custom 'cosine_similarity' model for the first 50 test rows.
clf.predict_proba(x_test)[:50]

[[0.2, 0.8],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.2, 0.8],
 [0.0, 1.0],
 [0.2, 0.8],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [0.2, 0.8],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.6, 0.4],
 [0.8, 0.2],
 [1.0, 0.0],
 [0.8, 0.2],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.8, 0.2],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.6, 0.4],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.6, 0.4],
 [0.0, 1.0],
 [0.4, 0.6],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.8, 0.2]]

In [37]:
# Accuracy of the predictions currently stored in y_pr (set by the fit/predict cell above).
accuracy_score(y_test,y_pr)


0.9

In [39]:
# Confusion matrix of the predictions currently stored in y_pr against the true test labels.
confusion_matrix(y_test,y_pr)

array([[590,  37],
       [ 53, 220]], dtype=int64)