In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv("Iris.csv", header="infer")
df = df.drop(columns=["Id"])
print(df)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species
0              5.1           3.5            1.4           0.2        0
1              4.9           3.0            1.4           0.2        0
2              4.7           3.2            1.3           0.2        0
3              4.6           3.1            1.5           0.2        0
4              5.0           3.6            1.4           0.2        0
..             ...           ...            ...           ...      ...
145            6.7           3.0            5.2           2.3        2
146            6.3           2.5            5.0           1.9        2
147            6.5           3.0            5.2           2.0        2
148            6.2           3.4            5.4           2.3        2
149            5.9           3.0            5.1           1.8        2

[150 rows x 5 columns]


In [2]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=float(input("Split ratio : ")), stratify=y) 
print(f"X_test : {X_test.shape}, y_test : {y_test.shape}")

Split ratio : 0.2
X_test : (30, 4), y_test : (30,)


In [3]:
class KNN:
    def __init__(self, k=5):
        self.k = k
        
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        
    def predict(self, X_test):
        distances = self._compute_distances(X_test)
        print(distances.shape)
        labels = self._get_top_k_labels(distances)
        y_pred = self._get_majority_vote(labels)
        return y_pred
    
    def _compute_distances(self, X_test):
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            dists[i] = np.sqrt(np.sum((X_test.iloc[i] - self.X_train)**2, axis=1))
        return dists
    
    def _get_top_k_labels(self, distances):
        num_test = distances.shape[0]
        labels = np.zeros((num_test, self.k))
        for i in range(num_test):
            idx = np.argsort(distances[i])[:self.k]
            labels[i] = self.y_train.iloc[idx]
        return labels
    
    def _get_majority_vote(self, labels):
        majority = []
        for label in labels:
            majority.append(int(np.max(label)))
        return majority

In [4]:
model = KNN(3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"{classification_report(y_test, y_pred)}")

(30, 120)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.70      0.82        10
           2       0.77      1.00      0.87        10

    accuracy                           0.90        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.92      0.90      0.90        30



In [5]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(3, weights='distance')
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Accuracy : {model.score(X_test, y_test)}")

Accuracy : 0.9333333333333333
