In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

In [2]:
data = load_diabetes()
print(data.keys())

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])


In [3]:
print(data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=123)

In [5]:
from math import sqrt
from collections import Counter

In [6]:
from collections import Counter

class KNN:
    def __init__(self, n_neighbors=3):
        self.k = n_neighbors
        self.X_train = []
        self.y_train = []

    def fit(self, X, y):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, a, b):
        return sum((ai - bi) ** 2 for ai, bi in zip(a, b)) ** 0.5

    def pred(self, X):
        predictions = []
        for x in X:
            distances = [
                (self._euclidean_distance(x, x_train), y)
                for x_train, y in zip(self.X_train, self.y_train)
            ]
            k_nearest = sorted(distances, key=lambda d: d[0])[:self.k]
            labels = [label for _, label in k_nearest]
            most_common = Counter(labels).most_common(1)[0][0]
            predictions.append(most_common)
        return predictions

    def score(self, X, y):
        y_pred = self.pred(X)
        correct = sum(1 for yp, yt in zip(y_pred, y) if yp == yt)
        return correct / len(y)

In [7]:
model = KNN()
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(acc)

0.011235955056179775


In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
k_model = KNeighborsClassifier(n_neighbors=3)
k_model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(acc)

0.011235955056179775
