In [3]:
from scipy.spatial.distance import euclidean
import numpy as np
from scipy import stats as st

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split

from abc import ABC, abstractmethod
from collections import defaultdict

# k-Nearest Neighbors Algorithm

Для каждого наблюдения $x_{test} \in X_{test}$:
1. Рассчитать дистанцию $d$ от $x_{test}$ до каждой точки обучающей выборки $x_{train_i}, \ i=1,...,N$ (каждая точка — вектор из $P$ признаков $x_{train_{ij}}, \ j=1,...,P$)
2. Определить значения $y_{train}$ для $k$ ближайших точек на основе $d$
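3. Для регрессии предсказать значение $y_{test}$ как среднее значение $y_{train}$ этих $k$ соседей, для классификации — как их моду (при взвешивании по расстоянию — взвешенное среднее и класс с наибольшим суммарным весом соответственно)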

# Implementation

In [7]:
class BasekNNEstimator(ABC):
    '''
    Base class for the k-nearest-neighbours algorithm.

    Holds the hyper-parameters and the training data. Subclasses implement
    `_get_prediction`, which turns one query point into a single prediction.
    '''

    # Weighting schemes that `_get_weights` knows how to handle.
    _SUPPORTED_WEIGHTS = ('uniform', 'distance')

    def __init__(
        self,
        k: int = 3,
        distance_func: callable = euclidean,
        weights: str = 'uniform'
    ) -> None:
        '''
        Store the hyper-parameters.

        k: number of nearest neighbours used for one prediction.
        distance_func: callable taking two points and returning their distance.
        weights: 'uniform' (every neighbour has weight 1) or 'distance'
            (every neighbour is weighted by the inverse of its distance).
        '''
        self.k = k
        self.distance = distance_func
        self.weights = weights

    def fit(self, X: np.array, y: np.array) -> None:
        '''
        Memorise the training observations `X` and their targets `y`.

        Raises ValueError if `k` is smaller than 1, if `weights` is not a
        supported scheme, or if `X` and `y` differ in length.
        '''
        # A non-positive k would otherwise be used as a slice bound and quietly
        # select the wrong neighbours (e.g. k=-1 means "all but the last").
        if self.k < 1:
            raise ValueError(f'k must be a positive integer, got {self.k!r}')
        if self.weights not in self._SUPPORTED_WEIGHTS:
            raise ValueError(
                f'weights must be one of {self._SUPPORTED_WEIGHTS}, got {self.weights!r}'
            )
        # Subclasses pair X and y with zip(), which silently drops the surplus
        # of the longer sequence, so a mismatch has to be rejected here.
        if len(X) != len(y):
            raise ValueError(
                f'X and y must have the same length, got {len(X)} and {len(y)}'
            )
        self._X = X
        self._y = y

    @abstractmethod
    def _get_prediction(self, x_query: np.array) -> float:
        '''
        Return the prediction for one observation, based on the targets of its
        nearest neighbours in the training set.
        '''
        pass

    def _get_weights(self, nearest_neigbors: np.array) -> list[tuple]:
        '''
        Return a `(weight, target)` pair for every `(distance, target)` pair
        in `nearest_neigbors`.

        Raises ValueError if `self.weights` is not a supported scheme.
        '''
        if self.weights == 'uniform':
            return [(1, y_train) for _, y_train in nearest_neigbors]
        if self.weights == 'distance':
            # The small constant keeps the division finite when the distance is 0.
            return [(1 / (distance + 1e-9), y_train) for distance, y_train in nearest_neigbors]
        # Previously this fell through and returned None, which only failed
        # later with an unrelated-looking error in the subclasses.
        raise ValueError(
            f'weights must be one of {self._SUPPORTED_WEIGHTS}, got {self.weights!r}'
        )

    def predict(self, X: np.array) -> np.array:
        '''
        Return an array with one prediction per observation in `X`.
        '''
        return np.array([self._get_prediction(x_query) for x_query in X])

In [8]:
class kNNClassifier(BasekNNEstimator):
    '''
    k-nearest-neighbours classifier: predicts the class with the largest
    total neighbour weight.
    '''

    def _get_prediction(self, x_query: np.array) -> int:
        '''
        Return the class whose neighbours, among the `k` training points
        closest to `x_query`, carry the largest summed weight.
        '''
        scored = []
        for features, label in zip(self._X, self._y):
            scored.append((self.distance(features, x_query), label))
        # Tuples sort by distance first; equal distances fall back to comparing labels.
        scored.sort()
        nearest = scored[:self.k]

        # Accumulate the vote of every neighbour under its class label.
        totals = {}
        for vote, label in self._get_weights(nearest):
            totals[label] = totals.get(label, 0) + vote

        # max() over (total, label) pairs: on equal totals the larger label wins.
        ranked = [(total, label) for label, total in totals.items()]
        return max(ranked)[1]

In [9]:
class kNNRegressor(BasekNNEstimator):
    '''
    k-nearest-neighbours regressor: predicts the weighted mean of the
    neighbours' targets.
    '''

    def _get_prediction(self, x_query: np.array) -> float:
        '''
        Return the weighted average target of the `k` training points
        closest to `x_query`.
        '''
        scored = []
        for features, target in zip(self._X, self._y):
            scored.append((self.distance(features, x_query), target))
        # Tuples sort by distance first; equal distances fall back to comparing targets.
        scored.sort()

        # Each row of `pairs` is (weight, target).
        pairs = np.array(self._get_weights(scored[:self.k]))
        neighbor_weights = pairs[:, 0]
        neighbor_targets = pairs[:, 1]
        return np.sum(neighbor_weights * neighbor_targets) / np.sum(neighbor_weights)

In [10]:
# Synthetic binary classification problem, split 80/20 into train and test parts.
X, y = make_classification(
    n_samples=1000,
    n_features=6,
    n_informative=3,
    n_classes=2,
    flip_y=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Compare the custom classifier with scikit-learn's implementation (k=3, uniform weights).
model_custom = kNNClassifier(k=3)
model_sklearn = KNeighborsClassifier(n_neighbors=3)
model_custom.fit(X_train, y_train)
model_sklearn.fit(X_train, y_train)

custom_pred = model_custom.predict(X_test)
sklearn_pred = model_sklearn.predict(X_test)

# Built-in round() works whether the metric comes back as a NumPy scalar or as a
# plain Python float; the `.round()` method exists only on NumPy scalars.
print(f'custom accuracy score: {round(accuracy_score(y_test, custom_pred), 3)}')
print(f'sklearn accuracy score: {round(accuracy_score(y_test, sklearn_pred), 3)}')

custom accuracy score: 0.83
sklearn accuracy score: 0.83


In [12]:
# Compare the custom classifier with scikit-learn's implementation (k=17, distance weights).
model_custom = kNNClassifier(k=17, weights='distance')
model_sklearn = KNeighborsClassifier(n_neighbors=17, weights='distance')
model_custom.fit(X_train, y_train)
model_sklearn.fit(X_train, y_train)

custom_pred = model_custom.predict(X_test)
sklearn_pred = model_sklearn.predict(X_test)

# Built-in round() works whether the metric comes back as a NumPy scalar or as a
# plain Python float; the `.round()` method exists only on NumPy scalars.
print(f'custom accuracy score: {round(accuracy_score(y_test, custom_pred), 3)}')
print(f'sklearn accuracy score: {round(accuracy_score(y_test, sklearn_pred), 3)}')

custom accuracy score: 0.87
sklearn accuracy score: 0.87


In [13]:
# Synthetic regression problem, split 80/20 into train and test parts.
X, y = make_regression(
    n_samples=1000,
    n_features=5,
    n_informative=4,
    noise=20,
    random_state=69,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [14]:
# Compare the custom regressor with scikit-learn's implementation (k=3, uniform weights).
model_custom = kNNRegressor(k=3)
model_sklearn = KNeighborsRegressor(n_neighbors=3)
model_custom.fit(X_train, y_train)
model_sklearn.fit(X_train, y_train)

custom_pred = model_custom.predict(X_test)
sklearn_pred = model_sklearn.predict(X_test)

# The metric is mean squared error (lower is better), so it is labelled as MSE
# rather than as an accuracy score.
# Built-in round() works whether the metric comes back as a NumPy scalar or as a
# plain Python float; the `.round()` method exists only on NumPy scalars.
print(f'custom MSE: {round(mean_squared_error(y_test, custom_pred), 3)}')
print(f'sklearn MSE: {round(mean_squared_error(y_test, sklearn_pred), 3)}')

custom accuracy score: 2351.277
sklearn accuracy score: 2351.277


In [15]:
# Compare the custom regressor with scikit-learn's implementation (k=5, distance weights).
model_custom = kNNRegressor(k=5, weights='distance')
model_sklearn = KNeighborsRegressor(n_neighbors=5, weights='distance')
model_custom.fit(X_train, y_train)
model_sklearn.fit(X_train, y_train)

custom_pred = model_custom.predict(X_test)
sklearn_pred = model_sklearn.predict(X_test)

# The metric is mean squared error (lower is better), so it is labelled as MSE
# rather than as an accuracy score.
# Built-in round() works whether the metric comes back as a NumPy scalar or as a
# plain Python float; the `.round()` method exists only on NumPy scalars.
print(f'custom MSE: {round(mean_squared_error(y_test, custom_pred), 3)}')
print(f'sklearn MSE: {round(mean_squared_error(y_test, sklearn_pred), 3)}')

custom accuracy score: 1988.318
sklearn accuracy score: 1988.318
