In [3]:
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.datasets import fetch_openml
# NOTE(review): this statement sits in the middle of the import cell and fetches
# the "mnist_784" dataset from OpenML (a network download on first use), so the
# import cell is slow and needs connectivity. `dataset` is not referenced in any
# cell visible here -- confirm it is needed, and if so consider giving it its
# own data-loading cell.
dataset = fetch_openml("mnist_784")
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [71]:
from sklearn.neighbors import KDTree

class MyKNeighborsClassifier(BaseEstimator):
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Parameters
    ----------
    n_neighbors : int
        Number of nearest training samples that vote on each prediction.
    algorithm : {'brute', 'kd_tree'}, default='brute'
        Neighbour-search strategy. 'brute' computes the full pairwise distance
        matrix with ``scipy.spatial.distance.cdist``; 'kd_tree' queries a
        ``sklearn.neighbors.KDTree`` built during ``fit``.

    Attributes
    ----------
    X_train : array-like
        Training samples exactly as passed to ``fit``.
    y : array-like
        Training labels exactly as passed to ``fit``.
    classes_ : ndarray
        Sorted unique labels seen during ``fit``.
    tree : KDTree
        Only present when ``algorithm == 'kd_tree'``.
    """

    # Neighbour-search strategies implemented by fit/predict.
    _ALGORITHMS = ('brute', 'kd_tree')

    def __init__(self, n_neighbors, algorithm='brute'):
        # Parameters are stored untouched (scikit-learn convention, required for
        # get_params/clone); validation happens in fit/predict.
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm

    def fit(self, X, y):
        """Memorise the training set and, for 'kd_tree', build the search tree.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class labels; any dtype ``np.unique`` can sort (ints, negative
            ints, strings, ...).

        Returns
        -------
        self : MyKNeighborsClassifier
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``algorithm`` is not supported or X and y differ in length.
        """
        if self.algorithm not in self._ALGORITHMS:
            raise ValueError(
                "Unknown algorithm %r; expected one of %r"
                % (self.algorithm, self._ALGORITHMS)
            )

        # Convert labels once so that lists / pandas Series can be fancy-indexed
        # with a 2-D neighbour index array in predict.
        labels = np.asarray(y)
        n_samples = np.shape(X)[0]
        if n_samples != labels.shape[0]:
            raise ValueError(
                "X and y have inconsistent lengths: %d != %d"
                % (n_samples, labels.shape[0])
            )

        self.X_train = X
        self.y = y
        # Encode labels as 0..n_classes-1. np.bincount only accepts
        # non-negative integers, so voting on the codes (instead of on
        # y.astype(int)) also supports negative and non-numeric labels.
        self.classes_, self._y_codes = np.unique(labels, return_inverse=True)

        if self.algorithm == 'kd_tree':
            self.tree = KDTree(X)
        return self

    def predict(self, X):
        """Predict the majority label among the k nearest training samples.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        ndarray of shape (n_queries,)
            Predicted labels, drawn from ``classes_``. Ties are resolved in
            favour of the smallest label.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is not between 1 and the number of training
            samples.
        """
        k = self.n_neighbors
        n_train = self._y_codes.shape[0]
        if not 1 <= k <= n_train:
            raise ValueError(
                "n_neighbors must be between 1 and the number of training "
                "samples (%d), got %r" % (n_train, k)
            )

        if self.algorithm == 'brute':
            from scipy.spatial.distance import cdist
            distances = cdist(X, self.X_train)
            # kth = k - 1 puts the k smallest distances in the first k columns
            # and stays in bounds even when k == n_train (kth = k would not).
            neighbor_idx = np.argpartition(distances, k - 1, axis=1)[:, :k]
        else:
            _, neighbor_idx = self.tree.query(X, k=k)

        votes = self._y_codes[neighbor_idx]
        n_classes = len(self.classes_)
        # argmax returns the first maximum, i.e. the smallest class code on ties.
        winners = [
            np.bincount(row, minlength=n_classes).argmax() for row in votes
        ]
        return self.classes_[np.asarray(winners, dtype=np.intp)]

In [57]:
# Load scikit-learn's bundled Iris dataset; `iris.data` and `iris.target` feed
# the train/test splits used for the comparisons below.
iris = datasets.load_iris()

In [58]:
# Hold out 10% of Iris as the test set, stratified by class label.
# NOTE(review): no random_state is passed, so the split (and therefore the
# recorded predictions and the outcome of the accuracy assert) can change from
# run to run.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1, stratify=iris.target)

In [59]:
# Reference scikit-learn classifier and the custom implementation, configured
# identically (k=2, brute-force neighbour search) for a like-for-like comparison.
clf = KNeighborsClassifier(n_neighbors=2, algorithm='brute')
my_clf = MyKNeighborsClassifier(n_neighbors=2, algorithm='brute')

In [60]:
# Fit the reference and the custom classifier on the same training split.
clf.fit(X_train, y_train)
my_clf.fit(X_train, y_train)

In [61]:
# Predict the held-out samples with both models; the assert requires their
# accuracies to differ by less than 0.005.
sklearn_pred = clf.predict(X_test)
my_clf_pred = my_clf.predict(X_test)
assert abs( accuracy_score(y_test, my_clf_pred) -  accuracy_score(y_test, sklearn_pred ) )<0.005, "Score must be simillar"

In [62]:
%time clf.fit(X_train, y_train)

Wall time: 3 ms


KNeighborsClassifier(algorithm='brute', n_neighbors=2)

In [63]:
%time my_clf.fit(X_train, y_train)

Wall time: 0 ns


In [64]:
%time clf.predict(X_test)

Wall time: 10 ms


array([1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 1])

In [65]:
%time my_clf.predict(X_test)

Wall time: 2 ms


array([1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 1], dtype=int64)

In [67]:
# Rebind both classifiers to fresh instances that use KD-tree neighbour search
# (still k=2); they must be fitted again before predicting.
clf = KNeighborsClassifier(n_neighbors=2, algorithm='kd_tree')
my_clf = MyKNeighborsClassifier(n_neighbors=2, algorithm='kd_tree')

In [68]:
# Draw a new stratified 90/10 split for the kd_tree comparison, overwriting the
# previous X_train/X_test/y_train/y_test.
# NOTE(review): no random_state is passed, so this split differs from the
# earlier one and from run to run.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1, stratify=iris.target)

In [69]:
%time clf.fit(X_train, y_train)

Wall time: 43 ms


KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2)

In [72]:
%time my_clf.fit(X_train, y_train)

Wall time: 1e+03 µs


In [73]:
%time clf.predict(X_test)

Wall time: 23 ms


array([2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 2, 1, 2, 2])

In [74]:
%time my_clf.predict(X_test)


Wall time: 2 ms


array([2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 2, 1, 2, 2], dtype=int64)

In [75]:
# Same agreement check as for the brute-force run, now for the kd_tree models:
# accuracies of the two implementations must differ by less than 0.005.
sklearn_pred = clf.predict(X_test)
my_clf_pred = my_clf.predict(X_test)
assert abs( accuracy_score(y_test, my_clf_pred) -  accuracy_score(y_test, sklearn_pred ) )<0.005, "Score must be simillar"

In [None]:
newsgroups = fetch_20newsgroups(subset='train',remove=['headers','footers', 'quotes'])