In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
import math
class MyKNNClf():

    def __init__(self, k = 3, metric = 'euclidean', weights = 'uniform'):
        self.k = k
        self.train_size = None
        self.metric = metric
        self.weights = weights
        
    def __repr__(self):
        return f"MyKNNClf class: k={self.k}"

    def fit(self, X: pd.DataFrame, y: pd.Series):
        self.X_train = X
        self.y_train = y
        self.train_size = self.X_train.shape
        
        
    def metrics(self, X_test_row):
        if self.metric == 'chebyshev':
            return np.max(np.abs(self.X_train.values - X_test_row.values), axis = 1)
        elif self.metric == 'manhattan':
            return np.sum(np.abs(self.X_train.values - X_test_row.values), axis = 1)
        elif self.metric == 'cosin':
            return 1 - np.dot(self.X_train, X_test_row)/\
                    (np.linalg.norm(self.X_train) * np.linalg.norm(X_test_row))
        else:
            return np.sum((self.X_train.values-X_test_row.values)**2,axis = 1)**0.5

    def euclid(self, X_test_row):
        return np.where(self.euclid_proba(X_test_row) >= 0.5, 1, 0)
        
    def euclid_proba(self, X_test_row):
        dist = self.metrics(X_test_row)
        sort_val = np.argsort(dist)[:self.k]
        dist2 = np.sort(dist)[:self.k]
        y = self.y_train.iloc[sort_val]
        y.index = range(1, self.k+1, 1)
        if self.weights == 'uniform':
            y_pred = y.mean() 
        elif self.weights == 'rank':
            y_pred = np.sum(1/y.index[y==1])/np.sum(1/y.index)
        elif self.weights == 'distance':
            y_pred = np.sum(1/dist2[y.index[y==1]-1])/np.sum(1/dist2[y.index-1])
    
        return y_pred
    
    def predict(self, X_test):
        return X_test.apply(self.euclid, axis = 1)
    
    def predict_proba(self, X_test):
        return X_test.apply(self.euclid_proba, axis = 1)

In [3]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
model = MyKNNClf(k = 3, weights = 'distance')

In [7]:
model.fit(X_train, y_train)

In [8]:
sum(model.predict(X_test))

160

In [9]:
sum(model.predict_proba(X_test))

160.61200228787382

In [10]:
X_train.shape

(670, 14)

In [11]:
np.sqrt(((X_test_np[:,None,:] - X_train_np)**2).sum(axis=2))

NameError: name 'X_test_np' is not defined

In [None]:
X_train = np.array([[-1.21345377, -1.54174037],
                    [ 1.50513153, -0.23814019],
                    [ 1.15413283, -0.04085176],
                    [ 1.6248147 ,  0.11745924],
                    [ 0.41190036,  0.16466746],
                    [-0.56861959, -0.18716551],
                    [-0.20204102, -1.46273951],
                    [ 0.79519222,  0.31601044],
                    [-0.47799507,  0.55453979],
                    [-0.10039373, -0.70497529]])
X_test = np.array([[-1.54277153, -0.12430863],
                   [ 1.15146114,  0.08724635],
                   [-0.99423371, -0.77970332],
                   [-0.16547913,  0.79244826],
                   [-0.42078791,  0.81083815]])

In [None]:
X_train_np = X_train.values
X_test_np = X_test.values

In [None]:
X_train_np

In [None]:
X_train

In [None]:
X_test_np[:,None,:]

In [None]:
X_train_np

In [12]:
a = np.array([[1,2,3],[4,5,6],[7,8,9]])

In [13]:
a = a[:,None,:]