# K近邻分类算法
## 公式笔记

* Lp距离公式
$$ 
L_{p}\left(x_{i}, x_{j}\right)=\left(\sum_{l=1}^{n}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|^{p}\right)^{\frac{1}{p}}
$$

* $p=2$ 时为欧氏距离
$$ 
L_{2}\left(x_{i}, x_{j}\right)=\left(\sum_{l=1}^{n}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|^{2}\right)^{\frac{1}{2}}
 $$

* $p=1$ 时为曼哈顿距离
$$ 
L_{1}\left(x_{i}, x_{j}\right)=\sum_{l=1}^{n}\left|x_{i}^{(l)}-x_{j}^{(l)}\right|
 $$

## KNNClassifier实现


In [129]:
import numpy as np
class kdTreeNode:
    """One node of the kd-tree.

    All nodes of a tree reference the same data matrix ``x``; a node only
    stores the row indices of the samples it covers.
    """

    def __init__(self, x, x_index, parent):
        """Create a node covering rows ``x_index`` of the shared matrix ``x``.

        Args:
            x: Data matrix shared by every node of the tree.
            x_index: Row indices into ``x`` covered by this node.
            parent: Parent node, or ``None`` for the root.
        """
        self.x = x
        self.x_index = x_index
        self.parent = parent
        # The remaining attributes are filled in by kdTree._fit.
        self.edge_nodes_x_index = None  # rows whose split value equals the median
        self.dim_index = None  # feature column this node splits on
        self.left = None  # child holding rows strictly below the median
        self.right = None  # child holding rows strictly above the median


class kdTree:
    """kd-tree over the rows of a 2-D array with exact k-nearest-neighbour search.

    Each node splits its rows on one feature column at the (upper) median:
    rows equal to the median stay on the node, smaller rows go to the left
    child and larger rows to the right child. The split column cycles with
    the depth of the node.
    """

    def __init__(self, distance_method='o'):
        """Select the metric.

        Args:
            distance_method: ``'m'`` selects the Manhattan distance; any
                other value selects the Euclidean distance.
        """
        self.dim_len = None
        self.x = None
        self.root = None
        if distance_method == 'm':
            self._dist_method = self._manhattan_distance
            self._dist_method_2d = self._manhattan_distance_2d
        else:
            self._dist_method = self._euler_dist
            self._dist_method_2d = self._euler_dist_2d

    def _euler_dist(self, x1, x):
        """Return the Euclidean distance between two points."""
        diff = x - x1
        return np.sqrt(np.multiply(diff, diff).sum())

    def _manhattan_distance(self, x1, x):
        """Return the Manhattan distance between two points."""
        return np.abs(x - x1).sum()

    def _euler_dist_2d(self, x2d, x):
        """Return the Euclidean distance from every row of ``x2d`` to ``x``."""
        diff = x2d - x
        # The square root keeps this consistent with _euler_dist and lets the
        # search compare the result directly with a per-axis gap.
        return np.sqrt(np.sum(np.multiply(diff, diff), axis=1))

    def _manhattan_distance_2d(self, x2d, x):
        """Return the Manhattan distance from every row of ``x2d`` to ``x``."""
        return np.sum(np.abs(x2d - x), axis=1)

    def _median(self, arr):
        """Return the upper median of ``arr`` (always an element of ``arr``)."""
        return np.sort(arr)[len(arr) // 2]

    def _build_child_node(self, x, x_index, parent):
        """Return a node for ``x_index``, or ``None`` when it covers no rows."""
        if x_index.size == 0:
            return None
        return kdTreeNode(x, x_index, parent)

    def _fit(self, dim_len, current_dim, node):
        """Recursively split ``node`` on ``current_dim`` and build its children."""
        if node is None:
            return
        node.dim_index = current_dim
        column = node.x[node.x_index, current_dim]
        median_value = self._median(column)
        # Select from the node's own integer index array instead of reading
        # the appended index column back: that column is float whenever the
        # data is float and cannot be used for indexing.
        node.edge_nodes_x_index = node.x_index[column == median_value]
        node.left = self._build_child_node(
            node.x, node.x_index[column < median_value], node)
        node.right = self._build_child_node(
            node.x, node.x_index[column > median_value], node)
        next_dim = (current_dim + 1) % dim_len
        self._fit(dim_len, next_dim, node.left)
        self._fit(dim_len, next_dim, node.right)

    def fit(self, x):
        """Build the tree from the rows of ``x``.

        Args:
            x: 2-D array-like of shape (n_samples, n_features).
        """
        x = np.asarray(x)
        self.dim_len = x.shape[1]
        row_ids = np.arange(len(x))
        # Keep the original row index as an extra last column of self.x.
        self.x = np.hstack((x, row_ids.reshape(-1, 1)))
        # An empty data set yields no root instead of failing in _median.
        self.root = self._build_child_node(self.x, row_ids, None)
        self._fit(self.dim_len, 0, self.root)

    def _collect_nearest(self, target_x, node, k, best):
        """Branch-and-bound descent that updates ``best`` in place.

        ``best`` is ``[distances, indices]``: the up-to-``k`` closest rows
        found so far, ordered by (distance, row index).
        """
        if node is None:
            return
        edge_idx = node.edge_nodes_x_index
        edge_dist = self._dist_method_2d(node.x[edge_idx][:, :-1], target_x)
        dists = np.concatenate((best[0], edge_dist))
        idxs = np.concatenate((best[1], edge_idx))
        keep = np.lexsort((idxs, dists))[:k]  # primary key: distance
        best[0], best[1] = dists[keep], idxs[keep]

        # Signed offset of the query from this node's splitting plane.
        gap = target_x[node.dim_index] - node.x[edge_idx[0], node.dim_index]
        if gap <= 0:
            near, far = node.left, node.right
        else:
            near, far = node.right, node.left
        self._collect_nearest(target_x, near, k, best)
        # Every row on the far side is at least |gap| away, so that side only
        # matters while the result is incomplete or the plane is within reach
        # of the current k-th best distance ("<=" keeps ties deterministic).
        if best[0].size < k or abs(gap) <= best[0][-1]:
            self._collect_nearest(target_x, far, k, best)

    def search_nreaest_k(self, target_x, k):
        """Return the row indices of the ``k`` rows nearest to ``target_x``.

        Args:
            target_x: Query point with ``dim_len`` coordinates.
            k: Number of neighbours wanted.

        Returns:
            Integer array of row indices ordered from nearest to farthest
            (ties broken by smaller row index); an empty array when
            ``k < 1``; ``None`` when the dimensionality does not match the
            fitted data or when ``k`` exceeds the number of fitted rows (a
            message is printed in the latter case).
        """
        if self.dim_len != len(target_x):
            return None
        if k < 1:
            return np.empty(0, dtype=int)
        n_rows = 0 if self.root is None else self.root.x_index.shape[0]
        if k > n_rows:
            print('k too large')
            return None
        best = [np.empty(0), np.empty(0, dtype=int)]
        self._collect_nearest(np.asarray(target_x), self.root, k, best)
        return best[1]

    def show(self):
        """Print the stored matrix and every node in breadth-first order."""
        from collections import deque

        print('original x:')
        print(self.x)
        pending = deque()
        if self.root is not None:
            pending.append(self.root)
        while pending:
            node = pending.popleft()
            print('edge x')
            print(node.x[node.edge_nodes_x_index])
            print('inner x')
            print(node.x[node.x_index])
            if node.left:
                pending.append(node.left)
            if node.right:
                pending.append(node.right)

# Demo: six 2-D sample points, indexed and then queried for the single
# nearest neighbour of (5, 7).
test_x = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
tree = kdTree()
tree.fit(test_x)
# Uncomment to dump every node of the fitted tree:
# tree.show()
tree.search_nreaest_k(np.array([5, 7]), 1)



array([3])

* kd树已实现书中所述的最近邻搜索，按书中示例验证，结果正确；但目前还无法正确完成一般的k近邻搜索

In [134]:
import numpy as np
from math import *

class KNNClassifier:
    """k-nearest-neighbour classifier backed by a kd-tree (search not implemented yet)."""

    def __init__(self, distance_method="o"):
        """Select the metric.

        Args:
            distance_method: ``'m'`` selects the Manhattan distance; any
                other value selects the Euclidean distance.
        """
        self.kd_tree = None
        self.x = None
        self.y = None
        # Remembered so that fit() builds the kd-tree with the same metric.
        self._distance_method = distance_method
        if distance_method == 'm':
            self._dist_method = self._manhattan_distance
        else:
            self._dist_method = self._euler_dist

    def _euler_dist(self, x1, x):
        """Return the Euclidean distance between two points."""
        diff = x - x1
        return np.sqrt(np.multiply(diff, diff).sum())

    def _manhattan_distance(self, x1, x):
        """Return the Manhattan distance between two points."""
        return np.abs(x - x1).sum()

    def fit(self, x, y):
        """Store the training data and index ``x`` in a kd-tree.

        Args:
            x: Training samples, passed on to ``kdTree.fit``.
            y: Labels belonging to ``x``; stored unchanged.

        Returns:
            ``self``, so calls can be chained.
        """
        self.x = x
        self.y = y
        # Forward the configured metric; the tree previously always used its
        # default metric regardless of what the classifier was asked to use.
        self.kd_tree = kdTree(self._distance_method)
        self.kd_tree.fit(x)
        return self

    def searchKNeighbour(self, x):
        """Placeholder for the neighbour search; not implemented, returns ``None``."""
        return None
# Six 2-D sample points (same values as the kd-tree demo above).
test_x = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])