In [7]:
from tree import DecisionTreeNode
import numpy as np
import operator
import copy
from collections import Counter
import time

class kdTreeNode(DecisionTreeNode):
    def __init__(self,  label=None, feature_name=None, feature=None,value=None,split_point=None,data_index=None):
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        self.value=value
        self.split_point=split_point
        self.data_index=data_index
        
        self.left_hrect=None
        self.right_hrect=None


class KNNClassifier():
    def __init__(self,n_neighbors=5):
        
        self.n_neighbors=n_neighbors
        self.kdtree=None
        
    def create_kdtree(self,data,label,depth,hrect):
            m,n=np.shape(data)
            if m==0:return None
            k=depth%n   #根据深度得当前用来构造树的特征维度k
            
            datasets=np.hstack((data,label.reshape((-1,1))))
            datasets=sorted(datasets,key=operator.itemgetter(k))
            datasets=np.array(datasets)
            
            new_node=kdTreeNode()
            new_node.label=[k,datasets[m//2,:-1],datasets[m//2,-1]]
            
            left_hrect = hrect.copy()
            right_hrect = hrect.copy()
            left_hrect[k,1]=datasets[m//2,k]
            right_hrect[k,0]=datasets[m//2,k]
            new_node.left_hrect=left_hrect
            new_node.right_hrect=right_hrect
            
            if m==1:return new_node
            
            new_node.tree['left']=self.create_kdtree(datasets[:m//2,:-1],datasets[:m//2,-1],depth+1,left_hrect)
            if m>2:new_node.tree['right']=self.create_kdtree(datasets[m//2+1:,:-1],datasets[m//2+1:,-1],depth+1,right_hrect)
                
            return new_node    
    
    def fit(self,X,y):
        #创建区域    
        hrect=np.zeros((X.shape[1],2))
        hrect[:,0]=X.min(axis=0)
        hrect[:,1]=X.max(axis=0)

        self.kdtree=self.create_kdtree(X,y,0,hrect)
    
    def find_leaf_node(self,datapoint):
        #寻找叶接待你
        
        stack=[]  #保存搜索路径
        node=self.kdtree
        while node!=None:
            stack.append(node)
            curr_k,curr_splitpoint=node.label[0],node.label[1]   #当前用来划分的数据及其维度
            if datapoint[curr_k]<curr_splitpoint[curr_k]:
                node=node.tree.get('left')
            else:node=node.tree.get('right')
        if stack[-1].tree.get('left')!=None and \
        self.euclidean_distance(datapoint,node.tree['left'].label[1])<self.euclidean_distance(datapoint,node.label[1]):
            node=node.tree['left']
            stack.append(node)
        return stack
    
    def euclidean_distance(self,data,radius):
        #欧氏距离
        if len(data.shape)==1:data=data.reshape((1,-1))
        d=np.sqrt(np.sum((data-radius)**2,axis=1))
        return d
    
    def intersect(self,hrect,radius,centroid):
        #判断质心为centroid，半径为radius的超球体与区域hrect是否相交
        p=centroid.copy()
        idx=p<hrect[:,0]
        p[idx]=hrect[idx,0]
        idx=p>hrect[:,1]
        p[idx]=hrect[idx,1]
        return self.euclidean_distance(p,centroid)<radius
        
    def find_nearest_k_node(self,datapoint):
        #寻找离datapoint最近的n_neighbors个点
        knn = [(np.inf, None)]*self.n_neighbors   #最近点到datapoint的距离及其标签

        def dfs(node):
            nonlocal knn
            if node==None:return
            d=self.euclidean_distance(datapoint,node.label[1])
            if d<knn[-1][0]:
                knn.pop()
                knn=sorted(knn+[(d,node.label[2])])
            
            #看以datapoint为质心，n_neighbors个点中最远距离为半径的超球体与左区域是否相交
            if self.intersect(node.left_hrect, knn[-1][0], datapoint):   
                dfs(node.tree.get('left'))
            if self.intersect(node.right_hrect, knn[-1][0], datapoint):
                dfs(node.tree.get('right'))
        dfs(self.kdtree)
        return knn
    
    def predict(self,data):
        pred=[]
        for x in data:
            knn=self.find_nearest_k_node(x)
 
            labels=[x[1] for x in knn]
            label=sorted(Counter(labels).items(),key=operator.itemgetter(1))[-1][0]
            pred.append(label)
        pred=np.array(pred)
        return pred

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

print('================================KNN分类结果================================')
model=KNNClassifier(n_neighbors=5)
model.fit(X_train,y_train)
t1=time.time()
y_pre=model.predict(X_test)
t2=time.time()
print('预测结果：',y_pre)
print('耗时：',(t2-t1))
print('正确率：',np.sum([1 if x==y else 0 for x,y in zip(y_pre,y_test)])/len(y_test))

print('================================KNN_sklearn实现================================')
from sklearn.neighbors import KNeighborsClassifier

clf=KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train,y_train)
t1==time.time()
y_pre=clf.predict(X_test)
t2=time.time()
print('预测结果：',y_pre)
print('耗时：',(t2-t1))
print('正确率：',np.sum([1 if x==y else 0 for x,y in zip(y_pre,y_test)])/len(y_test))

print('================================不用kd树，直接求距离================================')
def f(datasets,y,datapoint,k):
    d=np.sqrt(np.sum((datasets-datapoint)**2,axis=1))
    dd=[(x,y) for x,y in zip(d,y)]
    dd.sort()
    return dd[:k]
y_pred=[]
t1=time.time()
for data in X_test:
    res=f(X_train,y_train,data,5)
    labels=[x[-1] for x in res]
    label=sorted(Counter(labels).items(),key=operator.itemgetter(1))[-1][0]
    y_pred.append(label)
t2=time.time()
print('预测结果：',y_pred)
print('耗时：',(t2-t1))
print('正确率：',np.sum([1 if x==y else 0 for x,y in zip(y_pre,y_test)])/len(y_test))

预测结果： [2. 1. 2. 2. 0. 0. 2. 0. 0. 2. 0. 0. 2. 2. 0. 0. 0. 2. 2. 2. 1. 2. 2. 0.
 0. 1. 1. 2. 2. 0.]
耗时： 0.18019628524780273
正确率： 0.9333333333333333
预测结果： [2 1 2 2 0 0 2 0 0 2 0 0 2 2 0 0 0 2 2 2 1 2 2 0 0 1 1 2 2 0]
耗时： 8.621437788009644
正确率： 0.9333333333333333
预测结果： [2, 1, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0]
耗时： 0.01399087905883789
正确率： 0.9333333333333333
