In [1]:
from numpy import *
import operator

## 算法实现
步骤：
1. 计算已知点与当前点的距离
2. 按照距离递增排序
3. 选取与当前点距离最小的k个点
4. 确定前k个点所在的类别的频率
5. 返回前k个点中出现频率最高的类别，作为当前点的预测类别

In [30]:
def classify0(inx,dataset,labels,k):
    """Classify one sample with the k-nearest-neighbours rule.

    Parameters
    ----------
    inx : array-like, shape (n_features,)
        Feature vector of the sample to classify.
    dataset : array-like, shape (n_samples, n_features)
        Training feature vectors, one per row. Nested lists are accepted.
    labels : sequence
        Class label of each row of ``dataset``; labels must be hashable.
    k : int
        Number of nearest neighbours that vote; ``1 <= k <= n_samples``.

    Returns
    -------
    The label that collects the most votes among the k nearest rows
    (Euclidean distance). On a tie, the label whose first vote came from
    the nearer neighbour wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    samples = asarray(dataset)  # lets callers pass plain nested lists too
    size = samples.shape[0]  # number of training rows
    if k < 1 or k > size:
        # Without this check the function fails later with an opaque IndexError.
        raise ValueError("k must be between 1 and %d, got %r" % (size, k))

    # Broadcasting subtracts the query from every row without materialising
    # a tiled (n_samples, n_features) copy of it.
    diff = asarray(inx) - samples
    distances = (diff ** 2).sum(axis=1) ** 0.5
    sortedIndex = distances.argsort()  # row indices, nearest first

    classcount = {}  # label -> number of votes among the k nearest rows
    for neighbour in sortedIndex[:k]:
        label = labels[neighbour]
        classcount[label] = classcount.get(label, 0) + 1

    # sorted() is stable, so labels with equal votes keep insertion order,
    # i.e. the order in which they were first met walking outwards.
    sortedclasscount = sorted(classcount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedclasscount[0][0]

## 测试

In [31]:
# Toy training set: four 2-D points with one label per row.
group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
labels = ['A','A','B','B']
# NOTE(review): inputdata is defined here but the call below classifies the
# literal [0,0] instead — confirm which query point was intended.
inputdata = [2.0,1.0]
# Classify the point [0,0] by a vote among its 3 nearest neighbours.
output = classify0([0,0],group,labels,3)
output

'B'

## 一个完整的实例
数据为《机器学习实战》里的海伦约会数据datingTestSet.txt，输入的特征为3维

### V1.0 
首先需要将txt文件读入并转换为numpy特征矩阵和对应的标签列表

In [40]:
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least four tab-separated fields: the
    first three are numeric features and the fourth is the class label.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    returnMat : ndarray, shape (n_rows, 3)
        Feature values as floats, one row per non-blank line.
    classLabelVector : list of str
        Label of each row, in file order.
    """
    # The context manager closes the file even if parsing fails below.
    with open(filename) as fr:
        # strip() drops the trailing newline; blank lines (for example a
        # final empty line) are skipped instead of breaking the row assignment.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows),3))  # n x 3 feature matrix
    classLabelVector = []  # labels, parallel to the rows of returnMat
    for index, fields in enumerate(rows):
        returnMat[index,:] = fields[0:3]  # numpy converts the strings to float
        classLabelVector.append(fields[3])
    return returnMat,classLabelVector

In [46]:
# Load the dating data set: a feature matrix plus a parallel list of labels.
datingDataMat,datingDataLabels = file2matrix('./data/datingTestSet.txt')
# Show the matrix shape as the cell output.
datingDataMat.shape

(1000, 3)

In [67]:
# Hold-out split: rows 0..995 form the training set, rows 996..999 the test set.
train_x = datingDataMat[0:996,:]
train_y = datingDataLabels[0:996]
test_x = datingDataMat[996:1000,:]
# Classify each of the 4 held-out rows with k=5 on the raw (un-normalised) features.
for i in range(4):
    truth = datingDataLabels[i+996]  # test_x[i] is original row i+996
    predict = classify0(test_x[i],train_x,train_y,5)
    print("*" * 40)
    print("The prediction is : ",predict)
    print("The truth is : ", truth)

****************************************
The prediction is :  didntLike
The truth is :  didntLike
****************************************
The prediction is :  largeDoses
The truth is :  largeDoses
****************************************
The prediction is :  didntLike
The truth is :  largeDoses
****************************************
The prediction is :  didntLike
The truth is :  largeDoses


经手工调试，发现在k值较大时得到的结果较为准确，但在k值较小时正确率较低

### V1.1
书上也给出了提示，由于特征值的取值范围差异较大，因此取值较大的特征在计算距离时会产生较大的影响。为了消除这种不平衡，需要进行归一化。

In [60]:
def autoNorm(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Parameters
    ----------
    dataset : array-like, shape (n_samples, n_features)
        Raw feature values, one sample per row. The input is not modified.

    Returns
    -------
    normDataset : ndarray, shape (n_samples, n_features)
        ``(dataset - minVals) / ranges`` computed column-wise. A column whose
        values are all equal (range 0) is mapped to 0 instead of NaN.
    ranges : ndarray, shape (n_features,)
        Per-column ``max - min`` (left at 0 for constant columns).
    minVals : ndarray, shape (n_features,)
        Per-column minimum.
    """
    data = asarray(dataset)
    minVals = data.min(0)
    maxVals = data.max(0)
    ranges = maxVals-minVals
    # Divide constant columns by 1 so they become 0 rather than 0/0 = NaN;
    # the returned ``ranges`` keeps the true value.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row. Out-of-place
    # true division also works for integer input, where in-place ``/=`` raises.
    normDataset = (data - minVals) / safeRanges
    return normDataset,ranges,minVals

In [68]:
# Min-max normalise every feature column, then repeat the same hold-out test.
# NOTE(review): min/range are computed over all rows, including the 4 held-out
# ones that are used for testing below — confirm this is acceptable.
norm,ranges,minVals = autoNorm(datingDataMat)
# Same split as before: rows 0..995 train, rows 996..999 test.
train_x_norm = norm[0:996,:]
train_y = datingDataLabels[0:996]
test_x_norm = norm[996:1000,:]
# Classify each held-out row with k=5 on the normalised features.
for i in range(4):
    truth = datingDataLabels[i+996]  # test_x_norm[i] is original row i+996
    predict = classify0(test_x_norm[i],train_x_norm,train_y,5)
    print("*" * 40)
    print("The prediction is : ",predict)
    print("The truth is : ", truth)

****************************************
The prediction is :  didntLike
The truth is :  didntLike
****************************************
The prediction is :  largeDoses
The truth is :  largeDoses
****************************************
The prediction is :  largeDoses
The truth is :  largeDoses
****************************************
The prediction is :  largeDoses
The truth is :  largeDoses
