## import 导入模块

In [1]:
%matplotlib notebook
import numpy as np
import operator
import matplotlib.pyplot as plt


## 生成数据函数

In [3]:
def createDataSet():
    """Build the four-point toy data set used to demonstrate kNN.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        points and ``labels`` is a list holding one class label per row.
    """
    # Keep every point next to its label so the pairing is obvious.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    points = np.array([coords for coords, _ in samples])
    tags = [tag for _, tag in samples]
    return points, tags

In [4]:
# Build the toy data set: `group` holds the points, `labels` their classes.
group, labels = createDataSet()

In [5]:
group

array([[1. , 1.1],
       [1. , 1. ],
       [0. , 0. ],
       [0. , 0.1]])

In [6]:
labels

['A', 'A', 'B', 'B']

In [19]:
# Scatter the toy points: column 0 on the x-axis, column 1 on the y-axis.
plt.scatter(group[:,0],group[:,1])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x12585588>

In [20]:
# Annotate every plotted point with its class label. Iterating over the data
# itself (instead of a hard-coded count of 4) keeps this cell correct if the
# toy data set ever changes size.
for (x, y), label in zip(group, labels):
    plt.text(x, y, label, horizontalalignment='right')

## k近邻算法伪代码
1. 计算已知类别数据集中的点与当前点之间的距离

2. 按照距离递增次序排序

3. 选取与当前点距离最小的k个点

4. 确定前k个点所在的类别的出现频率

5. 返回前k个点出现频率最高的类别作为当前点的预测分类

In [69]:
def classify0(inx, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inx`` and every row of
    ``dataSet``.

    Args:
        inx: Feature vector to classify (sequence or 1-D array) with one
            value per column of ``dataSet``.
        dataSet: Training samples, one row per sample (2-D array or nested
            sequence).
        labels: Class label of each row of ``dataSet``; must support integer
            indexing and ``len``.
        k: Number of nearest neighbours that vote (1 <= k <= number of rows).

    Returns:
        The label with the most votes among the k nearest rows. When several
        labels tie, the one whose first vote came from the closest neighbour
        wins.

    Raises:
        ValueError: If ``labels`` does not have one entry per row, or if
            ``k`` is outside ``1..number of rows``.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    if len(labels) != dataSetSize:
        raise ValueError(
            "labels has %d entries but dataSet has %d rows"
            % (len(labels), dataSetSize))
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Broadcasting subtracts inx from every row without materialising a
    # tiled copy of it.
    diffMat = dataSet - np.asarray(inx, dtype=float)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))

    # A stable sort keeps equidistant rows in data-set order, so the result
    # is deterministic.
    sortedDistIndicies = distances.argsort(kind='stable')

    # Tally the labels of the k closest rows, nearest first.
    classCount = {}
    for i in sortedDistIndicies[:k]:
        voteLabel = labels[i]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first label reaching the top count; dict insertion
    # order is nearest-first, which gives the documented tie-break.
    return max(classCount, key=classCount.get)

In [27]:
# Scratch checks for the NumPy calls used in classify0.
# np.tile repeats the 2-element vector into 4 rows -> shape (4, 2).
print(np.tile([1,2],(4,1)).shape)
# sum(axis=1) adds across each row, giving one value per sample.
print(group.sum(axis=1))
# NOTE: these are row sums of the coordinates, not distances; they only
# serve to demonstrate argsort below.
dist = group.sum(axis=1)
# argsort returns the indices that would sort the values in ascending order.
dist.argsort()

(4, 2)
[2.1 2.  0.  0.1]


array([2, 3, 1, 0], dtype=int64)

In [70]:
classify0([0,0], group, labels,3)

'B'

In [32]:
# Demonstrate the vote-ranking step: sort the (label, count) pairs by count
# (item 1 of each pair), largest first.
classCount={'A':2,'B':4}
sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)

[('B', 4), ('A', 2)]

In [34]:
classCount.items()

dict_items([('A', 2), ('B', 4)])

## sorted函数
```
>>> sorted({1: 'D', 2: 'B', 3: 'B', 4: 'E', 5: 'A'})
[1, 2, 3, 4, 5]
```
```
>>> sorted("This is a test string from Andrew".split(), key=str.lower)
['a', 'Andrew', 'from', 'is', 'string', 'test', 'This']
```
```
>>> student_tuples = [
        ('john', 'A', 15),
        ('jane', 'B', 12),
        ('dave', 'B', 10),
]
>>> sorted(student_tuples, key=lambda student: student[2])   # sort by age
[('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)]
```
```
>>> class Student:
        def __init__(self, name, grade, age):
                self.name = name
                self.grade = grade
                self.age = age
        def __repr__(self):
                return repr((self.name, self.grade, self.age))
        def weighted_grade(self):
                return 'CBA'.index(self.grade) / float(self.age)

>>> student_objects = [
        Student('john', 'A', 15),
        Student('jane', 'B', 12),
        Student('dave', 'B', 10),
]
>>> sorted(student_objects, key=lambda student: student.age)   # sort by age
[('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)]
```
```
>>> from operator import itemgetter, attrgetter, methodcaller

>>> sorted(student_tuples, key=itemgetter(2))
[('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)]

>>> sorted(student_objects, key=attrgetter('age'))
[('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)]
```
```
>>> sorted(student_tuples, key=itemgetter(1,2))
[('john', 'A', 15), ('dave', 'B', 10), ('jane', 'B', 12)]

>>> sorted(student_objects, key=attrgetter('grade', 'age'))
[('john', 'A', 15), ('dave', 'B', 10), ('jane', 'B', 12)]
```

## 从文本解析数据

In [36]:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line is split on tabs: the first three fields become one
    row of the feature matrix and the last field is read as an integer class
    label. Blank lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float NumPy array of shape ``(number_of_records, 3)`` and
        ``classLabelVector`` is a list with one ``int`` label per record.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature or label field cannot be converted to a
            number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        records = [line.strip().split('\t') for line in fr if line.strip()]

    # Pre-allocate one row per record.
    returnMat = np.zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

In [39]:
# Load the dating data set into a feature matrix and a list of class labels.
datingDataMat, datingLabels = file2matrix('../dataSet/Ch02/datingTestSet2.txt')

In [40]:
datingDataMat

array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
       [1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
       [2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
       ...,
       [2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
       [4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
       [4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])

In [41]:
datingLabels[0:20]

[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

## 分析数据：使用Matplotlib创建散点图

In [43]:
# Scatter columns 1 and 2 of the feature matrix against each other.
fig = plt.figure(1)
ax = fig.add_subplot(111)
# The 3rd and 4th positional arguments of scatter are marker size and colour;
# both are scaled from the class label so each class is drawn differently.
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*np.array(datingLabels),
           15.0*np.array(datingLabels))
plt.show()

<IPython.core.display.Javascript object>

## 归一化

In [61]:
def autoNorm(dataSet):
    """Min-max normalise every feature column.

    Each column is rescaled as ``(value - column_min) / (column_max -
    column_min)``. A constant column has a range of zero; it is mapped to
    0.0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D array (or nested sequence) with one row per sample and
            one column per feature.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` has
        the same shape as ``dataSet``, ``ranges`` is the per-column
        ``max - min`` and ``minVals`` is the per-column minimum. ``ranges``
        is returned unmodified, so it contains 0 for constant columns.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0.0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

In [54]:
# Column-wise minimum (axis=0): one value per feature column.
minVals = datingDataMat.min(axis=0)
minVals.shape

(3,)

In [55]:
# Column-wise maximum (axis=0): one value per feature column.
maxVals = datingDataMat.max(axis=0)

In [57]:
# Per-feature value range, the divisor used by min-max normalisation.
ranges = maxVals-minVals
print(ranges)
# zeros_like allocates an all-zero array shaped like the feature matrix.
normDataSet = np.zeros_like(datingDataMat)
print(normDataSet)

[9.1273000e+04 2.0919349e+01 1.6943610e+00]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 ...
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [62]:
# Normalise the dating features; keep the ranges and minimums as well.
normMat, ranges, minVals = autoNorm(datingDataMat)

In [63]:
normMat

array([[0.44832535, 0.39805139, 0.56233353],
       [0.15873259, 0.34195467, 0.98724416],
       [0.28542943, 0.06892523, 0.47449629],
       ...,
       [0.29115949, 0.50910294, 0.51079493],
       [0.52711097, 0.43665451, 0.4290048 ],
       [0.47940793, 0.3768091 , 0.78571804]])

In [64]:
ranges

array([9.1273000e+04, 2.0919349e+01, 1.6943610e+00])

In [65]:
minVals

array([0.      , 0.      , 0.001156])

## 测试算法，验证分类器

In [72]:
def datingClassTest(hoRatio=0.10,
                    filename='../dataSet/Ch02/datingTestSet2.txt',
                    k=3):
    """Estimate the kNN error rate on a hold-out slice of the dating data.

    The first ``hoRatio`` fraction of the rows is held out as the test set
    and the remaining rows are used as training samples. Each prediction
    and the final error rate are printed.

    NOTE: normalisation is computed over the whole file, so the held-out
    rows influence the minimum and range used for scaling.

    Args:
        hoRatio: Fraction of rows (taken from the top of the file) to hold
            out for testing.
        filename: Path of the tab-separated data file to load.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``hoRatio`` leaves either the test set or the
            training set empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Fail before the loop instead of dividing by zero afterwards.
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r gives %d test rows out of %d; both the test and "
            "the training set must be non-empty" % (hoRatio, numTestVecs, m))

    # The training slice is the same for every test row, so take it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifer came back with: %d, the real answer is %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is : %f" % (errorCount / float(numTestVecs)))
        

In [73]:
datingClassTest()

the classifer came back with: 3, the real answer is 3
the classifer came back with: 2, the real answer is 2
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 3, the real answer is 3
the classifer came back with: 3, the real answer is 3
the classifer came back with: 1, the real answer is 1
the classifer came back with: 3, the real answer is 3
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 2, the real answer is 2
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with: 1, the real answer is 1
the classifer came back with

## 预测函数

In [87]:
def classifyPerson():
    """Prompt for three feature values and print the predicted category.

    Reads the values interactively with ``input``, loads the dating data
    set, normalises it with ``autoNorm``, classifies the entered sample
    with ``classify0`` (k=3) and prints the matching description.
    """
    verdicts = ['not at all', 'in small doses', 'in large doses']

    # The prompts are asked in this order; the feature vector is built in a
    # different order further down.
    gameShare = float(input("time playing games?"))
    yearlyMiles = float(input("miles earned per year?"))
    iceCream = float(input("ice cream consumed per year"))

    features, classes = file2matrix('../dataSet/Ch02/datingTestSet2.txt')
    scaled, spans, lows = autoNorm(features)

    # Scale the new sample with the same minimums and ranges as the data.
    sample = np.array([yearlyMiles, gameShare, iceCream])
    scaledSample = (sample - lows) / spans
    predicted = classify0(scaledSample, scaled, classes, 3)

    # The predicted label minus one indexes the description list.
    print("you will probably like this person: ", verdicts[predicted - 1])

In [89]:
classifyPerson()

time playing games?10
miles earned per year?10000
ice cream consumed per year0.5
you will probably like this person:  in small doses
