In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Keep printed arrays compact: 4 decimals, summarize arrays with more than
# 15 elements, and suppress scientific notation for small values.
np.set_printoptions(
    precision=4,
    threshold=15,
    suppress=True,
)
# Cap the number of DataFrame rows shown in rich output.
pd.options.display.max_rows = 20
%matplotlib inline

**Finding the Nearest Neighbors**

For the simple task of finding the nearest neighbors between two sets of data, the unsupervised algorithms within
sklearn.neighbors can be used:

In [None]:
from sklearn.neighbors import NearestNeighbors

# Six 2-D sample points: three with negative coordinates and their mirror
# images with positive coordinates.
X = np.array([[-1., -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# Build a ball-tree index that answers 2-nearest-neighbor queries, then fit
# it on X (the "training" set for the neighbor search).
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=2)
nbrs.fit(X)
nbrs

In [None]:
# IPython help query: displays the NearestNeighbors docstring (interactive aid, not part of the analysis).
NearestNeighbors?

nbrs.kneighbors?
```
Signature: nbrs.kneighbors(X=None, n_neighbors=None, return_distance=True)
Docstring:
Finds the K-neighbors of a point.
Returns indices of and distances to the neighbors of each point.

Parameters
----------
X : array-like, shape (n_query, n_features),                 or (n_query, n_indexed) if metric == 'precomputed'
    The query point or points.
    If not provided, neighbors of each indexed point are returned.
    In this case, the query point is not considered its own neighbor.

n_neighbors : int
    Number of neighbors to get (default is the value
    passed to the constructor).

return_distance : boolean, optional. Defaults to True.
    If False, distances will not be returned

Returns
-------
dist : array
    Array representing the lengths to points, only present if
    return_distance=True

ind : array
    Indices of the nearest points in the population matrix.

Examples
--------
In the following example, we construct a NeighborsClassifier
class from an array representing our data set and ask who's
the closest point to [1,1,1]

>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
>>> from sklearn.neighbors import NearestNeighbors
>>> neigh = NearestNeighbors(n_neighbors=1)
>>> neigh.fit(samples) # doctest: +ELLIPSIS
NearestNeighbors(algorithm='auto', leaf_size=30, ...)
>>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
(array([[0.5]]), array([[2]]))

As you can see, it returns [[0.5]], and [[2]], which means that the
element is at distance 0.5 and is the third element of samples
(indexes start at 0). You can also query for multiple points:

>>> X = [[0., 1., 0.], [1., 0., 1.]]
>>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
array([[1],
       [2]]...)
File:      d:\python3.7\lib\site-packages\sklearn\neighbors\base.py
Type:      method
```

In [None]:
# Query the fitted index with the same points it was fitted on (query set == training set).
distances, indices = nbrs.kneighbors(X)

In [None]:
# Distances from each query point to its 2 nearest neighbors, one row per point.
distances

In [None]:
# Indices (into the fitted data) of each query point's 2 nearest neighbors.
indices

Because the query set matches the training set, the nearest neighbor of each point is the point itself, at a distance of zero.

In [None]:
# Produce a sparse graph showing the connections between neighboring points,
# converted to a dense array for display.
nbrs.kneighbors_graph(X).toarray()

**KDTree and BallTree Classes**


In [None]:
from sklearn.neighbors import KDTree

# Build a KD-tree over X using the Euclidean metric; leaf_size is the
# leaf-node size passed to the tree.
kdt = KDTree(
    X,
    leaf_size=30,
    metric='euclidean',
)
kdt

In [None]:
# Ask the KD-tree for the k=2 nearest neighbors of every point in X;
# return_distance=False requests the neighbor indices only.
kdt.query(X, k=2, return_distance=False)

## sklearn.neighbors.KNeighborsClassifier

- n_neighbors: 临近点个数
- p: 距离度量 Power parameter for the Minkowski metric 默认2
- algorithm: 近邻算法，可选{'auto', 'ball_tree', 'kd_tree', 'brute'}
 * 'auto': 根据传递给`fit`方法的数据自行推断使用的算法
 * 'brute': 暴力求解 复杂度$O(DN^2)$
 * 'kd_tree': 使用KDTree, 对于小于20的D,$O(D\log N)$;对于较大的D接近$O(DN)$
 * 'ball_tree': 使用BallTree, 复杂度$O(D\log N)$
- weights: 确定近邻的权重
 * 'uniform': 均匀分布. 所有邻近点的权重是一致的
 * 'distance': 与距离成反比. 距离近的点的权重更大
 * [callable]: a user-defined function which accepts an array of distances, and returns an
    array of the same shape containing the weights.
- leaf_size: Leaf size passed to BallTree or KDTree. 对于小数据集 (n小于30), log(N)相当于N, 暴力算法比基于树的算法更加有效, 控制了查询切换到暴力计算样本数量. 默认30


使用鸢尾花数据进行分析

In [None]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
# from figures import plot_2d_separator

# Load the Iris dataset bundled with scikit-learn and keep the complete
# feature matrix and label vector (no row or column subsetting).
iris = load_iris()
X, Y = iris.data, iris.target
X

In [None]:
# L2-normalize the first two feature columns only. Note that sklearn's
# `normalize` rescales each sample (row) to unit norm by default; it is not a
# per-feature standardizer and it does not touch the remaining two features.
X_std1 = normalize(X[:, :2], 'l2')
X_std1

In [None]:
# Row-wise L2 normalization (sklearn `normalize` default) of the feature
# columns from index 2 onward.
X_std2 = normalize(X[:, 2:], 'l2')
X_std2

In [None]:
# Standardize all feature columns with a StandardScaler; the fitted scaler is
# kept so that new samples can be transformed the same way later.
# (An alternative that was tried: concatenating X_std1 and X_std2 column-wise.)
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into preprocessing -- consider fitting on
# the training split only.
standardizer = StandardScaler()
standardizer.fit(X)
X_std = standardizer.transform(X)
X_std

In [None]:
# Hold out 20% of the standardized samples as a test set. A fixed random_state
# makes the shuffle deterministic, so the split (and every score/plot derived
# from it) is reproducible under Restart Kernel -> Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.2, random_state=42
)

In [None]:
# Training portion of the standardized feature matrix.
X_train

In [None]:
# Full label vector for all samples (not only the training split).
# NOTE(review): Y_train may have been intended here -- confirm.
Y

In [None]:
# k-NN classifier: 5 neighbors, votes weighted by inverse distance, neighbor
# search backed by a KD-tree. Fitted on the training split only.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='kd_tree',
)
knn.fit(X_train, Y_train)
knn

In [None]:
# Evaluate classification accuracy on the held-out test split.
knn.score(X_test, Y_test)

In [None]:
# A single sample given as four raw (unscaled) feature values.
test_node = [4.9, 3. , 1.4, 0.2]
# Scale it with the already-fitted standardizer so it matches the data the
# classifier was trained on; the extra list wraps it as a one-row 2-D input.
node = standardizer.transform([test_node])

In [None]:
# Predicted class label for the single standardized sample.
knn.predict(node)

In [None]:
# Scatter the first two (standardized) features of the training split, one
# series per class. Every class present in Y_train is drawn, so the third iris
# class is shown alongside classes 0 and 1.
class_markers = {0: 'o', 1: 's', 2: '^'}  # circle, square, triangle
for cls in np.unique(Y_train):
    in_class = Y_train == cls
    plt.scatter(
        X_train[in_class, 0],
        X_train[in_class, 1],
        s=40,
        label=str(int(cls)),
        marker=class_markers.get(int(cls), 'o'),  # fall back to circles for unexpected labels
    )
plt.xlabel("first feature")
plt.ylabel("second feature")
plt.legend();