In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
np.set_printoptions(precision=4, threshold=15,suppress=True)
pd.options.display.max_rows = 20
%matplotlib inline

**Finding the Nearest Neighbors**

For the simple task of finding the nearest neighbors between two sets of data, the unsupervised algorithms within
sklearn.neighbors can be used:

In [2]:
from sklearn.neighbors import NearestNeighbors
X = np.array([[-1., -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)  # 训练集
nbrs

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                 radius=1.0)

In [3]:
NearestNeighbors?

nbrs.kneighbors?
```
Signature: nbrs.kneighbors(X=None, n_neighbors=None, return_distance=True)
Docstring:
Finds the K-neighbors of a point.
Returns indices of and distances to the neighbors of each point.

Parameters
----------
X : array-like, shape (n_query, n_features),                 or (n_query, n_indexed) if metric == 'precomputed'
    The query point or points.
    If not provided, neighbors of each indexed point are returned.
    In this case, the query point is not considered its own neighbor.

n_neighbors : int
    Number of neighbors to get (default is the value
    passed to the constructor).

return_distance : boolean, optional. Defaults to True.
    If False, distances will not be returned

Returns
-------
dist : array
    Array representing the lengths to points, only present if
    return_distance=True

ind : array
    Indices of the nearest points in the population matrix.

Examples
--------
In the following example, we construct a NeighborsClassifier
class from an array representing our data set and ask who's
the closest point to [1,1,1]

>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
>>> from sklearn.neighbors import NearestNeighbors
>>> neigh = NearestNeighbors(n_neighbors=1)
>>> neigh.fit(samples) # doctest: +ELLIPSIS
NearestNeighbors(algorithm='auto', leaf_size=30, ...)
>>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
(array([[0.5]]), array([[2]]))

As you can see, it returns [[0.5]], and [[2]], which means that the
element is at distance 0.5 and is the third element of samples
(indexes start at 0). You can also query for multiple points:

>>> X = [[0., 1., 0.], [1., 0., 1.]]
>>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
array([[1],
       [2]]...)
File:      d:\python3.7\lib\site-packages\sklearn\neighbors\base.py
Type:      method
```

In [4]:
distances, indices = nbrs.kneighbors(X)  # 查询集

In [5]:
distances  # (dis1, dis2)最近的2个点的距离

array([[0.    , 1.    ],
       [0.    , 1.    ],
       [0.    , 1.4142],
       [0.    , 1.    ],
       [0.    , 1.    ],
       [0.    , 1.4142]])

In [6]:
indices  # 最近的2个点的index

array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])

Because the query set matches the training set, the nearest neighbor of each point is the point itself, at a distance ofzero

In [7]:
nbrs.kneighbors_graph(X).toarray()
# produce a sparse graph(稀疏图) showing the connections between neighboring points:

array([[1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 1., 1.]])

**KDTree and BallTree Classes**


In [8]:
from sklearn.neighbors import KDTree
kdt = KDTree(X, leaf_size=30, metric='euclidean')  # 欧式距离  叶节点
kdt

<sklearn.neighbors.kd_tree.KDTree at 0x563ca0544b40>

In [9]:
kdt.query(X, k=2, return_distance=False)

array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])

# 最近邻分类
scikit-learn 实现了两种不同的最近邻分类器： 
- KNeighborsClassifier 基于每个查询点的 k 个最近邻实现，其中 k 是用户指定的整数值。
- RadiusNeighborsClassifier 基于每个查询点的固定半径 r 内的邻居数量实现， 其中 r 是用户指定的浮点数值。

## sklearn.neighbors.KNeighborsClassifier

- n_neighbors: 临近点个数
- p: 距离度量 Power parameter for the Minkowski metric 默认2
- algorithm: 近邻算法，可选{'auto', 'ball_tree', 'kd_tree', 'brute'}
 * 'auto': 根据传递给`fit`方法的数据自行推断使用的算法
 * 'brute': 暴力求解 复杂度$O(DN^2)$
 * 'kd_tree': 使用KDTree, 对于小于20的D,$O(DlogN)$;对于较大的D接近$O(DN)$
 * 'ball_tree': 使用BallTree, 复杂度$O(DlogN)$
- weights: 确定近邻的权重
 * 'uniform': 均匀分布. 所有邻近点的权重是一致的
 * 'dustance': 与距离成反比. 距离近的点的权重更大
 * [callbale]: a user-defined function which accepts an array of distances, and returns an
    array of the same shape containing the weights.
- leaf_size: Leaf size passed to BallTree or KDTree. 对于小数据集 (n小于30), log(N)相当于N, 暴力算法比基于树的算法更加有效, 控制了查询切换到暴力计算样本数量. 默认30


使用鸢尾花数据进行分析, 对四个特征都进行了分析

In [10]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import train_test_split
# from figures import plot_2d_separator

iris = load_iris()
X = iris.data # [:100, :2]
Y = iris.target # [:100]
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       ...,
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [11]:
def normalize_data_1(data):
    # 使用归一化预处理数据
    return normalize(data, 'l2')

def normalize_data_2(data):
    # 使用归一化预处理数据
    X_std1 = normalize(data[:, :2], 'l2')  # 缩放至L2范数为1
    X_std2 = normalize(data[:, 2:], 'l2')
    return np.concatenate((X_std1, X_std2), axis=1)

In [12]:
def standard_data(data):
    # 将输入向量 X 上的每个特征缩放到 [0,1] 或 [- 1，+1]， 或将其标准化，使其均值为 0，方差为 1
    standardizer = StandardScaler().fit(data)  
    return standardizer

In [13]:
x_indices = np.arange(X.shape[0])
x_indices

array([  0,   1,   2, ..., 147, 148, 149])

In [14]:
X_train_indices, X_test_indices, Y_train, Y_test = train_test_split(x_indices, Y, test_size=0.2)
# 将样本分为训练集和测试集

In [15]:
X_test_indices

array([ 18,  34,  49, ...,  94, 144,  70])

In [28]:
# X_std = np.concatenate((X_std1, X_std2), axis=1)
standardizer = StandardScaler().fit(X[X_train_indices])
X_std = standardizer.transform(X[X_train_indices])
X_std

array([[ 1.0238, -0.1677,  0.6835,  0.6514],
       [-1.1009, -1.7864, -0.2868, -0.292 ],
       [ 1.0238,  0.5261,  1.0831,  1.1905],
       ...,
       [-1.1009,  1.2199, -1.3713, -1.3702],
       [-0.101 , -0.8614,  0.1698, -0.292 ],
       [ 1.0238,  0.0636,  1.026 ,  1.5948]])

In [17]:
X_normalize_1 = normalize_data_1(X)
X_normalize_1

array([[0.8038, 0.5516, 0.2206, 0.0315],
       [0.8281, 0.507 , 0.2366, 0.0338],
       [0.8053, 0.5483, 0.2228, 0.0343],
       ...,
       [0.7165, 0.3307, 0.5732, 0.2205],
       [0.6747, 0.37  , 0.5876, 0.2503],
       [0.6903, 0.351 , 0.5967, 0.2106]])

In [18]:
X_normalize_2 = normalize_data_2(X)
X_normalize_2

array([[0.8245, 0.5658, 0.9899, 0.1414],
       [0.8529, 0.5222, 0.9899, 0.1414],
       [0.8266, 0.5628, 0.9884, 0.1521],
       ...,
       [0.908 , 0.4191, 0.9333, 0.359 ],
       [0.8768, 0.4808, 0.92  , 0.3919],
       [0.8914, 0.4532, 0.943 , 0.3328]])

In [31]:
# 使用原始数据进行测试
knn1 = KNeighborsClassifier(weights='distance').fit(X[X_train_indices], Y_train)
knn1

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [32]:
knn1.score(X[X_test_indices], Y_test)  # 使用测试集测试正确率

0.9333333333333333

In [34]:
# standard
knn2 = KNeighborsClassifier(weights='distance').fit(X_std, Y_train)
X_standard_test = standardizer.transform(X[X_test_indices])
knn2.score(X_standard_test, Y_test)  # 使用测试集测试正确率

0.9333333333333333

In [35]:
# normalize 1
knn3 = KNeighborsClassifier(weights='distance').fit(X_normalize_1[X_train_indices], Y_train)
knn3.score(X[X_test_indices], Y_test)  # 使用测试集测试正确率

0.9666666666666667

In [36]:
# normalize 2
knn4 = KNeighborsClassifier(weights='distance').fit(X_normalize_2[X_train_indices], Y_train)
knn4.score(X[X_test_indices], Y_test)  # 使用测试集测试正确率

0.8333333333333334

In [24]:
test_node = [5.9, 3. , 5.1, 1.8]
# node = standardizer.transform([test_node])
knn1.predict([test_node])

array([2])

## 基于半径的 KNN 分类器
**sklearn.neighbors.RadiusNeighborsClassifier**
如果数据是不均匀采样的，那么 `RadiusNeighborsClassifier` 中的基于半径的近邻分类可能是更好的选择。用户指定一个固定半径 ，使得稀疏邻居中的点使用较少的最近邻来分类。对于高维参数空间，这个方法会由于所谓的 “维度灾难” 而变得不那么有效。

`RadiusNeighborsClassifier`与`KNeighborsClassifier`非常相似，但有两个参数除外。 首先，在`RadiusNeighborsClassifier`中，我们需要指定固定区域的半径`radius`(默认1.0)，用于确定观测是否是半径内的邻居。 将半径设置为某个值，最好将其视为任何其他超参数，并在模型选择期间对其进行调整。 第二个有用的参数是`outlier_label`(默认None)，它表示半径内没有观测的观测的标签 - 这本身通常可以是识别异常值的有用工具

In [25]:
# 创建标准化器
standardizer = StandardScaler()
# 标准化特征
X_std = standardizer.fit_transform(X)
X_std

array([[-0.9007,  1.019 , -1.3402, -1.3154],
       [-1.143 , -0.132 , -1.3402, -1.3154],
       [-1.3854,  0.3284, -1.3971, -1.3154],
       ...,
       [ 0.7957, -0.132 ,  0.8196,  1.0539],
       [ 0.4322,  0.7888,  0.9333,  1.4488],
       [ 0.0687, -0.132 ,  0.7628,  0.7907]])

In [26]:
# 训练半径邻居分类器
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=-1).fit(X_std, Y)
rnn

RadiusNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                          metric_params=None, n_jobs=-1, outlier_label=None,
                          p=2, radius=0.5, weights='uniform')

In [38]:
# 创建新观测点
new_observations = [[5, 3, 1, 0], [6, 3 , 4.9, 1.9]]
rnn.predict(standardizer.transform(new_observations))

array([0, 2])