# 数据归一化
- 某些情况下部分feature之间的距离明显大于其他feature的距离，导致部分feature主导了距离的计算
- 因此要进行每个feature的数据归一化

## 将所有的数据映射到同一尺度
- 最值归一化 normalization ：把所有数据映射到 0-1 之间， 适用于分布有明显边界的情况（如分数，rgb值），受outlier影响较大
- 均值方差归一化 standardization ：把所有数据归一到均值为0，方差为1的分布中，适用于没有明显边界的分布，推荐使用。公式：（x - 均值）/ 标准差

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Draw 100 random integers from the half-open range [0, 100) as sample data.
x = np.random.randint(low=0, high=100, size=100)
x

array([90, 54, 26, 28, 72, 80, 70, 30, 41, 52, 50,  8,  0, 26, 89, 37, 52,
       22, 29, 24, 19, 13, 75,  6, 88, 47, 24, 41, 10, 43, 74, 85, 94, 12,
        4, 82, 51, 60, 49,  0, 64, 63,  9, 86, 21, 40, 25, 36,  7, 83, 72,
       88, 78, 60, 86, 53, 53, 84, 27, 65, 98, 99, 43, 75, 65, 99, 45, 85,
       77, 58, 18, 71,  1,  4,  7, 92, 35, 21, 88, 56,  9,  6,  1, 55, 32,
       33, 48, 88, 63, 13,  9, 10, 18, 34, 38, 77, 99, 26, 11, 16])

In [3]:
# Min-max normalization: shift by the minimum and divide by the value range,
# so the smallest element maps to 0 and the largest to 1.
x_min = x.min()
(x - x_min) / (x.max() - x_min)

array([0.90909091, 0.54545455, 0.26262626, 0.28282828, 0.72727273,
       0.80808081, 0.70707071, 0.3030303 , 0.41414141, 0.52525253,
       0.50505051, 0.08080808, 0.        , 0.26262626, 0.8989899 ,
       0.37373737, 0.52525253, 0.22222222, 0.29292929, 0.24242424,
       0.19191919, 0.13131313, 0.75757576, 0.06060606, 0.88888889,
       0.47474747, 0.24242424, 0.41414141, 0.1010101 , 0.43434343,
       0.74747475, 0.85858586, 0.94949495, 0.12121212, 0.04040404,
       0.82828283, 0.51515152, 0.60606061, 0.49494949, 0.        ,
       0.64646465, 0.63636364, 0.09090909, 0.86868687, 0.21212121,
       0.4040404 , 0.25252525, 0.36363636, 0.07070707, 0.83838384,
       0.72727273, 0.88888889, 0.78787879, 0.60606061, 0.86868687,
       0.53535354, 0.53535354, 0.84848485, 0.27272727, 0.65656566,
       0.98989899, 1.        , 0.43434343, 0.75757576, 0.65656566,
       1.        , 0.45454545, 0.85858586, 0.77777778, 0.58585859,
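       0.18181818, 0.71717172, 0.01010101, 0.04040404, 0.07070707,
       0.92929293, 0.35353535, 0.21212121, 0.88888889, 0.56565657,
       0.09090909, 0.06060606, 0.01010101, 0.55555556, 0.32323232,
       0.33333333, 0.48484848, 0.88888889, 0.63636364, 0.13131313,
       0.09090909, 0.1010101 , 0.18181818, 0.34343434, 0.38383838,
       0.77777778, 1.        , 0.26262626, 0.11111111, 0.16161616])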

In [4]:
# Sample matrix: 50 rows x 2 columns of random integers in [0, 100).
x2 = np.random.randint(low=0, high=100, size=(50, 2))

In [5]:
# Make a float copy so the fractional standardized values can be written back
# into the array without being truncated to integers.
x2 = x2.astype(float)

In [6]:
# Standardization (zero mean, unit standard deviation), applied to each of the
# two columns independently and written back in place.
for col in (0, 1):
    column = x2[:, col]
    x2[:, col] = (column - column.mean()) / column.std()

In [7]:
# Show the first ten standardized values of column 0.
x2[:10,0]

array([-0.66287691,  1.56106077, -1.48788604, -1.09331645, -0.73461683,
       -1.09331645, -0.55526702, -0.94983661,  0.377352  , -1.2367963 ])

In [8]:
# Show the first ten standardized values of column 1.
x2[:10, 1]

array([-1.61792482, -1.28788576, -0.51779462,  0.03227049,  1.46243975,
        1.49911076,  1.2057427 ,  0.39898055,  1.42576874,  0.36230955])

In [9]:
# Sanity check: the mean of standardized column 0 should be 0 up to floating-point rounding.
np.mean(x2[:,0])

9.658940314238862e-17

In [12]:
# Sanity check: the standard deviation of standardized column 1 should be 1 up to floating-point rounding.
np.std(x2[:,1])

0.9999999999999999

In [11]:
# Sanity check: the standard deviation of standardized column 0 should be 1 up to floating-point rounding.
np.std(x2[:,0])

0.9999999999999998

# 对测试数据集如何归一化
- 训练数据集有均值和标准差（mean_train 和 std_train）
- 测试数据集由于各种原因（如真实环境中每次仅新增一个样本，无法计算均值和标准差），仍使用训练集的均值和标准差进行归一化。
- 即， （x_test - mean_train）/ std_train
- 在 scikit-learn 中封装了 Scaler 类（如 StandardScaler），其用法与算法模型一致：先对训练集 fit，再对数据 transform

# StandardScaler

In [15]:
import numpy as np
from sklearn import datasets

In [16]:
# Load the iris dataset. The variable was previously called `digits`, which
# was misleading because load_iris() returns the iris data, not the digits data.
iris = datasets.load_iris()
x = iris.data      # feature matrix
y = iris.target    # class label of each sample

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=666,
)

In [18]:
from sklearn.preprocessing import StandardScaler            # mean/std standardization scaler class; MinMaxScaler is the min-max normalization counterpart

In [19]:
standardscaler = StandardScaler()          # create a scaler instance

In [20]:
standardscaler.fit(x_train)         # learn the per-feature mean and scale from x_train; fit only stores these statistics, it does not transform the data

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
standardscaler.mean_               # per-feature mean of x_train learned by fit (not the mean after scaling)

array([5.83416667, 3.0825    , 3.70916667, 1.16916667])

In [22]:
standardscaler.scale_            # per-feature scale of x_train learned by fit: the standard deviation, not the variance

array([0.81019502, 0.44076874, 1.76295187, 0.75429833])

In [23]:
# Preview the standardized training data. transform returns a new array
# (the scaler was created with copy=True), so x_train is unchanged here.
standardscaler.transform(x_train)

array([[-0.90616043,  0.94720873, -1.30982967, -1.28485856],
       [-1.15301457, -0.18717298, -1.30982967, -1.28485856],
       [-0.16559799, -0.64092567,  0.22169257,  0.17345038],
       [ 0.45153738,  0.72033239,  0.95909217,  1.49918578],
       [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],
       [ 1.43895396,  0.2665797 ,  0.56203085,  0.30602392],
       [ 0.3281103 , -1.09467835,  1.07253826,  0.30602392],
       [ 2.1795164 , -0.18717298,  1.63976872,  1.2340387 ],
       [-0.78273335,  2.30846679, -1.25310662, -1.4174321 ],
       [ 0.45153738, -2.00218372,  0.44858475,  0.43859746],
       [ 1.80923518, -0.41404933,  1.46959958,  0.83631808],
       [ 0.69839152,  0.2665797 ,  0.90236912,  1.49918578],
       [ 0.20468323,  0.72033239,  0.44858475,  0.571171  ],
       [-0.78273335, -0.86780201,  0.10824648,  0.30602392],
       [-0.53587921,  1.40096142, -1.25310662, -1.28485856],
       [-0.65930628,  1.40096142, -1.25310662, -1.28485856],
       [-1.0295875 ,  0.

In [24]:
# Standardize both splits with the statistics learned from the training set;
# the test set deliberately reuses the training mean and scale.
x_train, x_test = (
    standardscaler.transform(x_train),
    standardscaler.transform(x_test),
)

In [25]:
from sklearn.neighbors import KNeighborsClassifier    # kNN classifier, used exactly as it would be on unscaled data

In [26]:
# Create a kNN classifier that uses the 4 nearest neighbours.
knn_clf = KNeighborsClassifier(n_neighbors = 4)

In [27]:
# Train on the standardized training set.
knn_clf.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')

In [28]:
# Accuracy on the test set, which was standardized with the training-set statistics.
knn_clf.score(x_test, y_test)

1.0