# 算法性能判断

如果将**全部数据**都输入**模型**进行训练，训练好的模型就只能直接在真实**环境**中使用，这会带来两个问题：

- 模型很差怎么办？
- 真实环境难以拿到真实label？

因此，将数据集分为**训练集**和**测试集**：用训练集训练模型，再通过测试集直接判断模型好坏，从而改进模型。

即 **train test split**

# 测试算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load scikit-learn's bundled iris dataset object.
iris = datasets.load_iris()

In [3]:
# List the fields available on the loaded dataset object.
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [5]:
# Pull out the feature array and the label array from the dataset object.
x, y = iris.data, iris.target

In [6]:
# Dimensions of the feature array.
x.shape

(150, 4)

In [7]:
# Dimensions of the label array.
y.shape

(150,)

# Train Test Split

In [8]:
y                  # a test set must be chosen with care: the labels are ordered by class, so shuffle first

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Build a randomly ordered array of the row indices of x (one index per row).
shuffle_index = np.random.permutation(x.shape[0])
shuffle_index

array([132,  54, 103,  24,  21, 147,  72, 144,   3,  38, 100,  41, 138,
        88,  65, 107,  62,  64,  57,  51,  42,  77,  87,  79,  76,  83,
        40,  93,   0,  60, 148, 104,   7,  45,  12,  98,   4, 115, 140,
        16, 116,  14, 126, 133,  81,  47,  59,  49,  69, 102, 129,  39,
        29,  82,  70,  35,  37, 143,  74,  30,  17,  75, 142, 110,  68,
        27,  31,  34,  80,  85, 119,  84,  71, 127, 112,  20,  25,  97,
       118, 139,  46,  53, 109, 141,  18,  23,  19, 108, 149, 146, 124,
        50,  52,  78,  90,  15, 106, 101,  92,  91, 117, 113,  89,   2,
        94, 120, 105,  58,  28, 128,  10,  95,  11,  63,  86,   1, 145,
        56, 111,  61,   5,   8, 135,   6,  26, 131, 122,  22,  36,  33,
        99, 130,   9,  44,  43,  66,  48, 125,  96,  55, 121,  67,  13,
       134, 137, 123,  73, 114,  32, 136])

In [10]:
# Fraction of the rows to hold out for testing.
test_ratio = 0.2
# Number of test rows, truncated to a whole number.
test_size = int(test_ratio * len(x))
test_size

30

In [11]:
# Cut the shuffled indices at test_size: the first part indexes the test set,
# the remainder indexes the training set.
test_index, train_index = np.split(shuffle_index, [test_size])

In [12]:
# Training partition: features and labels selected with the same indices.
x_train, y_train = x[train_index], y[train_index]

# Test partition: features and labels selected with the same indices.
x_test, y_test = x[test_index], y[test_index]

# Sklearn 中的算法

In [25]:
from sklearn.model_selection import train_test_split
# Same split done by scikit-learn's helper: test_size is the held-out
# fraction, and random_state fixes the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=666,
)

In [26]:
# Show the training feature and label dimensions, one per line.
print(x_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [27]:
# Show the test feature and label dimensions, one per line.
print(x_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Create a k-nearest-neighbours classifier configured with n_neighbors = 7.
kNN_Classifier = KNeighborsClassifier(n_neighbors = 7)

In [30]:
# Fit the classifier on the training partition only.
kNN_Classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [31]:
# Predict labels for the held-out test features.
y_predict = kNN_Classifier.predict(x_test)

In [32]:
# Display the true test labels, for comparison with y_predict.
y_test

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0,
       2, 0, 1, 1, 0, 1, 2, 2])

In [33]:
# Accuracy: the fraction of test samples whose predicted label equals the
# true label. The element-wise comparison yields a boolean array; taking its
# mean inside NumPy avoids iterating the array with the builtin sum().
np.mean(y_test == y_predict)

1.0