## 03 测试我们的算法
![IMAGE](https://ws1.sinaimg.cn/large/006tKfTcly1fs39dv04zpj30oq0dajw6.jpg)


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets 

In [13]:
iris = datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [14]:
X = iris.data

In [15]:
y = iris.target

In [16]:
X.shape

(150, 4)

In [17]:
y.shape

(150,)

### train_test_split

分离出一部分数据做训练，另外一部分数据做测试。

In [18]:
y # y is ordered by class label, so it has to be shuffled before splitting

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

我们不能只打乱 X，要 X 和 y 一起配对打乱。
**小妙招**：我们打乱索引，原来的数据顺序不发生变化，我们用打乱的索引来提取数据。

In [19]:
# Shuffle the row indices instead of the data: a random permutation of 0 .. len(X)-1.
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes

array([ 96,   9,  22,  20,  65,  63,  94,  42, 135,  82,  15, 108, 138,
        58,  29,  35, 137, 116, 101,  49,   4, 111,  85,  86,  36, 119,
       118,  45, 114,  30,  56,  17,  90, 102, 110,  12,  91,  46,  47,
       149,  14,  50, 113,  81,   7,  83,  60,  51,  75,  74, 107,  95,
        72,  79, 139,  32, 130,  73,  39,  59, 112, 142,  40, 143,  57,
       144, 141,  18,  16,  33, 123,  34,  44,  98, 127,  87,   2, 125,
        84, 126,  38,  43, 109,  26, 104,  31,  55, 106, 134,   3, 136,
        52, 129,  27, 117, 145, 146,  88,  24, 140, 128,   1, 120,  28,
        37,  92,  66, 131, 133,  78,  13,   6, 121,  97,  19, 122,   8,
       103,  48,  64,   0,  25,  89,   5,  93,  41,  11, 148, 132, 124,
       147,  99,  62,  53,  61,  80,  67,  54,  23,  76,  10, 100,  68,
       105,  77,  69,  71,  70, 115,  21])

In [20]:
# Fraction of the samples held out for testing.
test_ratio = 0.2
# Number of test samples; int() truncates, so a fractional count is rounded down.
test_size = int(len(X) * test_ratio)

In [21]:
# The first test_size shuffled indices form the test set; the remainder is the training set.
test_indexes = shuffled_indexes[:test_size] 
train_indexes = shuffled_indexes[test_size:]

In [22]:
# The same index arrays are applied to X and y, so each sample stays paired with its label.
X_train = X[train_indexes] # fancy indexing: an integer index array selects rows in that order
y_train = y[train_indexes]

X_test = X[test_indexes]
y_test = y[test_indexes]

In [23]:
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [24]:
print(X_test.shape)
print(y_test.shape)

(30, 4)
(30,)


#### 封装
把刚才写的过程封装成一个函数 `train_test_split`：
```python
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    The row indices are shuffled once and the same shuffled indices are used
    for both X and y, so every sample stays paired with its label.

    Parameters
    ----------
    X : numpy.ndarray
        Sample matrix; the first axis indexes samples.
    y : numpy.ndarray
        Labels; must have as many entries along the first axis as X.
    test_ratio : float, default 0.2
        Fraction of the samples placed in the test set, in [0.0, 1.0].
        The test set size is ``int(len(X) * test_ratio)`` (rounded down).
    seed : int or None, default None
        When not None, NumPy's global random generator is seeded with this
        value before shuffling, which makes the split reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    AssertionError
        If X and y differ in their number of rows, or test_ratio is outside
        [0.0, 1.0].
    """
    # X and y must describe the same number of samples.
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"

    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None explicitly: 0 is a legitimate seed, and a plain
    # truthiness test would silently skip seeding for it.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle indices rather than the data so X and y are reordered identically.
    shuffled_indexes = np.random.permutation(len(X))

    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test


```

In [25]:
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [27]:
print(X_test.shape)
print(y_test.shape)

(30, 4)
(30,)


#### 测试我们的算法

In [28]:
from playML.kNN import KNNClassifier

# Fit the project's kNN classifier (constructed with k=3) on the training split only,
# then predict labels for the held-out test split.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)

In [29]:
y_predict

array([2, 1, 1, 0, 1, 1, 1, 2, 2, 0, 0, 0, 2, 1, 1, 1, 2, 1, 0, 2, 2, 2, 1,
       2, 0, 2, 0, 0, 0, 2])

In [30]:
y_test

array([2, 1, 1, 0, 2, 1, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2, 2, 1, 0, 2, 2, 2, 1,
       1, 0, 2, 0, 0, 0, 2])

In [31]:
# Count the test samples whose predicted label equals the true label.
sum(y_predict == y_test)

26

In [32]:
sum(y_predict == y_test) / len(y_test) ## fraction of test samples predicted correctly (accuracy)

0.8666666666666667

#### sklearn中的train_test_split

In [33]:
from sklearn.model_selection import train_test_split

train_test_split

<function sklearn.model_selection._split.train_test_split>

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

`random_state` 相当于我们自己实现的 `train_test_split` 中的随机种子 `seed`。

In [35]:
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)


In [36]:
print(X_test.shape)
print(y_test.shape)

(30, 4)
(30,)
