# 4.3 测试我们的kNN算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load scikit-learn's built-in iris dataset.
iris = datasets.load_iris()

In [3]:
# Pull the feature matrix and the class labels out of the dataset object.
X, y = iris.data, iris.target

In [4]:
# Dimensions of the feature matrix.
X.shape

(150, 4)

In [5]:
# Dimensions of the label vector; its length must match the number of rows in X.
y.shape

(150,)

### 4.3.1 train_test_split
- 数据集是有规律的，需要先对数据进行乱序
- 但X和y是分离的，乱序时不能把对应关系打乱
    - 方法一：将X和y合并成一个矩阵，再进行乱序
    - 方法二：将索引进行乱序，选取一定比例作为测试集，利用 Fancy Indexing 的性质来生成新的矩阵

In [6]:
# Inspect the labels: they are stored sorted by class, which is why the data
# has to be shuffled before it is split into train and test sets.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Method 1 (kept for reference, not used): append y to X as an extra column,
# shuffle the rows of the merged matrix, then split features and labels apart.
# y must be reshaped into a column and the arrays joined along axis=1;
# hstack takes a single sequence of arrays, not separate positional arguments.
# X_merged = np.concatenate([X, y.reshape(-1, 1)], axis=1)
# X_merged = np.hstack([X, y.reshape(-1, 1)])

In [8]:
# Method 2: shuffle the row indices rather than the data itself.
# No seed is set in the cells shown, so the order changes from run to run.
shuffle_indexes = np.random.permutation(len(X))

In [9]:
# Show the shuffled row indices (a permutation of 0 .. len(X) - 1).
shuffle_indexes

array([ 10, 102,  41,  35,  20,  89, 115,  59,   5,  88,   8, 133,   4,
        99,  69, 106,  13, 130,   0, 113,  81,  60,  78, 138,  84, 135,
         9,  49, 124, 121, 125, 105,  80, 146,  83, 107, 114, 149, 122,
       127,  96,  97,  18,  75,  14,  28,  61,  36,  51,  48,   6,  27,
        74,  45, 134,  65,  38,   2, 120, 104,  42, 139, 148,  47, 112,
         3,  12, 143,  33,  31,  54,  39,  55,  23, 132,  64, 123, 103,
       126,  94,  37,  82,  72,  85, 136,  26,  43, 142, 141, 137, 144,
       129, 128,  58,  29, 145,  67,  22, 117,  63,  17, 111, 108,  11,
        93,  30,  79,  70,  44, 140,  56, 101,  53,  25, 109,  98, 118,
        34,  32,   7,  92,  66, 110,  95, 119,  24,  77,  86, 147,  40,
       131,  16,  52,  91,  87,   1,  71,  73,  90,  15,  21, 116,  57,
        50,  19,  46,  62,  76,  68, 100])

In [10]:
# Fraction of the samples reserved for testing.
test_ratio = 0.2
# Number of test samples; int() truncates the product to a whole number.
test_size = int(len(X) * test_ratio)

In [11]:
# Number of samples that will go into the test set.
test_size

30

In [12]:
# Cut the shuffled indices at test_size: the leading part (the first 20%)
# becomes the test set and the remainder (the other 80%) the training set.
test_indexes, train_indexes = np.split(shuffle_indexes, [test_size])

In [13]:
# Fancy indexing with the same index array on X and y keeps every sample
# paired with its own label.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]


In [14]:
# Sanity check: training features and labels must have the same number of rows.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [15]:
# Sanity check: test features and labels must have the same number of rows.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 4.3.2 使用我们的算法

In [16]:
from playML.model_selection import train_test_split

In [17]:
# Redo the split with the playML helper, relying on its default arguments.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs — confirm against the playML implementation.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
# Shapes of the training split produced by the playML helper.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [19]:
# Shapes of the test split produced by the playML helper.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [20]:
from playML.kNN import KNNClassifer

In [21]:
# Instantiate the playML kNN classifier with k=3.
my_knn_clf = KNNClassifer(k=3)

In [22]:
# Fit the classifier on the training split only; the test split stays unseen.
my_knn_clf.fit(X_train, y_train)

KNN(k=3)

In [23]:
# Predict a label for every sample in the held-out test split.
y_predict = my_knn_clf.predict(X_test)

In [24]:
# Predicted labels for the test samples.
y_predict

array([0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 1, 1, 1, 2, 2, 1,
       1, 1, 0, 1, 0, 0, 1, 2])

In [25]:
# True labels of the same test samples, for a side-by-side comparison with y_predict.
y_test

array([0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 2, 0, 0, 2, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 2])

In [26]:
# Count the test samples whose predicted label equals the true label.
np.sum(y_predict == y_test)

28

In [36]:
# Accuracy: the fraction of test samples that were predicted correctly.
np.mean(y_predict == y_test)

0.9333333333333333

### 4.3.3 sklearn中的train_test_split

In [65]:
# NOTE: this rebinds the name train_test_split, replacing the playML version
# imported earlier in the notebook; every later call uses scikit-learn's.
from sklearn.model_selection import train_test_split

In [82]:
# Hold out 20% of the samples for testing (test_size=0.2) and pass a fixed
# seed (random_state=667).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=667)

In [83]:
# Shapes of all four arrays returned by the scikit-learn split, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)


In [84]:
from sklearn.neighbors import KNeighborsClassifier

In [85]:
# scikit-learn's kNN classifier with the same neighbour count (3) used for
# the playML classifier above.
kNN_classifier = KNeighborsClassifier(n_neighbors=3)

In [86]:
# Fit on the training split produced by scikit-learn's train_test_split.
kNN_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [87]:
# Predict labels for the held-out test split (overwrites the earlier y_predict).
y_predict = kNN_classifier.predict(X_test)

In [88]:
# Count the test samples whose predicted label equals the true label.
np.sum(y_predict == y_test)

29

In [89]:
# Accuracy: the fraction of test samples that were predicted correctly.
np.mean(y_predict == y_test)

0.9666666666666667