### train_test_split 的实现

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [20]:
# 得到鸢尾花数据集
iris = datasets.load_iris()

In [3]:
X = iris.data
y=iris.target

### train_test_split 的实现

这个测试的方法：先将我们用到的数据分为两部分。一部分是训练数据集，用来建立模型，另一部分是测试 数据集，用于测试。如果测试部分的数据符合模型则说明模型是性能好的模型

In [4]:
# 1.将数据进行乱序处理
# 进行乱序处理：因为X和y 的值都是按照种类顺序进行排序的，所以要进行随机处理。并且保证 X，y 随机的规则是一样的
# 方法1:将 X 中的元素进行一个索引的随机排列
shuffle_indexes = np.random.permutation(len(X))
shuffle_indexes
# 方法二：将 y 结果看做一个属性，使得X 成为一个

array([ 24,  42, 112,   3, 146,  44,  18, 130,  79, 116,  67,   9,  26,
       132,   2, 101,   4,  45, 145,  62, 148, 127,  57, 122,  12, 129,
        51,  90, 142,  47, 126, 133,  49,  86,  69, 119,  54, 105,  27,
       123,  88, 143, 137, 114,  91, 125, 147,  50,  21,  53,  19, 139,
        46,  83, 140,  36,  10,  40,  87, 141, 104,  98,  89,  75,   7,
        22, 106, 103,  58, 107,   6, 102,  16, 136,  92,  34, 118,  84,
        33,  60,  43,  41,  17, 128,  80,  29,  78,   0,  85,  68,  95,
        15,  73, 144, 121,  74,  28,  72,  82,  55,  20,  37, 149, 131,
        99,  93,  63,  32,  66,  61, 115,  14,  77, 124,  96, 113, 110,
        31,  25,  23,  38,  71,  35, 100, 109,  64, 134,  76,  52, 138,
        70, 135,  81,  97,  48, 117,  59, 120,  65,  11,   5, 111,   8,
       108,  56,  30,  39,  94,  13,   1])

In [5]:
# 2. 规定测试数据集的大小
test_ratio = 0.2
test_size = int(len(X)*test_ratio)

In [6]:
# 前面为测试集的下标，取出相应测试集的样本下标，成为一维数组
test_indexes = shuffle_indexes[:test_size]
# 后面的为训练集的下标
train_indexes = shuffle_indexes[test_size:]

In [7]:
# 根据下标（索引值），形成相应的训练数组
X_train = X[train_indexes]
y_train = y[train_indexes]

In [8]:
# 根据下标（索引值），形成相应的测试数组
X_test = X[test_indexes]
y_test = y[test_indexes]

### train_test_split 封装实现

In [9]:
from pycharm_knn.model_selection import train_test_split

In [10]:
# 1.将数据进行分类
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(120, 4)
(120,)
(30, 4)
(30,)


In [11]:
# 2.使用算法
from pycharm_knn.KNN import KNNClassifier

In [12]:
# 创建实例对象
my_knn_clf = KNNClassifier(k=3)

In [13]:
# 数据拟合
my_knn_clf.fit(X_train,y_train)

KNN(k=3)

In [14]:
# 预测结果
y_predict = my_knn_clf.predict(X_test)
y_predict

array([1, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 2, 2, 0, 0, 2, 1, 0, 2,
       0, 0, 2, 0, 2, 1, 1, 1])

In [15]:
# 将预测结果与 y_text 的数据进行比对就可以知道准确率了
# 即数两个数组中有几个元素一样
sum(y_predict == y_test)

30

In [16]:
# 求准确率是多少
sum(y_predict == y_test)/len(y_test)

1.0

### sklearn 中的 train_test_split

In [21]:
# 引用 sklearn 的 train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# 1.test_size 默认是 0.2,random_state 随机种子是  666
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=666)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(120, 4)
(120,)
(30, 4)
(30,)


In [24]:
#2.创建实例对象，建立模型
KNN_Classifier = KNeighborsClassifier(n_neighbors=6)

In [None]:
#3.数据拟合
KNN_Classifier.fit(X_train,y_train)

### 数据测试

In [27]:
# 4.预测
y_predict = KNN_Classifier.predict(X_test)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [30]:
#5.计算 y_predit 与 y_test 的重合个数
chonghe=sum(y_predict == y_test)

In [31]:
# 6.计算重合的比率
chonghe / len(y_test)

1.0