## 留出法
### 1、简单的留出法

In [15]:
from sklearn.datasets import load_iris
import numpy as np

# X holds the feature matrix and y the class labels.
X, y = load_iris(return_X_y=True)
print(X[:2])

# Shapes of X and y.
print("X的形状:",X.shape)
print("y的形状:",y.shape)

# Plain hold-out split without shuffling: the last n_test samples (50, not 100)
# form the test set and everything before them is the training set.
n_test = 50
X_train, y_train = X[:-n_test], y[:-n_test]
X_test, y_test = X[-n_test:], y[-n_test:]

# Number of samples on each side of the split.
print("训练集样本数：",len(X_train))
print("测试集样本数：",len(X_test))

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]]
X的形状: (150, 4)
y的形状: (150,)
训练集样本数： 100
测试集样本数： 50


### 2、留出法中的数据分布

In [13]:
# Print the labels on each side of the split to inspect the class distribution.
for caption, labels in (("训练集中的标签：", y_train), ("测试集中的标签：", y_test)):
    print(caption, labels)

训练集中的标签： [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
测试集中的标签： [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2]


### 3、加入随机打散的留出法

In [18]:
# np.random.permutation(n) returns an ndarray holding the integers 0..n-1 in
# random order (0..149 for the 150 iris samples).
n_samples = len(X)
shuffle_indexes = np.random.permutation(n_samples)
print(shuffle_indexes)

[ 29  97  33   6  38  56  81  76  17   7   2  85  95 120  70  28  39 133
 103  36  21 102  80 146  25  68  79   5 147 145 131  58   0  37 129 119
  82  69  15  59  43 122  74  96  31 113  78  34  11  89 137  62  35 124
  61  83  44 140 106  52 142  41 132  23  51  55  77 127 149  20  19  27
  75   3  50 143  92  93 101  42  14  66 110  57   9 135  26  16 109 128
  67 130  47 105  63 139 148 141   8 111 107  40  54  72  32 108  87  13
  73 138  64  22 117 125  65  94  91  90  10 126 114   4  84  71 144 123
  49 115 104  30  88 116 112  18  24  46  86  45  60  12  53   1 136  98
 118 134 121 100  99  48]


In [23]:
# Fraction of the samples reserved for the test set.
test_ratio = 0.3
# The product is generally not a whole number, so it is truncated to an int.
test_size = int(len(X) * test_ratio)
print(test_size)

45


In [24]:
# The first test_size shuffled indices (test_ratio = 0.3, i.e. 30% of the data)
# select the test set; the remaining shuffled indices select the training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

# Fancy-index features and labels with the same index arrays so they stay aligned.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

print(y_test)

[0 1 0 0 0 1 1 1 0 0 0 1 1 2 1 0 0 2 2 0 0 2 1 2 0 1 1 0 2 2 2 1 0 0 2 2 1
 1 0 1 0 2 1 1 0]


### 4、Sklearn中的留出法

In [29]:
from sklearn.model_selection import train_test_split

# Let scikit-learn perform the hold-out split, reserving 30% of the samples for
# testing. (help(train_test_split) lists the remaining options.)
split = train_test_split(X, y, test_size=0.3)
X_train, X_test, y_train, y_test = split
print(y_test)

[0 2 2 1 1 1 1 1 2 1 0 2 0 0 1 0 2 1 0 2 2 1 2 2 1 1 2 0 1 2 1 1 0 2 2 0 0
 1 1 2 2 0 1 0 2]


### 5、留出法的问题

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Draw a fresh random split with 30% of the samples held out for testing
# (help(train_test_split) lists the remaining options).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train the model on the training part.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Print the model's score on the test part (see help(clf.score)).
# Different splits give noticeably different scores.
test_score = clf.score(X_test, y_test)
print(test_score)

0.9555555555555556


## 交叉验证法
### 1、实现交叉验证法

In [108]:
num_folds = 5
n_samples = len(X)
# Average fold size; fractional when n_samples is not a multiple of num_folds.
num_fold_samples = n_samples / num_folds
res = []

# Shuffle once so that every fold mixes the classes. X and y are rebound to the
# shuffled arrays, so cells executed after this one see the shuffled order.
shuffle_indexes = np.random.permutation(n_samples)
X, y = X[shuffle_indexes], y[shuffle_indexes]

for fold in range(num_folds):
    # Test-fold boundaries are ceil(fold * n_samples / num_folds), evaluated with
    # integer arithmetic (-(-a // b) == ceil(a / b)) so that floating-point
    # rounding can never shift a boundary by one.
    start_inx = -(-fold * n_samples // num_folds)
    end_inx = -(-(fold + 1) * n_samples // num_folds)
    print(start_inx,end_inx)
    # Everything outside [start_inx, end_inx) is training data; build the index
    # once and reuse it for both features and labels.
    train_inx = np.r_[:start_inx, end_inx:n_samples]
    X_train, y_train = X[train_inx], y[train_inx]
    X_test, y_test = X[start_inx:end_inx], y[start_inx:end_inx]
    # Fit on the training folds and score on the held-out fold.
    clf = LogisticRegression().fit(X_train, y_train)
    res.append(clf.score(X_test, y_test))
print("结果列表：", res)
print("最终结果：", np.average(res))

0 114
114 228
228 342
342 456
456 569
结果列表： [0.9385964912280702, 0.956140350877193, 0.9385964912280702, 0.9649122807017544, 0.9557522123893806]
最终结果： 0.9507995652848937


### 2、Sklearn中的交叉验证

In [95]:
from sklearn.model_selection import KFold
# shuffle=True makes KFold randomise the sample order itself, so this cell does
# not rely on X and y having been shuffled by an earlier cell. The iris labels
# are stored class by class, so contiguous unshuffled folds would be badly
# unbalanced.
kf = KFold(n_splits=5, shuffle=True)
res = []
for train_index, test_index in kf.split(X):
    # kf.split yields integer index arrays for the training and test folds.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    # Fit on the training folds and score on the held-out fold.
    clf = LogisticRegression().fit(X_train, y_train)
    res.append(clf.score(X_test, y_test))
print("结果列表：", res)
print("最终结果：", np.average(res))

结果列表： [0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9333333333333333, 0.9666666666666667]
最终结果： 0.9533333333333334


### 3、作业
在 “1、实现交叉验证法” 基础上实现留10法