- 使用DataReader类替代SimpleDataReader类，可以使用留出法监控训练过程  
  - 既有训练集，又有测试集
  - 提供Validation方法，从训练集中划分出验证集

In [None]:
import pathlib as Path
import numpy as np


class DataReader:
    """Container for train / test / validation sample sets.

    The ``*Raw`` attributes hold the data as supplied (not normalized); the
    attributes without the ``Raw`` suffix hold the working copies used for
    training, validation and testing.

    NOTE(review): the private helper ``__NormalizeX`` used by ``NormalizeX``
    and the ``NetType`` enum used by ``NormalizeY`` are not defined in the
    part of the file reviewed here -- confirm they exist elsewhere.
    """

    def __init__(self, train_file, test_file):
        """Remember the data file names and reset every data set to empty.

        Args:
            train_file: path of the training data file.
            test_file: path of the test data file.
        """
        self.train_file_name = train_file
        self.test_file_name = test_file
        self.num_train = 0
        self.num_test = 0
        self.num_validation = 0
        self.num_feature = 0
        self.num_category = 0
        # working sets (may be normalized)
        self.XTrain = None
        self.YTrain = None
        self.XTest = None
        self.YTest = None
        # raw sets, not normalized
        self.XTrainRaw = None
        self.YTrainRaw = None
        self.XTestRaw = None
        self.YTestRaw = None
        self.XVld = None  # validation feature set
        self.YVld = None  # validation label set

    def ReadData(self):
        """Expose the raw arrays as the working sets when the data files exist.

        For each of the train and test files: if the file exists, the working
        X/Y attributes are pointed at the corresponding raw arrays and the
        sample counter is refreshed. A missing file leaves that data set
        untouched.

        NOTE(review): nothing here reads the file contents; the ``*Raw``
        attributes have to be populated by other code -- confirm.
        """
        # Local import so that ``Path`` is the class here, regardless of what
        # the module-level name ``Path`` happens to be bound to.
        from pathlib import Path

        if Path(self.train_file_name).exists():
            self.XTrain = self.XTrainRaw
            self.YTrain = self.YTrainRaw
            if self.XTrainRaw is not None:
                # Validation() splits by num_train, so keep it in sync.
                self.num_train = len(self.XTrainRaw)

        if Path(self.test_file_name).exists():
            self.XTest = self.XTestRaw
            self.YTest = self.YTestRaw
            if self.XTestRaw is not None:
                self.num_test = len(self.XTestRaw)

    def NormalizeX(self):
        """Normalize the feature values of the train and test sets together.

        The raw train and test features are stacked, normalized in one pass by
        ``__NormalizeX`` so both sets share the same scaling, and then split
        back into ``XTrain`` and ``XTest``.
        """
        merged = np.vstack((self.XTrainRaw, self.XTestRaw))  # merge first, then normalize
        merged_norm = self.__NormalizeX(merged)
        train_count = self.XTrainRaw.shape[0]
        self.XTrain = merged_norm[:train_count, :]
        self.XTest = merged_norm[train_count:, :]

    def NormalizeY(self, nettype, base=0):
        """Normalize the label values according to the network type.

        Intended behaviour, per the original notes (the branch bodies are
        placeholders in this file):
          * regression: map all values into [0, 1];
          * binary classification: turn labels into 0/1, where ``base`` is the
            label value of the negative class in the raw data;
          * multi-class classification: turn labels into one-hot encoding.

        NOTE(review): ``NetType.fitting`` is lowercase while the other two
        members are PascalCase -- confirm against the ``NetType`` definition.
        """
        if nettype == NetType.fitting:
            ...
        elif nettype == NetType.BinaryClassifier:
            ...
        elif nettype == NetType.MultipleClassifier:
            ...

    def Validation(self, k=0):
        """Move the first ``1/k`` of the training samples into the validation set.

        The split takes the leading rows as they currently are; this method
        does not shuffle, so shuffle beforehand if a random split is wanted.

        Args:
            k: the validation set gets ``int(num_train / k)`` samples. A value
                of 0 (the default) or below means "no validation set": the
                validation set becomes empty and the training set is kept whole.
        """
        # Guard against the default k=0, which would otherwise divide by zero.
        self.num_validation = int(self.num_train / k) if k > 0 else 0
        self.num_train -= self.num_validation
        # validation set
        self.XVld = self.XTrain[: self.num_validation]
        self.YVld = self.YTrain[: self.num_validation]
        # train set
        self.XTrain = self.XTrain[self.num_validation:]
        self.YTrain = self.YTrain[self.num_validation:]

    def GetBatchTrainSamples(self, batch_size, iteration):
        """Return the ``iteration``-th mini-batch of the training set.

        Args:
            batch_size: number of samples per batch.
            iteration: zero-based batch index.

        Returns:
            Tuple ``(batch_X, batch_Y)`` holding rows
            ``[iteration * batch_size, (iteration + 1) * batch_size)`` of
            ``XTrain`` and ``YTrain``.
        """
        start = iteration * batch_size
        end = start + batch_size
        batch_X = self.XTrain[start:end, :]
        batch_Y = self.YTrain[start:end, :]
        return batch_X, batch_Y

    def Shuffle(self):
        """Shuffle the raw training samples, keeping features and labels paired.

        One random index permutation is applied to both arrays, so the global
        NumPy random state is consumed but never re-seeded.

        NOTE(review): only ``XTrainRaw`` / ``YTrainRaw`` are replaced; the
        working ``XTrain`` / ``YTrain`` keep their current contents.

        Raises:
            ValueError: if the feature and label arrays differ in length.
        """
        features = np.asarray(self.XTrainRaw)
        labels = np.asarray(self.YTrainRaw)
        if len(features) != len(labels):
            raise ValueError(
                "XTrainRaw and YTrainRaw must contain the same number of samples"
            )
        order = np.random.permutation(len(features))
        self.XTrainRaw = features[order]
        self.YTrainRaw = labels[order]
        