In [1]:
import h5py
import numpy as np
from tqdm.notebook import tqdm
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder
import os
import datetime
import glob

In [2]:
# Collect the input HDF5 files; sorted() makes the processing order deterministic.
train_f = sorted(glob.glob('../stock_price/train/*.hdf'))
# Disabled: exclusion of one specific training file from the list.
# train_f.remove('../stock_price/train/20190425_20190510.hdf')
test_f = sorted(glob.glob('../stock_price/test/*.hdf'))

In [3]:
# Display the discovered training files as a quick sanity check.
train_f

['../stock_price/train\\20190401_20190412.hdf',
 '../stock_price/train\\20190412_20190424.hdf',
 '../stock_price/train\\20190425_20190510.hdf',
 '../stock_price/train\\20190510_20190522.hdf',
 '../stock_price/train\\20190523_20190604.hdf',
 '../stock_price/train\\20190604_20190617.hdf',
 '../stock_price/train\\20190617_20190628.hdf',
 '../stock_price/train\\20190628_20190710.hdf',
 '../stock_price/train\\20190710_20190723.hdf',
 '../stock_price/train\\20190723_20190731.hdf']

In [4]:
def get_tick_hour_minute_str(tick):
    """Return the time of day encoded by ``tick`` as a zero-padded ``"HHMM"`` string.

    ``tick`` is a count of 100-nanosecond intervals elapsed since
    0001-01-01 00:00:00 (ten ticks per microsecond).

    Args:
        tick: Tick count as a Python/numpy integer or a float. Anything
            below one microsecond is discarded.

    Returns:
        str: Four characters such as ``"0930"``.

    Raises:
        OverflowError: If the resulting date is outside the ``datetime`` range.
    """
    # Floor-divide in integer arithmetic. ``tick / 10`` is a float, and at
    # present-day magnitudes (~6e17 ticks) a float cannot represent every
    # microsecond, so a tick just before a minute boundary could be rounded
    # into the following minute.
    micros = int(tick) // 10
    dt = datetime.datetime(1, 1, 1) + datetime.timedelta(microseconds=micros)
    return dt.strftime("%H%M")

def get_tick_date_time(tick):
    """Convert ``tick`` to a naive ``datetime.datetime``.

    ``tick`` is a count of 100-nanosecond intervals elapsed since
    0001-01-01 00:00:00 (ten ticks per microsecond).

    Args:
        tick: Tick count as a Python/numpy integer or a float. Anything
            below one microsecond is discarded.

    Returns:
        datetime.datetime: The corresponding instant, with microsecond resolution.

    Raises:
        OverflowError: If the resulting date is outside the ``datetime`` range.
    """
    # Integer floor division keeps the microsecond field exact. ``tick / 10``
    # would go through a float, which at ~6e17 ticks only resolves steps of
    # several microseconds.
    micros = int(tick) // 10
    return datetime.datetime(1, 1, 1) + datetime.timedelta(microseconds=micros)

def get_tick_weekday(tick):
    """Return the weekday of the instant encoded by ``tick`` (Monday is 0, Sunday is 6).

    ``tick`` is a count of 100-nanosecond intervals elapsed since
    0001-01-01 00:00:00 (ten ticks per microsecond).

    Args:
        tick: Tick count as a Python/numpy integer or a float. Anything
            below one microsecond is discarded.

    Returns:
        int: ``datetime.weekday()`` of the converted instant.

    Raises:
        OverflowError: If the resulting date is outside the ``datetime`` range.
    """
    # Integer floor division avoids the float rounding of ``tick / 10``, which
    # could push a tick just before midnight into the next day.
    micros = int(tick) // 10
    dt = datetime.datetime(1, 1, 1) + datetime.timedelta(microseconds=micros)
    return dt.weekday()

def encode_time(dt):
    """Encode a timestamp as a coarse time-of-day code.

    The ``HHMM`` value of ``dt`` is bucketed as follows: values up to and
    including 1000 map to 0, values above 1000 up to and including 1500 map
    to 1.

    Args:
        dt: Object exposing ``strftime`` (e.g. ``datetime.datetime``).

    Returns:
        int: 0 or 1.

    Raises:
        ValueError: If the ``HHMM`` value is greater than 1500.
    """
    hhmm = int(dt.strftime("%H%M"))
    # Reject anything later than 15:00 up front.
    if hhmm > 1500:
        raise ValueError('时间数据出错')
    return 0 if hhmm <= 1000 else 1

# Wrap the scalar helpers as NumPy ufuncs so they can be applied element-wise
# to arrays; np.frompyfunc(func, 1, 1) takes one input and yields one
# object-dtype output.
get_hour_minute_str_ = np.frompyfunc(get_tick_hour_minute_str, 1, 1)
get_tick_date_time_ = np.frompyfunc(get_tick_date_time, 1, 1)
get_tick_weekday_ = np.frompyfunc(get_tick_weekday, 1, 1)
encode_time_ = np.frompyfunc(encode_time, 1, 1)

# NOTE(review): not referenced in the visible cells; presumably the category
# list intended for the imported OneHotEncoder -- confirm before removing.
categories = [np.array([0, 1, 10, 11, 20, 21, 30, 31, 40, 41],
                       dtype=object)]

In [5]:
def parallelize_data_file(file_list, func, n_cores=4):
    """Split ``file_list`` into ``n_cores`` chunks and run ``func`` on each chunk in a process pool.

    Args:
        file_list: Sequence of file paths; it is partitioned with
            ``np.array_split``, so each chunk reaches ``func`` as a NumPy array.
        func: Callable invoked once per chunk in a worker process. Its return
            value is ignored.
        n_cores: Number of worker processes and number of chunks.

    Returns:
        None. The function blocks until every chunk has been processed.

    Raises:
        Exception: Whatever ``func`` raised in a worker is re-raised here by
            ``Pool.map``.
    """
    # NOTE(review): ``func`` must be picklable/importable by the workers.
    # Functions defined interactively in a notebook may not be under the
    # "spawn" start method (the default on Windows) -- confirm on the target OS.
    file_split = np.array_split(file_list, n_cores)
    pool = Pool(n_cores)
    try:
        print('start pool map:', len(file_split))
        pool.map(func, file_split)
    except BaseException:
        # Do not leave worker processes behind when a chunk fails or the
        # run is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

def data_pro_parallelize(hdf_files):
    """Run ``data_pro`` sequentially on every path in ``hdf_files``.

    Intended as the per-chunk worker for ``parallelize_data_file``.

    Args:
        hdf_files: A NumPy array of paths (as produced by ``np.array_split``),
            a plain list/tuple of paths, or a single path.

    Returns:
        None.
    """
    # np.atleast_1d accepts arrays, plain sequences and a lone path alike;
    # .tolist() hands plain Python strings to data_pro.
    for path in np.atleast_1d(hdf_files).tolist():
        data_pro(path)

In [6]:
class MyDataset_p():
    """Load one HDF5 file and build lagged label windows from it.

    The file must provide the datasets ``x``, ``y`` and ``timestamp``. On load
    ``y`` is standardised with the file's own mean and variance. Calling the
    instance returns the features together with, for every sample, a window of
    ``shift + label_len + 1`` consecutive ``y`` values and a synthetic price
    path derived from that window.

    Args:
        file_name: Path of the HDF5 file to read.
        label_len: Together with the fixed ``shift`` of 9, sets the window length.
        initpoint: Starting value of the synthetic price path.
        batch_size: Divisor used by ``len()``. With the default of 1,
            ``len(dataset)`` equals the number of rows in the file.
    """

    def __init__(self, file_name, label_len=10, initpoint=1000, batch_size=1):
        self.name = file_name
        self.__read_data__()
        self.n = self.y.shape[0]
        self.indexes = np.arange(self.n)
        # NOTE(review): not used by any method of this class; presumably a
        # feature-column selection -- confirm.
        self.mask = list(range(15))  # [1, 3, 4, 5, 6, 7, 8, 9]# [0, 2, 10, 11, 12, 13, 14]
        self.label_len = label_len
        self.shift = 9
        self.initpoint = initpoint
        # Read by __len__.
        self.batch_size = batch_size

    def __read_data__(self):
        """Read ``x``, ``y`` and ``timestamp`` from the file and standardise ``y``."""
        # The context manager closes the file even when a dataset is missing.
        with h5py.File(self.name, 'r') as f:
            self.x = f['x'][:]
            self.y = f['y'][:]
            self.ts = f['timestamp'][:]
        self.var = np.var(self.y)
        self.mean = np.mean(self.y)

        self.y = (self.y - self.mean) / np.sqrt(self.var)
        # A filter keeping only rows whose encoded time is 0 was sketched here
        # at one point; it is not applied.

    def __call__(self):
        """Build the label windows and return them with the aligned features.

        Returns:
            tuple: ``(x, Y, ts, var, mean)`` where ``x`` and ``ts`` are the
            stored arrays without their first ``shift + label_len`` rows,
            ``Y`` has one row per remaining sample, ``shift + label_len + 1``
            window positions, and a last axis holding the standardised ``y``
            values followed by the price path; ``var`` and ``mean`` are the
            statistics used to standardise ``y``.

        The stored arrays are not modified, so repeated calls return the same
        result.
        """
        # Number of past values attached to each sample. Derived from the
        # attributes so that x/ts stay aligned with Y for any label_len.
        span = self.shift + self.label_len
        n_samples = max(self.y.shape[0] - span, 0)

        # Look back in time: block k holds y[r + k] for sample r, so the last
        # block is the sample's own value and block 0 the oldest one.
        Y = np.hstack([self.y[k:k + n_samples] for k in range(span + 1)])

        # Compute the price path: start at initpoint and compound each window
        # value as a percentage change.
        pY = np.empty(Y.shape, dtype=np.float32)
        pY[:, 0] = self.initpoint
        for j in range(1, span + 1):
            pY[:, j] = (Y[:, j] / 100 + 1) * pY[:, j - 1]

        Y = np.concatenate((Y[:, :, np.newaxis], pY[:, :, np.newaxis]), axis=2)

        return self.x[span:], Y, self.ts[span:], self.var, self.mean

    def __len__(self):
        """Return the number of batches: ``ceil(n / batch_size)``."""
        return int(np.ceil(self.n / self.batch_size))

    def __del__(self):
        """Drop references to the loaded arrays.

        Tolerates attributes that were never set, e.g. when loading failed.
        """
        for attr in ('x', 'y', 'indexes', 'ts'):
            self.__dict__.pop(attr, None)

In [9]:
def data_pro(file, out_dir='test_'):
    """Pre-process one HDF5 file and write the result to ``out_dir``.

    The file is loaded through ``MyDataset_p``; the arrays it returns are
    stored as the datasets ``x``, ``y``, ``timestamp``, ``var`` and ``mean``
    in ``<out_dir>/<input base name>.hdf``.

    Args:
        file: Path of the input HDF5 file.
        out_dir: Directory receiving the output file; created if missing.

    Returns:
        None.
    """
    fname = os.path.splitext(os.path.split(file)[-1])[0]
    print('Processing'+fname)
    dataset = MyDataset_p(file)
    x, y, ts, v, m = dataset()

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    payload = {'x': x, 'y': y, 'timestamp': ts, 'var': v, 'mean': m}
    with h5py.File(os.path.join(out_dir, fname + '.hdf'), 'a') as new_f:
        for key, value in payload.items():
            # In append mode create_dataset refuses an existing name, which
            # made a second run over the same file fail; replace instead.
            if key in new_f:
                del new_f[key]
            new_f.create_dataset(key, data=value)
    print('Done'+fname)

In [10]:
# Pre-process every test file sequentially; data_pro prints a
# 'Processing…' / 'Done…' line for each one.
for i in test_f:
    data_pro(i)

ProcessingIC1909_20190801
DoneIC1909_20190801
ProcessingIC1909_20190802
DoneIC1909_20190802
ProcessingIC1909_20190805
DoneIC1909_20190805
ProcessingIC1909_20190806
DoneIC1909_20190806
ProcessingIC1909_20190807
DoneIC1909_20190807
ProcessingIC1909_20190808
DoneIC1909_20190808
ProcessingIC1909_20190809
DoneIC1909_20190809
ProcessingIC1909_20190812
DoneIC1909_20190812
ProcessingIC1909_20190813
DoneIC1909_20190813
ProcessingIC1909_20190814
DoneIC1909_20190814
ProcessingIC1909_20190815
DoneIC1909_20190815
ProcessingIC1910_20190819
DoneIC1910_20190819
ProcessingIC1910_20190820
DoneIC1910_20190820
ProcessingIC1910_20190821
DoneIC1910_20190821
ProcessingIC1910_20190822
DoneIC1910_20190822
ProcessingIC1910_20190823
DoneIC1910_20190823
ProcessingIC1910_20190826
DoneIC1910_20190826
ProcessingIC1910_20190827
DoneIC1910_20190827
ProcessingIC1910_20190828
DoneIC1910_20190828
ProcessingIC1910_20190829
DoneIC1910_20190829
ProcessingIC1910_20190830
DoneIC1910_20190830
ProcessingIC1910_20190902
DoneIC19