In [1]:
%%time
from atrader import *
import pandas as pd
import numpy as np

# Code list for the 'SZAG' set over the sample period (atrader API).
code = get_code_list_set('SZAG','2005-01-01','2019-05-31')
# Read the cached CSV and attach each asset's name through its code.
# merge() defaults to an inner join, so rows whose code is absent from
# `code` are dropped here.
data = pd.read_csv('../data/szag_050101_190531.csv').merge(code[['code','name']], on='code')

Wall time: 5.04 s


### preprocess

In [2]:
def get_asset_code(df,name_col,thresh_hold):
    '''
    get assets that have trade records more than thresh_hold

    df          : DataFrame of trade records; needs the column named by
                  name_col and a 'close' column.
    name_col    : column holding the asset name; used both for grouping and
                  for the prefix filter below.
    thresh_hold : an asset is kept only when its number of non-null 'close'
                  values is strictly greater than this.

    Returns a DataFrame with columns name_col, 'close' (the record count,
    not a price) and 'is_ST_or_S' (always 0 in the result because flagged
    assets are removed).
    '''
    # 'count' only counts non-null closes, so missing prices do not count
    # as trade records.
    count_trade = df.groupby(name_col,as_index=False).agg({'close':'count'})
    # .copy() makes the filtered frame independent, so adding the flag
    # column below does not trigger SettingWithCopyWarning.
    count_trade = count_trade[count_trade['close']>thresh_hold].copy()
    # Flag names starting with '*' or 'S' (presumably *ST / ST / S-prefixed
    # special-treatment listings, going by the column name -- confirm).
    count_trade['is_ST_or_S'] = count_trade[name_col].apply(
        lambda x: 1 if x.startswith(('*','S')) else 0)
    return count_trade[count_trade['is_ST_or_S']==0]

def get_valid_asset(df,asset_code,last_date):
    '''
    get assets whose last trade time are later than last_date

    df         : DataFrame with at least 'name' and 'time' columns; 'time'
                 may already be datetime64 or be date / datetime strings.
    asset_code : iterable of asset names to consider.
    last_date  : anything pd.to_datetime accepts; an asset is kept when its
                 latest 'time' is on or after this moment.

    Returns a DataFrame with columns 'name' and 'time' (the latest trade
    time of each kept asset, as datetime64). The input frame is not
    modified.
    '''
    # Work on a copy so the datetime conversion never touches the caller's
    # frame.
    df = df[df['name'].isin(asset_code)].copy()
    # Parse before comparing so ordering is chronological rather than
    # lexicographic. No fixed format is used: a hard-coded '%Y-%m-%d'
    # rejects strings that carry a time of day such as
    # '2019-05-28 15:00:00'.
    df['time'] = pd.to_datetime(df['time'])
    # The per-asset maximum is the last trade time; NaT values are skipped.
    last_trade_time = df.groupby(['name'],as_index=False)['time'].max()
    last_trade_time = last_trade_time[last_trade_time['time']>=pd.to_datetime(last_date)]
    return last_trade_time

def get_lag_feat(df,feat,gb_c,lag):
    '''
    Add a shifted copy of column `feat`, computed within each `gb_c` group.

    df   : DataFrame containing `feat`, the `gb_c` columns and 'time'.
    feat : name of the column to shift.
    gb_c : list of group-by column names (it is concatenated with ['time']).
    lag  : periods passed to shift(); the new column is named
           'lag_<abs(lag)>_<feat>', so the sign of lag is not part of the
           name.

    Returns a new frame sorted by gb_c + ['time'] with the extra column;
    the frame passed in is not modified.
    '''
    lag_col = '_'.join(['lag', str(np.abs(lag)), str(feat)])
    # Order rows by time inside each group so shift() looks at the
    # neighbouring record of the same group.
    ordered = df.sort_values(gb_c+['time'])
    shifted = ordered.groupby(gb_c)[feat].shift(lag)
    ordered[lag_col] = shifted
    return ordered

In [3]:
# Parse timestamps once so every later sort / comparison is chronological.
data['time'] = pd.to_datetime(data['time'])
data = data.sort_values(by=['name','time'])
# Keep assets with more than 3000 trade records whose names are not
# flagged by get_asset_code ...
asset_code = get_asset_code(df=data, name_col='name', thresh_hold=3000)
# ... and that were still trading on or after 2019-05-28 15:00:00.
valid_asset = get_valid_asset(
    df=data,
    asset_code=asset_code['name'].unique(),
    last_date='2019-05-28 15:00:00',
)
# An inner join on name restricts the data to the surviving assets.
data = pd.merge(data, valid_asset[['name']], on='name')
# Previous close per asset, stored as 'lag_1_close'.
data = get_lag_feat(df=data, feat='close', gb_c=['name'], lag=1)

### build model
#### Train / validation split

In [58]:
# Asset universe and the sorted calendar of distinct timestamps.
assets = data['name'].unique()
dates = np.sort(data['time'].unique())

# Chronological split: the first 3000 timestamps are for training and the
# remainder for validation.
train_dates, val_dates = dates[:3000], dates[3000:]

# Feature configuration: numeric inputs, categorical inputs, and the
# number of distinct codes of each categorical input.
num_cols = ['return_1d']
cat_cols = ['is_null']
cat_dim = {'is_null': 2}

#### Training set generation

In [48]:
from itertools import product

# Hyper param
INP_DIM = 1     # numeric features per time step; must equal len(num_cols)
EMB_DIM = 10    # embedding width of each categorical feature
SEQ_LEN = 10    # time steps fed to the model
OUT_DIM = 1     # predicted values per time step

# Index into `dates` where the window starts. The window spans SEQ_LEN+1
# timestamps: the first SEQ_LEN are inputs, and the same series shifted by
# one step is the target.
i = 0
# Full (asset x timestamp) grid, so assets with no row on a given timestamp
# still get a row (with NaN prices after the left merge).
frames = pd.DataFrame(product(assets, dates[i:i+SEQ_LEN+1]),columns=['name','time'])
train_df = frames.merge(data[['name','time','close','lag_1_close']], on=['name','time'], how='left')
train_df['return_1d'] = (train_df['close']-train_df['lag_1_close'])/train_df['lag_1_close']
# The return is NaN when the asset has no row on that timestamp or no
# previous close. Record that in a 0/1 indicator, then neutralise the value.
train_df['is_null'] = train_df['return_1d'].isnull().astype('int64')
train_df['return_1d'] = train_df['return_1d'].fillna(0)


X = {}

# Numeric inputs: one (asset x time) matrix per column in num_cols, stacked
# on a trailing feature axis -> (n_assets, SEQ_LEN, len(num_cols)).
# np.stack keeps assets and features on separate axes; vstack + reshape
# would interleave them as soon as there is more than one numeric column.
num_X = []
for col in num_cols:
    num_X.append(pd.pivot_table(train_df[['name','time',col]], values=col, columns='time',index='name').values[:,:SEQ_LEN])
num_X = np.stack(num_X, axis=-1)
X['num'] = num_X

# Categorical inputs: (n_assets, SEQ_LEN) matrices keyed by column name.
for col in cat_cols:
    X[col] =  pd.pivot_table(train_df[['name','time',col]], values=col, columns='time',index='name').values[:,:SEQ_LEN]
    # The embedding vocabulary must cover every code that can occur. Never
    # shrink the declared cardinality just because this window happens to
    # contain fewer distinct values (e.g. no missing returns).
    cat_dim[col] = max(cat_dim.get(col, 0), int(train_df[col].max()) + 1)

# Target: the 1-day return one step ahead of each input step.
train_y = pd.pivot_table(train_df[['name','time','return_1d']], values='return_1d', columns='time',index='name').values[:,1:]
train_y = train_y.reshape(-1,SEQ_LEN,1)

In [49]:
from keras.layers import  *
from keras.callbacks import *
from keras.models import *

# Batch size is fixed to the number of assets in the window, so one batch
# holds every asset.
BS = len(train_df['name'].unique())

# Numeric branch: raw sequences go through an LSTM whose width matches the
# total embedding width of the categorical branch.
num_in = Input(batch_shape=(BS,SEQ_LEN,INP_DIM),name='num')
num_lstm = LSTM(EMB_DIM*len(cat_cols),return_sequences=True)(num_in)

# Categorical branch: one input and one embedding per categorical column.
cat_in = [Input(batch_shape=(BS,SEQ_LEN),name=name) for name in cat_cols]
cat_emb = [
    Embedding(cat_dim[name], EMB_DIM)(tensor)
    for name, tensor in zip(cat_cols, cat_in)
]

# Merge both branches per time step, then decode with two stacked LSTMs;
# the last one emits OUT_DIM values at every step.
X_in = Concatenate()([num_lstm]+cat_emb)
lstm_out = LSTM(64,return_sequences=True)(X_in)
final_out = LSTM(OUT_DIM, return_sequences=True)(lstm_out)

model = Model(inputs=[num_in]+cat_in, outputs=final_out)
model.summary()
model.compile(loss='mse',optimizer='adam')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num (InputLayer)                (702, 10, 1)         0                                            
__________________________________________________________________________________________________
is_null (InputLayer)            (702, 10)            0                                            
__________________________________________________________________________________________________
lstm_18 (LSTM)                  (702, 10, 10)        480         num[0][0]                        
__________________________________________________________________________________________________
embedding_10 (Embedding)        (702, 10, 10)        20          is_null[0][0]                    
__________________________________________________________________________________________________
concatenat

### Train

In [53]:
# Both callbacks watch the training loss (no validation data is passed to
# fit): stop after 10 epochs without improvement, and lower the learning
# rate after 5, never below 0.0005.
es = EarlyStopping(monitor='loss',patience=10)
red_lr = ReduceLROnPlateau(monitor='loss',min_lr=0.0005,patience=5)
# steps_per_epoch=1: presumably the whole array is consumed as a single
# batch of BS assets per epoch -- confirm against the installed Keras version.
model.fit(x=X, y=train_y, epochs=100, steps_per_epoch=1, callbacks=[es,red_lr])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1d7fdbef630>

<tf.Tensor 'input_5:0' shape=(702, 10, 1) dtype=float32>