In [1]:
%%time
from atrader import *
import pandas as pd
import numpy as np

code = get_code_list_set('SZAG','2005-01-01','2019-05-31')
data = pd.read_csv('../data/szag_050101_190531.csv')
data = data.merge(code[['code','name']], on='code')

Wall time: 4min 40s


### preprocess

In [2]:
def get_asset_code(df,name_col,thresh_hold):
    '''
    get assets that have trade records more than thresh_hold
    '''
    count_trade = df.groupby('name',as_index=False).agg({'close':'count'})
    count_trade = count_trade[count_trade.close>thresh_hold]
    count_trade['is_ST_or_S'] = count_trade['name'].apply(lambda x: 1 \
                                                          if x.startswith('*') or x.startswith('S') else 0)
    count_trade = count_trade[count_trade.is_ST_or_S==0]
    return count_trade

def get_valid_asset(df,asset_code,last_date):
    '''
    get assets whose last trade time are later than last_date
    '''
    df = df[df.name.isin(asset_code)]
    df = df.sort_values(['time'])
    last_trade_time = df.groupby(['name'],as_index=False)['time'].last()
    last_trade_time['time'] = pd.to_datetime(last_trade_time['time'], format='%Y-%m-%d')
    last_trade_time = last_trade_time[last_trade_time.time>=pd.to_datetime(last_date)]
    return last_trade_time

def get_lag_feat(df,feat,gb_c,lag):
    '''
    get lag features
    '''
    df = df.sort_values(gb_c+['time'])
    df['lag_'+str(np.abs(lag))+'_'+str(feat)] = df.groupby(gb_c)[feat].shift(lag)
    return df

In [3]:
data['time'] = pd.to_datetime(data['time'])
data = data.sort_values(['name','time'])
asset_code = get_asset_code(data,'name',3000)
valid_asset = get_valid_asset(data, asset_code.name.unique(), '2019-05-28 15:00:00')
data = data.merge(valid_asset[['name']], on='name')
data = get_lag_feat(data, 'close', ['name'], 1)

In [4]:
data.head()

Unnamed: 0,time,code,open,high,low,close,volume,amount,open_interest,name,lag_1_close
0,2005-01-04 15:00:00,sse.600742,1.12844,1.13348,1.10829,1.11333,3056.0,1357672.0,,一汽富维,
1,2005-01-05 15:00:00,sse.600742,1.11584,1.14355,1.10073,1.13348,2400.0,1071997.0,,一汽富维,1.11333
2,2005-01-06 15:00:00,sse.600742,1.13348,1.13851,1.11081,1.11584,2027.0,900922.0,,一汽富维,1.13348
3,2005-01-07 15:00:00,sse.600742,1.11333,1.14859,1.11333,1.13096,2304.0,1036593.0,,一汽富维,1.11584
4,2005-01-10 15:00:00,sse.600742,1.12844,1.14607,1.12592,1.14355,2695.0,1218918.0,,一汽富维,1.13096


### feature engineering

In [9]:
def get_beta(df):
    pass

def create_confidence(x):
    '''
    idea: 
    '''
    if x>0.05:
        return 

def get_label(df,mkt_res):
    if mkt_res:
        pass
    else:
        df['return_1d'] = (df['close']-df['lag_1_close'])/df['lag_1_close']
    
    df['confidence'] = df['retu']
    return df

data = get_label(data, False)

In [12]:
data.head()

Unnamed: 0,time,code,open,high,low,close,volume,amount,open_interest,name,lag_1_close,return_1d
0,2005-01-04 15:00:00,sse.600742,1.12844,1.13348,1.10829,1.11333,3056.0,1357672.0,,一汽富维,,
1,2005-01-05 15:00:00,sse.600742,1.11584,1.14355,1.10073,1.13348,2400.0,1071997.0,,一汽富维,1.11333,0.0181
2,2005-01-06 15:00:00,sse.600742,1.13348,1.13851,1.11081,1.11584,2027.0,900922.0,,一汽富维,1.13348,-0.01556
3,2005-01-07 15:00:00,sse.600742,1.11333,1.14859,1.11333,1.13096,2304.0,1036593.0,,一汽富维,1.11584,0.01354
4,2005-01-10 15:00:00,sse.600742,1.12844,1.14607,1.12592,1.14355,2695.0,1218918.0,,一汽富维,1.13096,0.01114


#### build model

In [22]:
from keras.layers import  *
from keras.callbacks import *
from keras.models import *


# Hyper param
INP_DIM = 1
EMB_DIM = 10
SEQ_LEN = 10
OUT_DIM = 1
BS = len(assets)

assets = data.name.unique()
dates = np.sort(data.time.unique())
num_cols = ['return_1d']
cat_cols = ['is_traded']
cat_dim = {}
cat_dim['is_traded'] = 2
train_dates = dates[:3000]
val_dates = dates[3000-SEQ_LEN:]

def build_model():
    num_in = Input(batch_shape=(BS,SEQ_LEN,INP_DIM),name='num')
    num_lstm = LSTM(EMB_DIM,,activations='relu',return_sequences=True)(num_in)
    cat_in = [Input(batch_shape=(BS,SEQ_LEN),name=x) for x in cat_cols]
    cat_emb = []
    for i,col in enumerate(cat_cols):
        cat_emb.append(Embedding(cat_dim[col], EMB_DIM)(cat_in[i]))
    X_in = Concatenate()([num_lstm]+cat_emb)
    lstm_out = LSTM(64,activations='relu',return_sequences=True)(X_in)
    final_out = LSTM(OUT_DIM, activations='tanh',return_sequences=True)(lstm_out)

    model = Model(inputs=[num_in]+cat_in, outputs=final_out)
    model.summary()
    model.compile(loss='mse',optimizer='adam')
    return model


#### Train

In [38]:
from itertools import product
from tqdm import *

es = EarlyStopping(monitor='loss',patience=10)
red_lr = ReduceLROnPlateau(monitor='loss',min_lr=0.0005,patience=5)
epochs = 10
X_in = []
ys = []

model = build_model()
for epoch in range(epochs):
    
    for i in trange(len(train_dates)-SEQ_LEN-1):
        
        if epoch == 0:
            
            X= {}
            
            frames = pd.DataFrame(product(assets, dates[i:i+SEQ_LEN+1]),columns=['name','time'])
            train_df = frames.merge(data[['name','time','close','lag_1_close','return_1d']], on=['name','time'], how='left')
            train_df['is_traded'] = 0
            train_df.loc[train_df['return_1d'].isnull()==True, 'is_null'] = 1
            train_df['return_1d'] = train_df['return_1d'].fillna(0)

            num_X = []
            for col in num_cols:
                num_X.append(pd.pivot_table(train_df[['name','time','return_1d']], values='return_1d', columns='time',index='name').values[:,:-1] )
            num_X = np.vstack(num_X).reshape(-1,SEQ_LEN,1)
            X['num'] = num_X
            
            for col in cat_cols:
                X[col] =  pd.pivot_table(train_df[['name','time',col]], values=col, columns='time',index='name').values[:,:SEQ_LEN]
                cat_dim[col] = len(train_df[col].unique())
            
            train_y = pd.pivot_table(train_df[['name','time','return_1d']], values='return_1d', columns='time',index='name').values[:,1:]
            train_y = train_y.reshape(-1,SEQ_LEN,1)
            
            X_in.append(X)
            ys.append(train_y)
            
            history = model.fit(X,train_y,epochs=1,batch_size=BS,verbose=0)
            
        else:
            
            model.fit(X_in[i], ys[i], epochs=1, batch_size=BS)
            
    print(print('epoch: {},  loss: {}'.format(epoch, np.mean(history.history['loss']))))

 40%|██████████████████████████████▉                                               | 1187/2989 [14:17<36:47,  1.22s/it]

KeyboardInterrupt: 

{'num': array([[[ 0.        ],
         [ 0.01809955],
         [-0.01555556],
         ...,
         [ 0.00440529],
         [-0.00657895],
         [-0.03090508]],
 
        [[ 0.        ],
         [ 0.02493766],
         [ 0.01459854],
         ...,
         [ 0.00932401],
         [-0.02771363],
         [-0.02612827]],
 
        [[ 0.        ],
         [ 0.02583026],
         [ 0.01258993],
         ...,
         [ 0.02108963],
         [-0.02237522],
         [-0.02992958]],
 
        ...,
 
        [[ 0.        ],
         [ 0.06459948],
         [-0.00728155],
         ...,
         [-0.00243902],
         [-0.00488998],
         [-0.04422604]],
 
        [[ 0.        ],
         [ 0.01644737],
         [-0.00647249],
         ...,
         [-0.00949367],
         [-0.01597444],
         [-0.03896104]],
 
        [[ 0.        ],
         [ 0.01180438],
         [-0.00333333],
         ...,
         [-0.00668896],
         [ 0.003367  ],
         [-0.0385906 ]]]), 'is_traded':