In [1]:
%%time
from atrader import *
import pandas as pd
import numpy as np

code = get_code_list_set('SZAG','2005-01-01','2019-05-31')
data = pd.read_csv('../data/szag_050101_190531.csv')
data = data.merge(code[['code','name']], on='code')

Wall time: 5.05 s


### preprocess

In [2]:
def get_asset_code(df,name_col,thresh_hold):
    '''
    get assets that have trade records more than thresh_hold
    '''
    count_trade = df.groupby('name',as_index=False).agg({'close':'count'})
    count_trade = count_trade[count_trade.close>thresh_hold]
    count_trade['is_ST_or_S'] = count_trade['name'].apply(lambda x: 1 \
                                                          if x.startswith('*') or x.startswith('S') else 0)
    count_trade = count_trade[count_trade.is_ST_or_S==0]
    return count_trade

def get_valid_asset(df,asset_code,last_date):
    '''
    get assets whose last trade time are later than last_date
    '''
    df = df[df.name.isin(asset_code)]
    df = df.sort_values(['time'])
    last_trade_time = df.groupby(['name'],as_index=False)['time'].last()
    last_trade_time['time'] = pd.to_datetime(last_trade_time['time'], format='%Y-%m-%d')
    last_trade_time = last_trade_time[last_trade_time.time>=pd.to_datetime(last_date)]
    return last_trade_time

In [3]:
data['time'] = pd.to_datetime(data['time'])
data = data.sort_values(['name','time'])
asset_code = get_asset_code(data,'name',3000)
valid_asset = get_valid_asset(data, asset_code.name.unique(), '2019-05-28 15:00:00')
data = data.merge(valid_asset[['name']], on='name')
data.head()

Unnamed: 0,time,code,open,high,low,close,volume,amount,open_interest,name
0,2005-01-04 15:00:00,sse.600742,1.12844,1.13348,1.10829,1.11333,3056.0,1357672.0,,一汽富维
1,2005-01-05 15:00:00,sse.600742,1.11584,1.14355,1.10073,1.13348,2400.0,1071997.0,,一汽富维
2,2005-01-06 15:00:00,sse.600742,1.13348,1.13851,1.11081,1.11584,2027.0,900922.0,,一汽富维
3,2005-01-07 15:00:00,sse.600742,1.11333,1.14859,1.11333,1.13096,2304.0,1036593.0,,一汽富维
4,2005-01-10 15:00:00,sse.600742,1.12844,1.14607,1.12592,1.14355,2695.0,1218918.0,,一汽富维


### feature engineering

In [4]:
from itertools import product

assets = data.name.unique()
dates = np.sort(data.time.unique())
num_cols = ['close','high','low','open']
cat_cols = ['is_traded']

def get_beta(df):
    pass

def create_confidence(x):
    '''
    idea: 
    '''
    if x>0.05:
        return 0
    elif x<-0.05:
        return 0
    elif x<0:
        return -1
    elif x>0:
        return 1
    return x

def get_label(df,l,mkt_res=False,tar_col='close'):
    '''
    the confidence of the future l days' return 
    '''
    res = df.copy()
    future_df = df[['name','time',tar_col]]
    
    if mkt_res:
        pass
    
    future_df['fut_'+str(l)+'_'+tar_col] =  future_df.groupby(['name'])['close'].shift(-5)
    res = res.merge(future_df[[x for x in future_df.columns if x not in [tar_col]]],on=['name','time'],how='inner')
    res['return_fut'] = (res['fut_'+str(l)+'_'+tar_col]-res['close'])/res['close']
    res['confidence'] = res['return_fut'].apply(lambda x: create_confidence(x))
    
    return res


def pad_df(df,assets=assets,dates=dates):
    data = df.copy()
    frames = pd.DataFrame(list(product(assets, dates)), columns=['name','time'])
    train_df = frames.merge(data[['name','time']+num_cols+cat_cols], on=['name','time'], how='left')
    train_df = train_df.sort_values(['name','time'])
    train_df[[x for x in num_cols if x not in ['return_1d']]] = train_df[[x for x in num_cols if x not in ['return_1d']]].fillna(method='ffill')
    train_df[cat_cols] = train_df[cat_cols].fillna(0)
    return train_df

def get_lag_feat(df,gb_c,feat,l):
    tmp = df.groupby(gb_c)[feat].shift(l)
    tmp.columns = ['lag_'+str(l)+'_'+x for x in tmp.columns]
    res = df.copy()
    res = pd.concat([res,tmp],axis=1)
    return res

def get_close_hist_return(df,l):
    res = df.copy()
    res['close_hist_'+str(l)+'_return'] = (res['close']-res['lag_'+str(l)+'_close'])/res['lag_'+str(l)+'_close']
    return res

# get traded
data['is_traded'] = 1

# pad df
train_df = pad_df(data)

# get lag feat
lags = [5, 10]
for l in lags:
    train_df = get_lag_feat(train_df,['name'],cat_cols+num_cols,l)
    
# get close hist feat
for l in lags:
    train_df = get_close_hist_return(train_df,l)
    
# get label
ret_l = 10
train_df = get_label(train_df,ret_l)

# drop na
train_df = train_df.dropna(axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
pd.options.display.max_columns=999
train_df[train_df.time>=pd.to_datetime(dates[0])+pd.offsets.BusinessDay(max(lags))].tail()

Unnamed: 0,name,time,close,high,low,open,is_traded,lag_5_is_traded,lag_5_close,lag_5_high,lag_5_low,lag_5_open,lag_10_is_traded,lag_10_close,lag_10_high,lag_10_low,lag_10_open,close_hist_5_return,close_hist_10_return,fut_10_close,return_fut,confidence
2454191,龙溪股份,2019-05-20 15:00:00,10.89,11.09,10.71,11.01,1.0,1.0,10.03,10.15,9.83,9.95,1.0,8.8,9.49,8.7,9.34,0.08574,0.2375,11.27,0.03489,1.0
2454192,龙溪股份,2019-05-21 15:00:00,10.8,11.05,10.71,10.8,1.0,1.0,9.98,10.12,9.8,9.9,1.0,9.4,9.42,8.97,8.97,0.08216,0.14894,11.85,0.09722,0.0
2454193,龙溪股份,2019-05-22 15:00:00,11.2,11.35,10.71,10.71,1.0,1.0,10.09,10.26,10.01,10.14,1.0,9.48,9.79,9.1,9.23,0.11001,0.18143,11.95,0.06696,0.0
2454194,龙溪股份,2019-05-23 15:00:00,11.03,11.33,10.89,11.11,1.0,1.0,11.0,11.1,10.09,10.14,1.0,9.53,9.76,9.41,9.41,0.00273,0.1574,11.75,0.06528,0.0
2454195,龙溪股份,2019-05-24 15:00:00,11.03,11.13,10.83,11.03,1.0,1.0,10.95,11.49,10.81,11.01,1.0,9.96,10.0,9.42,9.54,0.00731,0.10743,11.88,0.07706,0.0


In [6]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder().fit(train_df['name'])
train_df['name'] = enc.transform(train_df['name'])

### build model

In [7]:
from keras.layers import  *
from keras.callbacks import *
from keras.models import *

# data generation param
cat_cols = ['name']+[x for x in train_df.columns if x.endswith('is_traded')]
cat_dim = {x:len(train_df[x].unique()) for x in cat_cols}
num_cols = [x for x in train_df.columns if x not in ['time','return_fut','confidence','fut_10_close']+cat_cols]


# Hyper param
INP_DIM = len(num_cols)
EMB_DIM = 10
OUT_DIM = 1
BS = len(assets)

# train test split
train_dates = dates[2999]
val_dates = dates[3000]

def build_model():
    
    # Input
    num_in = Input(batch_shape=(BS,INP_DIM),name='num')
    cat_in = [Input(batch_shape=(BS,1),name=x) for x in cat_cols]
    cat_emb = []
    for i,col in enumerate(cat_cols):
        cat_emb.append(Embedding(cat_dim[col], EMB_DIM)(cat_in[i]))
        
    # hidden   
    cat_logit = Concatenate()([Flatten()(x) for x in cat_emb])
    cat_logit = Dense(INP_DIM,activation='relu')(cat_logit)
    cat_logit = Dropout(0.5)(cat_logit)
    X_in = Concatenate()([num_in,cat_logit])
    X_logit = Dense(4*INP_DIM, activation='relu')(X_in)
    X_logit = Dropout(0.5)(X_logit)
    X_logit = Dense(int(INP_DIM),activation='relu')(X_logit)
    X_logit = Dropout(0.5)(X_logit)
    
    # output
    final_out = Dense(OUT_DIM,activation='tanh')(X_logit)
    
    model = Model(inputs=[num_in]+cat_in, outputs=final_out)
    #model.summary()
    model.compile(loss='mse',optimizer='adam')
    return model


Using TensorFlow backend.


### Train

In [9]:
train = train_df[(train_df.time>=pd.to_datetime(dates[0])+pd.offsets.BusinessDay(max(lags)))&(train_df.time<train_dates)]
val = train_df[train_df.time>=val_dates]

X_train = train[cat_cols+num_cols+['time','confidence']]
X_val = val[cat_cols+num_cols+['time','confidence']]

es = EarlyStopping(monitor='val_loss',patience=2)
red_lr = ReduceLROnPlateau(monitor='val_loss',min_lr=0.0001,patience=1)
mc = ModelCheckpoint('model/NN_baseline_{epoch:04d}_{val_loss:.4f}.h5',save_best_only=True)
model = build_model()

def data_generator(df,cat_cols=cat_cols,num_cols=num_cols):
    dates = df.sort_values(['name','time'])['time'].unique()
    count = 0
    while True:
        for i,date in enumerate(dates):
            X = {'num':df[df.time==date][num_cols].values}
            
            for cat in cat_cols:
                X[cat] = df[df.time==date][cat].values.reshape(-1,1)
                
            y = df[df.time==date]['confidence'].values
            
            count += 1
            if count == 10:
                yield (X,y,np.array([i+1]*BS)/len(dates))
                count = 0
                
val_X = {'num':X_val[num_cols].values}
for cat in cat_cols:
    val_X[cat] = X_val[cat].values.reshape(-1)
val_y = X_val['confidence'].values


generator = data_generator(X_train)
# model.fit_generator(generator,steps_per_epoch=300,epochs=100,
#                     validation_data=(val_X,val_y),callbacks=[es,red_lr,mc],workers=0,max_queue_size=30)
model = load_model('model/NN_baseline_0002_0.6910.h5')

In [11]:
val.time.max()

Timestamp('2019-05-24 15:00:00')

In [12]:
val.time.min()

Timestamp('2017-05-12 15:00:00')

### test baseline"

In [15]:
feat = ['open','high','low','close']
train[feat+['confidence']].head()

Unnamed: 0,open,high,low,close,confidence
10,1.09821,1.13348,1.09821,1.12088,1.0
11,1.11836,1.12592,1.11081,1.1234,1.0
12,1.11584,1.12088,1.09317,1.11081,1.0
13,1.10073,1.15363,1.08814,1.13599,-1.0
14,1.15866,1.16874,1.14103,1.15111,0.0


In [24]:
inp = Input(shape=(len(feat),))
h = Dense(10,activation='relu')(inp)
h = Dropout(0.5)(h)
out = Dense(1,activation='tanh')(h)
m = Model(inp,out)
m.compile(loss='mse',optimizer='adam')

mc = ModelCheckpoint('model/test_baseline.h5',save_best_only=True)
m.fit(train[feat],train['confidence'],validation_data=(val[feat],val['confidence']),batch_size=1000,epochs=10,callbacks=[es,mc])

Train on 2095289 samples, validate on 347696 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x20ddba77400>