In [2]:
import tushare as ts
import pandas as pd
import numpy as np

## get raw data from tushare

In [3]:
# NOTE: "***" is a redacted placeholder for the tushare token. Supply the real
# token at run time instead of committing it to the notebook.
pro=ts.pro_api("***") # tushare key
# Basic info for every stock with list_status='L'; only ts_code is used below.
data=pro.query('stock_basic', exchange='', list_status='L', fields='ts_code,market,exchange,industry,list_date,list_status,delist_date')
ts_code_list=data["ts_code"].tolist()

# Columns requested from the daily endpoint for every stock.
daily_fields=[
    "ts_code",
    "trade_date",
    "open",
    "high",
    "low",
    "close",
    "pre_close",
    "change",
    "pct_chg",
    "vol",
    "amount"
]

# Download one frame per stock and concatenate once at the end. Calling
# pd.concat inside the loop re-copies everything fetched so far on every
# iteration, which is quadratic in the number of stocks.
daily_frames=[]
for ts_code in ts_code_list:
    new_df = pro.daily(**{
        "ts_code": ts_code,
        "trade_date": "",
        "start_date": 20100101,
        "end_date": 20201231,
        "offset": "",
        "limit": ""
    }, fields=daily_fields)
    daily_frames.append(new_df)

# Same result as before for an empty stock list: df stays None.
df=pd.concat(daily_frames) if daily_frames else None

In [4]:
# Order rows by stock code and then by trade date (both ascending).
# reset_index() is called without drop, so the previous index is kept as a
# regular column.
df_daily=(
    df
    .sort_values(by=["ts_code","trade_date"],ascending=True)
    .reset_index()
)
df_daily.head()

In [6]:
# get return
# Target for each row: close/pre_close-1 taken from the row two positions ahead
# within the same ts_code (rows are sorted by trade_date inside each stock).
# Only the two needed columns are shifted and the ratio is vectorised; a
# row-wise apply over the whole shifted frame produces the same numbers but
# runs one Python call per row.
future_px=df_daily.groupby("ts_code")[["close","pre_close"]].shift(-2)
df_daily["return"]=future_px["close"]/future_px["pre_close"]-1

# Blank only the target when the move is implausibly large. Writing NaN into
# the whole row would also erase ts_code; groupby("ts_code") drops NaN keys, so
# the later positional `.values` assignments would no longer match df_daily's
# length. Rows with a NaN target are still removed by the later dropna.
df_daily.loc[df_daily["return"].abs()>0.95,"return"]=np.nan

## Feature engineering (FE): manual features

In [11]:
## price
price_cols=["open","close","high","low"]

# 5-row simple moving average per stock. `.values` assigns by position, so this
# relies on df_daily already being ordered by ts_code, matching the group order
# of the rolling result.
df_daily[["sma_"+col for col in price_cols]]=\
df_daily.groupby("ts_code")[price_cols].rolling(5).mean().values

for col in price_cols:
    # Pick the column before shifting: shifting the whole grouped frame and then
    # indexing one column yields the same Series but shifts every column.
    by_stock=df_daily.groupby("ts_code")[col]

    # Linearly weighted moving average over lags 0..4 with weights 5..1 (sum 15).
    # skipna=False leaves the result NaN until all five lags are available.
    weighted=pd.concat([by_stock.shift(lag)*(5-lag)/15 for lag in range(5)],axis=1)
    df_daily["wma_"+col]=weighted.sum(axis=1,skipna=False)

    # Relative change versus the previous row of the same stock, plus lags 1..4
    # of that change and their 5-value mean.
    df_daily["diff_"+col+"_s0"]=df_daily[col]/by_stock.shift(1)-1
    diff_by_stock=df_daily.groupby("ts_code")["diff_"+col+"_s0"]
    for lag in range(1,5):
        df_daily["diff_"+col+"_s"+str(lag)]=diff_by_stock.shift(lag)
    diff_cols=["diff_"+col+"_s"+str(lag) for lag in range(5)]
    df_daily["sma_diff_"+col]=df_daily[diff_cols].mean(axis=1,skipna=False)

    # Current value relative to its own simple / weighted moving average.
    df_daily["norm_sma_"+col]=df_daily[col]/df_daily["sma_"+col]-1
    df_daily["norm_wma_"+col]=df_daily[col]/df_daily["wma_"+col]-1

In [12]:
## change pct_chg
change_cols=["change","pct_chg"]

# 5-row simple moving average per stock. `.values` assigns by position, so this
# relies on df_daily already being ordered by ts_code.
df_daily[["sma_"+col for col in change_cols]]=\
df_daily.groupby("ts_code")[change_cols].rolling(5).mean().values

for col in change_cols:
    # Select the column first so each shift touches one column instead of the
    # whole frame.
    by_stock=df_daily.groupby("ts_code")[col]

    # Linearly weighted moving average over lags 0..4 with weights 5..1 (sum 15).
    # skipna=False leaves the result NaN until all five lags are available.
    weighted=pd.concat([by_stock.shift(lag)*(5-lag)/15 for lag in range(5)],axis=1)
    df_daily["wma_"+col]=weighted.sum(axis=1,skipna=False)

In [13]:
## vol, amount
# The same three features are built for vol and for amount:
#   log_<col>          natural log of the raw value
#   diff_log_<col>     log value divided by the previous row's log value, minus 1
#   sma_diff_log_<col> 5-row per-stock mean of diff_log_<col>
# NOTE(review): diff_log_* is a ratio of logs, not a difference of logs —
# confirm this is the intended definition.
for col in ["vol","amount"]:
    log_col="log_"+col
    diff_col="diff_log_"+col

    df_daily[log_col]=np.log(df_daily[col])
    # Shift only the log column within each stock rather than the whole frame.
    prev_log=df_daily.groupby("ts_code")[log_col].shift(1)
    df_daily[diff_col]=df_daily[log_col]/prev_log-1
    # `.values` assigns by position; relies on df_daily being ordered by ts_code.
    df_daily["sma_"+diff_col]=df_daily.groupby("ts_code")[diff_col].rolling(5).mean().values

In [16]:
## ctc
# Close-to-close return of the current row. It is kept as a standalone Series
# because only its lagged / averaged forms end up as columns of df_daily.
ctc=df_daily["close"]/df_daily["pre_close"]-1
ctc_by_stock=ctc.groupby(df_daily["ts_code"])

# Unweighted lags 0..4 of ctc within each stock (ctc_s0 is the current value).
for lag in range(5):
    df_daily["ctc_s"+str(lag)]=ctc_by_stock.shift(lag)

# Linearly weighted moving average over the five lags, weights 5..1 (sum 15).
# skipna=False leaves the result NaN until all five lags are available.
weighted_ctc=pd.concat(
    [df_daily["ctc_s"+str(lag)]*(5-lag)/15 for lag in range(5)],axis=1
)
df_daily["wma_ctc"]=weighted_ctc.sum(axis=1,skipna=False)

# 5-row simple moving average per stock. `.values` assigns by position, so this
# relies on df_daily already being ordered by ts_code.
df_daily["sma_ctc"]=df_daily.groupby("ts_code")["ctc_s0"].rolling(5).mean().values

In [4]:
# Feature columns, in the exact order used to build the model matrices.
feature_list=["change","sma_change","wma_change","pct_chg","sma_pct_chg","wma_pct_chg"]
for price_col in ("open","close","high","low"):
    # Five lagged relative changes, then the three averaged / normalised columns.
    feature_list+=[f"diff_{price_col}_s{lag}" for lag in range(5)]
    feature_list+=[prefix+price_col for prefix in ("sma_diff_","norm_sma_","norm_wma_")]
feature_list+=["diff_log_vol","sma_diff_log_vol","diff_log_amount","sma_diff_log_amount"]

# Keep only rows with no missing value in any column, then take an independent
# copy of the feature columns.
df_daily_clean=df_daily.dropna()
df_daily_feature=df_daily_clean.loc[:,feature_list].copy()

## feature normalization: clip

In [None]:
# Symmetric clipping: for each column the bound t is the mean of the absolute
# lower and upper quantiles, and values are clipped to [-t, t].
# The bound is computed as a plain scalar on the column Series. Passing
# one-element arrays to DataFrame.clip without an axis depends on how the
# installed pandas aligns array-like bounds. print(t) therefore shows a scalar
# rather than a one-element array.
price_clip_cols=[]
for col_part1 in ["open","close","high","low"]:
    price_clip_cols+=["diff_"+col_part1+"_"+col_part2 for col_part2 in ["s0","s1","s2","s3","s4"]]
    price_clip_cols+=[col_part3+col_part1 for col_part3 in ["sma_diff_","norm_sma_","norm_wma_"]]

clip_groups=[
    ## change pct_chg
    (["change","sma_change","wma_change","pct_chg","sma_pct_chg","wma_pct_chg"],[.05,.95]),
    ## price
    (price_clip_cols,[.01,.99]),
    ## vol,amount
    (["diff_log_vol","sma_diff_log_vol","diff_log_amount","sma_diff_log_amount"],[.01,.99]),
]

for clip_cols,clip_quantiles in clip_groups:
    for col in clip_cols:
        t=df_daily_feature[col].quantile(clip_quantiles).abs().mean()
        print(t)
        df_daily_feature[col]=df_daily_feature[col].clip(-t,t)

## feature normalization: zscore 

In [12]:
# Z-score every column (population std, ddof=0) except change & pct_chg and
# their moving averages, which keep their clipped values.
raw_cols=["change","sma_change","wma_change","pct_chg","sma_pct_chg","wma_pct_chg"]
df_tmp=df_daily_feature[raw_cols].copy()

col_mean=df_daily_feature.mean()
col_std=df_daily_feature.std(ddof=0)
df_daily_feature=(df_daily_feature-col_mean)/col_std

# Put the untouched columns back over their standardised versions.
df_daily_feature[raw_cols]=df_tmp

In [13]:
# Display summary statistics of the feature frame (inspection only; nothing is assigned).
df_daily_feature.describe()

Unnamed: 0,change,sma_change,wma_change,pct_chg,sma_pct_chg,wma_pct_chg,diff_open_s0,diff_open_s1,diff_open_s2,diff_open_s3,...,diff_low_s2,diff_low_s3,diff_low_s4,sma_diff_low,norm_sma_low,norm_wma_low,diff_log_vol,sma_diff_log_vol,diff_log_amount,sma_diff_log_amount
count,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,...,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0,4047079.0
mean,0.001994164,0.001774752,0.001586605,0.02441761,0.03291832,0.03220003,2.3998560000000002e-17,2.287668e-17,3.369833e-17,3.82741e-18,...,1.455952e-17,-1.325548e-17,1.598778e-18,2.853879e-18,-1.1512080000000001e-17,-1.385242e-18,1.400242e-17,-1.1735930000000002e-17,2.0660550000000002e-17,1.2167830000000002e-17
std,0.2848042,0.1329774,0.1449582,2.2304,1.039218,1.129101,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.635,-0.293,-0.3206667,-4.49,-2.046661,-2.244333,-3.313361,-3.312844,-3.313065,-3.31269,...,-3.280353,-3.280249,-3.28005,-3.162946,-3.176289,-3.206833,-2.716579,-2.751629,-2.717637,-2.759546
25%,-0.12,-0.058,-0.06333333,-1.3333,-0.63,-0.6713333,-0.4972795,-0.4975681,-0.4975074,-0.4976044,...,-0.4407705,-0.4407205,-0.4409146,-0.5163607,-0.483652,-0.4657443,-0.6752563,-0.6611769,-0.6759028,-0.6610347
50%,0.0,2.284284e-15,6.938894e-18,0.0,0.03,0.02534,-0.01250608,-0.01230546,-0.01234787,-0.01223466,...,0.03810602,0.03814754,0.03818125,0.02044898,0.02563847,0.03256079,-0.09293069,-0.0598908,-0.09549179,-0.06374113
75%,0.12,0.062,0.066,1.3557,0.694,0.7413333,0.4835162,0.4834947,0.4835118,0.4836749,...,0.4729201,0.4729028,0.4730134,0.5307077,0.5273723,0.5212694,0.5812313,0.5980222,0.5791432,0.5947486
max,0.635,0.293,0.3206667,4.49,2.046661,2.244333,3.288349,3.288233,3.288369,3.288221,...,3.268422,3.268134,3.267953,3.116776,3.173908,3.213383,2.703455,2.618152,2.705767,2.631336


In [14]:
# Columns copied over from df_daily_clean after normalisation: the five ctc
# lags and their two averages, followed by ts_code, trade_date and return.
added_col=[f"ctc_s{lag}" for lag in range(5)]
added_col+=["wma_ctc","sma_ctc"]
added_col+=["ts_code","trade_date","return"]

df_daily_feature[added_col]=df_daily_clean.loc[:,added_col]

## Feature engineering (FE): gplearn

In [21]:
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer
from scipy.stats import pearsonr

In [378]:
# Split on trade_date: values below 20200000 go to train, values above go to test.
# The cast lets the comparison work whether trade_date holds strings or
# integers (presumably YYYYMMDD — confirm against the loaded data).
trade_date_num=df_daily_feature["trade_date"].astype(int)
df_daily_feature_train=df_daily_feature[trade_date_num<20200000]
df_daily_feature_test=df_daily_feature[trade_date_num>20200000]

# df_daily_return is not defined in any cell shown in this notebook. The target
# already lives in df_daily_feature, so the return frames are taken from the
# same split, which also keeps X and y row-aligned.
df_daily_return_train=df_daily_feature_train[["trade_date","return"]]
df_daily_return_test=df_daily_feature_test[["trade_date","return"]]

# Drop the identifier and target columns by name. A positional iloc[:,:-2]
# would leave the string column ts_code inside the feature matrix.
non_feature_cols=["ts_code","trade_date","return"]

X_train=np.array(df_daily_feature_train.drop(columns=non_feature_cols))
y_train=np.array(df_daily_return_train.loc[:,"return"])

X_test=np.array(df_daily_feature_test.drop(columns=non_feature_cols))
y_test=np.array(df_daily_return_test.loc[:,"return"])

In [52]:
# Operators the symbolic transformer may combine when building expressions.
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']

# Settings gathered in one place so they can be read and tuned together.
gp_params = dict(
    generations=3,
    population_size=1000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=10,
)
gp = SymbolicTransformer(**gp_params)

gp.fit(X_train,y_train)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    11.96        0.0118126        7        0.0518513        0.0470059     40.35m
   1     4.28        0.0280321        7        0.0518283        0.0472717     27.03m
   2     3.48        0.0345581        6        0.0618487        0.0639176     24.35m
   3     5.92        0.0297654        6        0.0622575        0.0602434     24.66m
   4     6.64        0.0316171        7        0.0642224        0.0641897     23.80m
   5     6.88        0.0430464       17        0.0714363        0.0728204     22.96m
   6     8.07        0.0421087       19        0.0743223        0.0707659     22.56m
   7     9.83        0.0420344       21        0.0771666        0.0753203     21.92m
   8    13.39        0.0465328       22         0.087711        0.0883511  

SymbolicTransformer(function_set=['add', 'sub', 'mul', 'div', 'log', 'sqrt',
                                  'abs', 'neg', 'max', 'min'],
                    generations=25, max_samples=0.9, n_components=3, n_jobs=80,
                    parsimony_coefficient=0.0005, random_state=0, verbose=1)

In [47]:
print(gp)
X_new=gp.transform(X_train)
# Pearson correlation between each generated component (one column of X_new)
# and the training target.
for i,component in enumerate(X_new.T):
    corr=pearsonr(component,y_train)[0]
    print(f"{i} {corr}")

[mul(mul(X36, X18), add(X32, X47)),
 sqrt(neg(sub(X21, X18))),
 sub(add(max(sqrt(sub(X21, X14)), log(abs(X1))), add(neg(min(X43, X4)), sub(log(X15), neg(X29)))), neg(abs(add(sub(X13, X22), add(-0.356, X6))))),
 log(min(X30, X10)),
 abs(X7),
 mul(X14, X46),
 sqrt(add(neg(X28), add(X16, X10))),
 mul(X8, X46),
 neg(mul(min(sub(sub(X43, X48), min(X35, X41)), sub(sub(X13, X8), mul(X10, X7))), min(div(log(X3), log(X45)), abs(max(X38, X44))))),
 max(X7, X40)]
0 -0.051369363595478
1 0.04368087215610385
2 0.03857874088585454
3 0.03783958612024296
4 0.03444672759015915
5 -0.032523298094035015
6 0.03258299984355576
7 0.031448304368178547
8 -0.02886343491306072
9 0.02796623738151545


In [65]:
# Correlation matrix between the first three transformed components and the target.
# NOTE(review): gp0 and X are not defined in any cell visible here — presumably
# left over from an earlier session; confirm, or replace with gp / X_train.
df_f=pd.DataFrame(gp0.transform(X)[:,:3])
# NOTE(review): this assignment aligns on index labels. df_f presumably has a
# default 0..n-1 index, so verify that df_daily_feature's index matches the row
# order of X; otherwise "return" is paired with the wrong rows.
df_f["return"]=df_daily_feature["return"]
df_f.corr()

Unnamed: 0,0,1,2,3
0,1.0,-0.024023,-0.102959,-0.546287
1,-0.024023,1.0,0.62978,0.422797
2,-0.102959,0.62978,1.0,0.598505
3,-0.546287,0.422797,0.598505,1.0
