In [9]:
# import packages
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss
from bt_classes import my_backtest, test_indicator
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
#importing required libraries
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, CuDNNLSTM
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from tqdm import tqdm
from sa import *
from utils import *
# follow the literature
# we don't use min-max scaling here, use partial mean-std scaling instead
from sklearn.preprocessing import StandardScaler
from itertools import chain
rcParams['figure.figsize'] = 20,10
# df = pd.read_csv('../res/input0130.csv')

orig_df = pd.read_csv('../xau_1d_20y.csv')
orig_df['datetime'] = pd.to_datetime(orig_df['date'])
orig_df = orig_df.set_index('datetime')

df = orig_df.copy()
df['log_r'] = np.log(df['close']) - np.log(df['open'])
df['label'] = np.sign(df['log_r'].shift(-1))
df['label'][df['label']==-1] = 0
df['label'] = df['label'].fillna(0)


# Please select the last activation layer.
layer_names = ['lstm_2']

default_upper_bound = 2000
default_n_bucket = 1000
default_n_classes = 2
class Args(): #创建一个类
    def __init__(self): #定义初始化信息。
        self.is_classification = True
        self.save_path = ''
        self.d = 'lstm_r'
        self.num_classes = 2
        self.lsa = True
        self.dsa = True
        self.target = 'none'
        self.batch_size = 128
        self.var_threshold = 1e-5
        self.upper_bound = 2000
        self.n_bucket = 1000
        self.is_classification = True
args = Args()

def lstm_model(sample_len=240,para_a=42, para_b=17,drop1=0.05,drop2=0.02):
    model = Sequential()
    # model.add(LSTM(units=para_a, dropout=0.1, return_sequences=True, input_shape=(sample_len,1),activation='tanh'))# (25,15)-57, (42,17)-58
    # model.add(LSTM(units=para_b, dropout=0.08, activation='tanh'))
    model.add(CuDNNLSTM(units=para_a, return_sequences=True, input_shape=(sample_len,1)))# (25,15)-57, (42,17)-58
    model.add(Dropout(drop1))
    model.add(Activation('tanh'))
    model.add(CuDNNLSTM(units=para_b))
    model.add(Dropout(drop2))
    model.add(Activation('tanh'))
    # model.add(Dropout(0.08))# 加了之后同原先效果差不多，（应该一定程度上）可以防止过拟合
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [2]:
# this experiment is intended for trying to calculate the transition probability matrix rollingly.
# firstly let's define some useful functions
def get_transtitions(y_true, y_pred):
    '''
    To generate transition probability matrix with y_true, y_pred of an any period.
    '''
    y_output = y_pred
    y = y_true
    continues_suc = 0
    continues_fail = 0
    result = []
    maxx = 0
    for i in range(0,len(y_output)):
        if y_output[i] == y[i]:
            continues_suc+=1
            if continues_fail!=0:
                result.append(-continues_fail)
                if continues_fail > maxx:
                    maxx = continues_fail
                continues_fail = 0
        else:
            continues_fail+=1
            if continues_suc != 0:
                result.append(continues_suc)
                if continues_suc > maxx:
                    maxx = continues_suc
                continues_suc = 0

    length = maxx+1
    suc_result = [[0] * length for i in range(length)]
    fail_result = [[0]*length for i in range(length)]

    for i in range(len(result)-1):
        if result[i]>0:
            suc_result[result[i]][-result[i+1]]+=1
        else:
            fail_result[-result[i]][result[i+1]]+=1
    return suc_result, fail_result

def get_trans_prob(suc_result, fail_result, weighted=False):
    status_porb = {}
    if weighted:
        for i in range(len(suc_result)):
            fail = np.sum([j*suc_result[i][j] for j in range(len(suc_result[i]))])
            if i+1 < len(suc_result):
                success = np.sum([(j-i)*np.sum(suc_result[j]) for j in range(i+1,len(suc_result))])
                # success = np.sum(suc_result[i+1:])
            else: 
                success = 0
            status_porb[i] = success / (success + fail)

        for i in range(len(fail_result)):
            success = np.sum([j*fail_result[i][j] for j in range(len(fail_result[i]))])
            if i+1 < len(fail_result):
                fail = np.sum([(j-i)*np.sum(fail_result[j]) for j in range(i+1,len(fail_result))])
                # fail = np.sum(fail_result[i+1:])
            else: 
                fail = 0
            status_porb[-i] = success / (success + fail)
    else:
        for i in range(len(suc_result)):
            fail = np.sum(suc_result[i])
            if i+1 < len(suc_result):
                success = np.sum(suc_result[i+1:])
            else: 
                success = 0
            status_porb[i] = success / (success + fail)

        for i in range(len(fail_result)):
            success = np.sum(fail_result[i])
            if i+1 < len(fail_result):
                fail = np.sum(fail_result[i+1:])
            else: 
                fail = 0
            status_porb[-i] = success / (success + fail)
    return status_porb

def trans_prob(y_true, y_pred, weighted=False):
    suc_result, fail_result = get_transtitions(y_true, y_pred)
    return get_trans_prob(suc_result, fail_result, weighted)

def get_suc_num(test_df):
    test_df['win'] = -1
    test_df['win'].loc[test_df['y_true']==test_df['y_pred']] = 1
    test_df['suc_num'] = np.nan
    test_df['suc_num'].loc[test_df['win']!=test_df['win'].shift(1)] = 1
    test_df['suc_num'] = test_df['suc_num'].cumsum().fillna(method='ffill')
    test_df['suc_num'] = test_df.groupby('suc_num')['suc_num'].cumsum() / test_df['suc_num'] * test_df['win']
    return test_df['suc_num']

In [29]:
# reproduce training set
sample_len = 9
p1 = 192
p2 = 192
epochs = 30
batch_size = 200

train_begin = sample_len
train_end = train_begin + 1500
scaler = StandardScaler()
train_set = df[['log_r','label']][train_begin-sample_len:train_end].reset_index()
x_train, y_train = [], []
x_train_set = list(chain.from_iterable(scaler.fit_transform(train_set['log_r'].values.reshape(-1,1))))
for i in range(sample_len,len(x_train_set)):
    x_train.append(x_train_set[i-sample_len:i])
    y_train.append(train_set['label'][i])
x_train, y_train = np.array(x_train), np.array(y_train)
y_train = to_categorical(y_train,2)
x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1)) 

model = lstm_model(sample_len=sample_len,para_a=p1,para_b=p2)
model.fit(x_train,y_train,epochs=epochs, batch_size=batch_size, callbacks=[EarlyStopping(monitor='loss',patience=10)])
model.save(f'd{sample_len}-{p1}_{p2}_{epochs}_{batch_size}.h5')

# from keras.models import load_model
# model = load_model(f'd{sample_len}-{p1}_{p2}_{epochs}_{batch_size}.h5')  

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [43]:
## Main experiment 1:
# first get first window test set and rolling
prob_save = []
window = 100
starters = range(train_end,len(df)-window,window)
all_result = []
for test_begin in starters:
    test_end = test_begin + window

    x_test, y_test = [], []
    test_set = df[['log_r','label']][test_begin-sample_len:test_end].reset_index()
    test_df = df[test_begin:test_end].fillna(0)
    x_test_set = list(chain.from_iterable(scaler.transform(test_set['log_r'].values.reshape(-1,1))))
    for i in range(sample_len,len(x_test_set)):
        x_test.append(x_test_set[i-sample_len:i])
        y_test.append(test_set['label'][i-1])
    test_df['y_true'] = y_test
    x_test, y_test = np.array(x_test), np.array(y_test)
    x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1)) 
    y_test = to_categorical(y_test,2)

    y_pred = model.predict_classes(x_test)
    test_df['y_pred'] = y_pred
    y_true = test_df['y_true'].values
    if test_begin == train_end:
        ti = test_indicator(test_df)
        # last_prob = ti.get_status_win_rate()
        suc_fail,fail_suc = get_transtitions(y_true,y_pred)
        last_prob = get_trans_prob(suc_fail,fail_suc,True)
        print("Before adjusted:")
        ti.backtest()
        ti.status_prob = last_prob
        print("\nAfter adjusted:")
        ti.backtest(prob_adjusted=True)
        prob_save.append(last_prob)
        continue
    this_result = [test_begin]
    ti2 = test_indicator(test_df)
    print("\nBefore adjusted:")
    this_result += list(ti2.backtest())
    print("\nAfter adjusted:")
    suc_fail,fail_suc = get_transtitions(y_true,y_pred)
    ti2.status_prob, last_prob = last_prob, get_trans_prob(suc_fail,fail_suc,True)
    this_result+= list(ti2.backtest(prob_adjusted=True))
    all_result.append(this_result)
    prob_save.append(last_prob)

Before adjusted:
Accuracy: 0.4700
Starting Portfolio Value: 100000.00
Final Portfolio Value: 93810.00
Sharpe: -1.36
Max drawdown: 7.63%
Annual rate: -14.87%

After adjusted:
Adjusted accuracy: 0.6400
Starting Portfolio Value: 100000.00
Final Portfolio Value: 104846.00
Sharpe: 0.99
Max drawdown: 7.58%
Annual rate: 12.67%

Before adjusted:
Accuracy: 0.5000
Starting Portfolio Value: 100000.00
Final Portfolio Value: 89705.00
Sharpe: -1.52
Max drawdown: 17.60%
Annual rate: -23.95%

After adjusted:
Adjusted accuracy: 0.4300
Starting Portfolio Value: 100000.00
Final Portfolio Value: 102754.00
Sharpe: 0.49
Max drawdown: 6.27%
Annual rate: 7.09%

Before adjusted:
Accuracy: 0.6300
Starting Portfolio Value: 100000.00
Final Portfolio Value: 113170.00
Sharpe: 3.19
Max drawdown: 1.74%
Annual rate: 36.58%

After adjusted:
Adjusted accuracy: 0.4700
Starting Portfolio Value: 100000.00
Final Portfolio Value: 93999.00
Sharpe: -1.59
Max drawdown: 8.41%
Annual rate: -14.44%

Before adjusted:
Accuracy: 0.48

In [46]:
prob = pd.DataFrame(prob_save)
prob.std()

 0    0.000000
 1    0.140070
 2    0.169174
 3    0.276323
 4    0.392996
 5    0.437662
 6    0.490573
 7    0.527046
-1    0.143245
-2    0.180405
-3    0.248240
-4    0.353075
-5    0.450882
-6    0.489864
-7    0.456269
 8    0.547723
 9    0.000000
-8    0.500000
-9    0.000000
dtype: float64

In [47]:
prob

Unnamed: 0,0,1,2,3,4,5,6,7,-1,-2,-3,-4,-5,-6,-7,8,9,-8,-9
0,0.0,0.352941,0.333333,0.375,1.0,1.0,0.0,,0.5,0.285714,0.384615,0.666667,0.0,0.0,1.0,,,,
1,0.0,0.454545,0.590909,0.545455,0.4,0.0,,,0.456522,0.666667,0.3,0.166667,0.0,0.0,0.0,,,0.0,1.0
2,0.0,0.711538,0.685714,0.944444,0.785714,0.6,1.0,1.0,0.769231,0.785714,1.0,,,,,0.0,,,
3,0.0,0.392857,0.65,0.636364,0.666667,1.0,0.0,,0.446809,0.5,0.642857,0.6,0.0,1.0,,,,,
4,0.0,0.52,0.545455,0.454545,0.4,1.0,0.0,,0.722222,0.6,0.571429,0.0,0.0,1.0,,,,,
5,0.0,0.42,0.322581,0.75,0.6,1.0,1.0,0.0,0.384615,0.321429,0.4,0.111111,0.285714,0.0,0.5,,,1.0,
6,0.0,0.62,0.842105,0.190476,1.0,0.0,,,0.685185,0.529412,0.5,0.666667,1.0,,,,,,
7,0.0,0.538462,0.809524,0.529412,0.714286,1.0,0.2,0.0,0.62,0.55,0.333333,0.75,1.0,,,,,,
8,0.0,0.6,0.28125,0.625,1.0,0.25,0.0,,0.48,0.52,0.444444,1.0,,,,,,,
9,0.0,0.636364,0.451613,0.777778,0.222222,0.0,,,0.418605,0.607143,0.5,0.857143,1.0,,,,,,


In [31]:
a = pd.DataFrame(all_result,columns=['starts','pre-acc','sharpe','drawdown','annual','adj-acc','sharpe','drawdown','annual'])
a.mean()

starts      3359.000000
pre-acc        0.500278
sharpe        -0.244191
drawdown      12.689832
annual        -4.004730
adj-acc        0.488611
sharpe        -0.808469
drawdown      13.967464
annual       -10.961667
dtype: float64

In [48]:
## 滚动的测试：先把全部预测、连续对错状态都算出来，再遍历判断修改
test_len = 500
weighted = False
test_begin = train_end
test_end = test_begin + test_len

x_test, y_test = [], []
test_set = df[['log_r','label']][test_begin-sample_len:test_end].reset_index()
test_df = df[test_begin:test_end]
x_test_set = list(chain.from_iterable(scaler.transform(test_set['log_r'].values.reshape(-1,1))))
for i in range(sample_len,len(x_test_set)):
    x_test.append(x_test_set[i-sample_len:i])
    y_test.append(test_set['label'][i-1])
test_df['y_true'] = y_test
x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1)) 
y_test = to_categorical(y_test,2)

y_pred = model.predict_classes(x_test)
test_df['y_pred'] = y_pred

In [49]:
fit_window = 100
weighted = False

prob_save = []
# test_df = df[test_begin:test_end]
# test_df['y_true'] = y_true
# test_df['y_pred'] = y_pred
test_df['suc_num'] = get_suc_num(test_df)

win_rate = [1] * fit_window
for i in range(fit_window, len(test_df)):
    this_true = test_df['y_true'].iloc[i-fit_window:i]
    this_pred = test_df['y_pred'].iloc[i-fit_window:i]
    this_prob = trans_prob(this_true,this_pred,weighted)
    last_suc = test_df['suc_num'].iloc[i-1]
    if last_suc not in this_prob.keys():
        if last_suc > 0:
            this_win = 0
        else:
            this_win = 1
    else:
        this_win = this_prob[last_suc] 
    prob_save.append(this_prob)
    win_rate.append(this_win)
test_df['win_rate'] = win_rate
test_df['adjusted_pred'] = test_df['y_pred']
test_df['adjusted_pred'].loc[test_df['win_rate']<0.5] = 1 - test_df['adjusted_pred'].loc[test_df['win_rate']<0.5]

pre_acc = accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['y_pred'].iloc[fit_window:])
after_acc = accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['adjusted_pred'].iloc[fit_window:])
# test_df = orig_df[test_begin+fit_window:test_end]
# test_df['label'] = y_pred
# print(accuracy_score(y_true,y_pred))

# adjusted_df = orig_df[test_begin+fit_window:test_end]
# adjusted_df['label'] = test_df['adjusted_pred']
# print(accuracy_score(y_true,adjusted_df['label']))
print(f'Pre-adjustment accuracy: {pre_acc:.4f}')
test_df['label'] = test_df['y_pred'].shift(-1).fillna(0)
my_backtest(test_df.iloc[fit_window:])
print(f'Adjusted accuracy: {after_acc:.4f}')
test_df['label'] = test_df['adjusted_pred'].shift(-1).fillna(0)
my_backtest(test_df.iloc[fit_window:])

Pre-adjustment accuracy: 0.5425
Starting Portfolio Value: 100000.00
Final Portfolio Value: 109617.00
Sharpe: 0.46
Max drawdown: 17.60%
Annual rate: 5.96%
Adjusted accuracy: 0.4950
Starting Portfolio Value: 100000.00
Final Portfolio Value: 91781.00
Sharpe: -0.47
Max drawdown: 16.58%
Annual rate: -5.26%


(-0.4711286965132542, 16.581685692504227, -5.259809271875554)

In [54]:
a = pd.DataFrame(prob_save)
a.std(),a.mean()

( 0    0.000000
  1    0.095009
  2    0.103388
  3    0.169895
  4    0.179563
  5    0.384963
  6    0.484780
  7    0.083611
 -1    0.064702
 -2    0.107350
 -3    0.244922
 -4    0.319008
 -5    0.114201
 -6    0.501110
 -7    0.404424
  8    0.495712
  9    0.000000
 -8    0.105409
 -9    0.000000
 dtype: float64,  0    0.000000
  1    0.487604
  2    0.527010
  3    0.560913
  4    0.676250
  5    0.577083
  6    0.484405
  7    0.991573
 -1    0.558075
 -2    0.617284
 -3    0.611524
 -4    0.625183
 -5    0.013158
 -6    0.497778
 -7    0.203540
  8    0.497175
  9    0.000000
 -8    0.011111
 -9    1.000000
 dtype: float64)

In [37]:
test_df['log_profit'] = 2*(test_df['y_pred']-0.5)*test_df['log_r']
win_profit = test_df['log_profit'].loc[test_df['y_true']==test_df['y_pred']].mean()
lose_profit = test_df['log_profit'].loc[test_df['y_true']!=test_df['y_pred']].mean()
pre_wtl = abs(win_profit / lose_profit)

In [38]:
adj_win_profit = test_df['log_profit'].loc[test_df['y_true']==test_df['adjusted_pred']].mean()
adj_lose_profit = test_df['log_profit'].loc[test_df['y_true']!=test_df['adjusted_pred']].mean()
adj_wtl = abs(adj_win_profit / adj_lose_profit)

In [39]:
win_profit,lose_profit,pre_wtl,adj_win_profit,adj_lose_profit,adj_wtl

(0.009514747478374845,
 -0.009302221590743646,
 1.0228467883244876,
 0.0054130332345014775,
 -0.003959310210107463,
 1.3671657302029279)

In [58]:
## 浮动阈值法
## 先计算出全部输出概率值，然后挑取大者观察
test_len = 500
test_begin = train_end
test_end = test_begin + test_len

x_test, y_test = [], []
test_set = df[['log_r','label']][test_begin-sample_len:test_end].reset_index()
test_set = df[['log_r','label']][test_begin-sample_len:test_end].reset_index()
test_df = df[test_begin:test_end]
x_test_set = list(chain.from_iterable(scaler.transform(test_set['log_r'].values.reshape(-1,1))))
for i in range(sample_len,len(x_test_set)):
    x_test.append(x_test_set[i-sample_len:i])
    y_test.append(test_set['label'][i-1])
test_df['y_true'] = y_test
x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1)) 
y_test = to_categorical(y_test,2)

y_pred = model.predict_classes(x_test)
test_df['y_pred'] = y_pred
y_pred_prob = model.predict(x_test)

# test_df = orig_df[test_begin:test_end]
# test_df['y_true'] = y_true
# test_df['y_pred'] = y_pred
test_df['win'] = -1
test_df['win'].loc[test_df['y_true']==test_df['y_pred']] = 1
test_df['max_conf'] = y_pred_prob.max(axis=1)


In [59]:
fit_window = 100
trust_thres = [0.5] * fit_window
for i in range(fit_window, len(test_df)):
    this_df = test_df.iloc[i-fit_window:i]
    this_win = this_df['max_conf'].loc[this_df['win']==1].mean()
    this_lose = this_df['max_conf'].loc[this_df['win']==-1].mean()
    if this_win > this_lose:
        threshold = (this_win + this_lose) * 0.5
        trust_thres.append(threshold)
    else:
        trust_thres.append(1)

test_df['trust_thres'] = threshold
test_df['adjusted_pred'] = y_pred
test_df['adjusted_pred'].loc[test_df['max_conf'] < test_df['trust_thres']] = 1 - test_df['adjusted_pred'].loc[test_df['max_conf'] < test_df['trust_thres']]

# print(accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['y_pred'].iloc[fit_window:]))
# print(accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['adjusted_pred'].iloc[fit_window:]))

pre_acc = accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['y_pred'].iloc[fit_window:])
after_acc = accuracy_score(test_df['y_true'].iloc[fit_window:],test_df['adjusted_pred'].iloc[fit_window:])

print(f'Pre-adjustment accuracy: {pre_acc:.4f}')
test_df['label'] = test_df['y_pred'].shift(-1).fillna(0)
print(my_backtest(test_df.iloc[fit_window:]))
print(f'Adjusted accuracy: {after_acc:.4f}')
test_df['label'] = test_df['adjusted_pred'].shift(-1).fillna(0)
print(my_backtest(test_df.iloc[fit_window:]))

Pre-adjustment accuracy: 0.5425
Starting Portfolio Value: 100000.00
Final Portfolio Value: 109617.00
Sharpe: 0.46
Max drawdown: 17.60%
Annual rate: 5.96%
(0.46024613121331, 17.60142917587321, 5.955397383610872)
Adjusted accuracy: 0.5000
Starting Portfolio Value: 100000.00
Final Portfolio Value: 92843.00
Sharpe: -0.43
Max drawdown: 14.57%
Annual rate: -4.57%
(-0.42507666032998365, 14.56583715873399, -4.5706481633274665)


In [42]:
test_df['log_profit'] = 2*(test_df['y_pred']-0.5)*test_df['log_r']
win_profit = test_df['log_profit'].loc[test_df['y_true']==test_df['y_pred']].mean()
lose_profit = test_df['log_profit'].loc[test_df['y_true']!=test_df['y_pred']].mean()
pre_wtl = abs(win_profit / lose_profit)
adj_win_profit = test_df['log_profit'].loc[test_df['y_true']==test_df['adjusted_pred']].mean()
adj_lose_profit = test_df['log_profit'].loc[test_df['y_true']!=test_df['adjusted_pred']].mean()
adj_wtl = abs(adj_win_profit / adj_lose_profit)
win_profit,lose_profit,pre_wtl,adj_win_profit,adj_lose_profit,adj_wtl

(0.008513103102104914,
 -0.008501713721241252,
 1.0013396570664579,
 0.0002507606129606986,
 -0.0002651531370280987,
 0.9457199555369586)