In [1]:
import pandas as pd

from copy import deepcopy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler

import numpy as np
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.regularizers import l1, l2

import tensorflow as tf
from tensorflow.keras import backend as K

import yfinance as yf

from sklearn import tree
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")


In [2]:
def construct_features_single_asset(df,k,h, linear = False, test = False):
    df = df.dropna(how='any',axis=0) 
    df['Cummulative Return'] = (1+ df['Return Daily']).cumprod(axis = 0)
    df['Next H Return'] = df['Cummulative Return'].pct_change(h).shift(-h)
    df['Mean H Return'] = df["Return Daily"].rolling(h).apply(lambda x: x.iloc[range(0,h)].mean()).shift(-h + 1)
    df['Square Sum Return'] = df["Return Daily"].rolling(h).apply(lambda x: x.iloc[range(0,h)].pow(2).sum()).shift(-h + 1)
    df['STD H Return'] = df["Return Daily"].rolling(h).apply(lambda x: x.iloc[range(0,h)].std(ddof = 1)).shift(-h + 1)

    for temp in range(k,0,-1):
        df["Before " + str(temp) + " Day" ] = df['Return Daily'].shift(periods = int(temp))

    if linear == True:
        df['Signal'] = [1 if x > 0 else -1 for x in df['Mean H Return']]
    
    if test == True:
        df = df.dropna(how='any',axis=0)
        df = df[1:]
        temp = pd.DataFrame(columns= df.columns)
        n = 0
        while True:
            try:
                temp = pd.concat([temp,df.iloc[[n*h],:]], axis = 0)
                n = n+1
            except: break
        df = temp
    else:
        df = df.dropna(how='any',axis=0)
        df = df[1:]

    return df

In [3]:
def feature_engineering(data,k,h,linear = False):
    company = data.columns
    features = []
    for i in range(k,0,-1):
        features.append("Before " + str(i) + " Day")
    features.append("Return Daily")

    X_train = pd.DataFrame(columns=features)
    if linear == False:
        y_train = pd.DataFrame(columns=["Mean H Return","Square Sum Return"])
    elif linear == True:
        y_train = pd.DataFrame(columns=["Signal"])
    for oo in company:
        df = data[[oo]].copy()
        
        df.columns = ["Return Daily"]

        df = construct_features_single_asset(df,k,h,linear = linear)
        
        X_train = pd.concat([X_train,df[features]],axis = 0)
        if linear == False:
            y_train = pd.concat([y_train,df[["Mean H Return","Square Sum Return"]]],axis = 0)
        elif linear == True:
            y_train = pd.concat([y_train,df[["Signal"]]],axis = 0)
    return [X_train,y_train]

In [4]:
def sharpe_loss(h):
    def calculation(y_target_dummy, y_pred):

        mean = K.reshape(y_target_dummy[:, 0], (-1, 1))
        square_sum =  K.reshape(y_target_dummy[:, 1], (-1, 1))

        sum_pofolio = mean * h * y_pred
        mean_pofolio = K.mean(mean * h * y_pred) / h

        std_pofolio = tf.math.sqrt(K.mean(square_sum * y_pred **2 
                                          - 2 * sum_pofolio * y_pred * mean_pofolio 
                                          + mean_pofolio ** 2)/h)

    
        return  - (mean_pofolio / std_pofolio) *np.sqrt(252)
    
    return calculation

In [5]:
def train_decision_tree(data, k, h):

    model = tree.DecisionTreeClassifier()

    X_train,y_train = feature_engineering(data,k,h,linear = True)
    X_train = np.array(X_train, dtype=np.float64)
    y_train = np.array(y_train, dtype=np.float64)
    model= model.fit(X_train, y_train)
    
    return model

In [6]:
def train_xgboost(data, k, h):

    model = xgb.XGBRegressor(objective="multi:softmax", num_class = 2,random_state=42)

    X_train,y_train = feature_engineering(data,k,h,linear = True)
    X_train = np.array(X_train, dtype=np.float64)
    y_train = np.array(y_train, dtype=np.float64)
    y_train[y_train == -1] = 0
    model= model.fit(X_train, y_train)
    
    return model

In [None]:
def train_MLP_supervised(data, k, h):

    model = Sequential([
    Dropout(0, input_shape=(k+1,)),
    Dense(5,activation = 'tanh'),
    Dense(1,activation = 'sigmoid'),
    ])

    checkpoint_filepath = 'Test/Data/checkpoint.model.keras'
    model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    mode='max',
    save_best_only=True)

    model.compile(optimizer='adam', loss='binary_crossentropy')


    X_train,y_train = feature_engineering(data,k,h,linear = True)
    y_train[y_train == -1] = 0
    X_train = np.array(X_train, dtype=np.float64)
    y_train = np.array(y_train, dtype=np.float64)
    model.fit(X_train, y_train, validation_split=0.1,epochs=100, batch_size = 256, verbose=1,callbacks = [model_checkpoint_callback])
    
    return model

In [8]:
def train_lasso_supervised(data, k, h,lambda_val = .7):

    model = Sequential([
        Dense(1, input_shape = (k+1,),kernel_regularizer = l1(lambda_val),activation='sigmoid')
    ])

    checkpoint_filepath = 'Test/Data/checkpoint.model.keras'
    model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    mode='max',
    save_best_only=True)

    model.compile(optimizer='adam', loss='binary_crossentropy')


    X_train,y_train = feature_engineering(data,k,h,linear = True)
    y_train[y_train == -1] = 0
    X_train = np.array(X_train, dtype=np.float64)
    y_train = np.array(y_train, dtype=np.float64)
    model.fit(X_train, y_train,validation_split=0.1, epochs=100, batch_size = 256, verbose=1,callbacks = [model_checkpoint_callback])
    
    return model

In [9]:
def train_MLP_sharpeLoss(data, k, h):

    model = Sequential([
    Dropout(0, input_shape=(k+1,)),
    Dense(2,activation = 'tanh'),
    Dense(1,activation = 'tanh'),
    ])

    checkpoint_filepath = 'Test/Data/checkpoint.model.keras'
    model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    mode='max',
    save_best_only=True)

    model.compile(optimizer='adam', loss=sharpe_loss(h = h))


    X_train,y_train = feature_engineering(data,k,h,linear = False)
    
    model.fit(X_train, y_train, validation_split=0.1,epochs=100, batch_size = 256, verbose=1,callbacks = [model_checkpoint_callback])
    
    return model

In [10]:
def train_Lasso_sharpeLoss(data, k, h,lambda_val = .7):

    model = Sequential([
        Dense(1, input_shape = (k+1,),kernel_regularizer = l1(lambda_val),activation= 'tanh')
    ])

    checkpoint_filepath = 'Test/Data/checkpoint.model.keras'
    model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    mode='max',
    save_best_only=True)

    model.compile(optimizer='adam', loss=sharpe_loss(h = h))

    X_train,y_train = feature_engineering(data,k,h,linear = False)

    model.fit(X_train, y_train,validation_split=0.1, epochs=100, batch_size = 256, verbose=1,callbacks = [model_checkpoint_callback])
    
    return model

In [11]:
time_range = '6mo' # khoảng thời gian làm backtest 
start_time = '2024-02-01'
end_time = '2024-08-01'

def EU_Stock_data():
    """Lấy dữ liệu giá Close của 50 công ty trên sàn Euro_STOXX 50 vào thời gian cho trước"""

    stock_list = pd.read_html( 'https://en.wikipedia.org/wiki/EURO_STOXX_50')[4]['Ticker'][1:].to_list()
    futures = pd.DataFrame(columns= stock_list) # danh sách mã
    
    # đặt index 
    time_index = list(yf.Ticker(stock_list[0]).history(period = time_range,start = start_time, end = end_time).index) 

    # xét từng mã
    for symbol in stock_list:
        df = yf.Ticker(symbol).history(period = time_range, start = start_time, end = end_time)
        df = pd.DataFrame(df['Close']) # lấy giá close
        i = 0
        daily_return = []
        # tinh daily return, = 0 trong ngày đầu tiên 
        for k in df['Close']:
            if i != 0:
                daily_return.append(float((k-i)/i))
            else:
                daily_return.append(float(0))
            i = k
        try:
            futures[symbol] = daily_return
        except:
            while len(daily_return) < len(futures):
                daily_return.insert(0,np.nan)
            futures[symbol] = daily_return

    futures.index = time_index

    futures['Date'] = pd.to_datetime(futures.index, format='%Y-%m-%d')
    futures.set_index('Date', inplace=True)

    return futures

In [12]:
def test_model_TSMOM(data, model,k,h,linear = False):

    company = data.columns

    signal = pd.DataFrame(index = data.index, columns= company)

    features = []
    for i in range(k,0,-1):
        features.append("Before " + str(i) + " Day")
    features.append("Return Daily")

    for oo in company:
        df = data[[oo]].copy()
        
        df.columns = ["Return Daily"]
        df = construct_features_single_asset(df,k,h,linear = linear,test= True)

        
        X_test = df[features]
        try:
            if model.loss ==  'binary_crossentropy':
                X_test['prediction'] = np.sign(model.predict(X_test) - 0.5)
            else:
                X_test['prediction'] = np.sign(model.predict(X_test))
        except:
            X_test['prediction'] = np.sign(model.predict(X_test))
            X_test['prediction'][X_test['prediction'] == 0] = -1
        for x,v in enumerate(X_test.index):
            signal.loc[v,oo] = X_test.loc[v,'prediction']
        
        signal[oo] = signal[oo].ffill()
        signal[oo] = signal[oo].fillna(0)


    return signal

In [13]:
def Volatility_scale(data, ignore_na=False, adjust = True, com = 60, min_periods=0):
    """Scale data using ex ante volatility"""

    # Lưu trữ index, tức thời gian 
    std_index = data.index

    # chứa kết quả
    daily_index = pd.DataFrame(index=std_index)

    # xét từng cổ phiếu
    for oo in data.columns:
        returns = data[oo]  # Lấy ra các return
        returns.dropna(inplace=True)  # xử lý null bằng zero

        # Tính cumulative (cum) return , nhưng ko có thành phần - 1
        ret_index = (1 + returns).cumprod()

        # Tính daily volatility (vol)
        day_vol = returns.ewm(ignore_na=ignore_na,
                              adjust=adjust,
                              com=com,
                              min_periods=min_periods).std(bias=False)
        
        vol = day_vol * np.sqrt(252)  # scale lại theo 252 ngày active trading

        # Join cum return và vol
        ret_index = pd.concat([ret_index, vol], axis=1)
        ret_index.columns = [oo, oo + '_Vol']  # Đặt tên cột cum return là tên cổ phiếu, bên cạnh là vol 

        # Join 
        daily_index = pd.concat([daily_index, ret_index], axis=1)

    return daily_index


In [14]:
def classic_TSMOM(data, k, h, tolerance = 0,ignore_na = False, adjust = True, com = 60, min_periods = 0):
    signal = pd.DataFrame(index = data.index)

    # gọi hàm Volatility scale
    daily_index = Volatility_scale(data,ignore_na=ignore_na,
                          adjust=adjust,
                          com=com,   
                          min_periods = min_periods)

    company = data.columns

    for oo in company:
        flag_h = 0
        flag_k = k+1
        df = pd.concat([daily_index[oo], daily_index[oo+"_Vol"]], axis=1)
        df['rolling returns'] = df[oo].pct_change(k) # so sánh thay đổi ở ngày t với k ngày trước đó (tức t - k)
        df['signal'] = 0.
        for x, v in enumerate(df['rolling returns']):
            if flag_h != 0:
                # Bỏ qua giai đoạn hold, tránh bị tính lặp lại
                flag_h = flag_h - 1
                continue
            # Bỏ qua thời gian cty chưa được lên sàn (nêu có)
            if df[oo].isnull().iloc[x] == False:
                # bỏ qua k ngày đầu vì chưa đủ k lookback
                if flag_k != 0:
                    flag_k = flag_k - 1
                    continue
            else: continue
            try:
                if df['rolling returns'].iloc[x-1] < tolerance:
                    for h_period in range(0,h):
                        # rolling return < 0, short rồi giữ trong h ngày, tính pnl, leverage///
                        df['signal'].iloc[x + h_period] = -1
                
                elif df['rolling returns'].iloc[x-1] > tolerance:
                    for h_period in range(0,h):
                        # rolling return > 0, long rồi giữ trong h ngày, tính pnl, leverage///
                        df['signal'].iloc[x + h_period] = 1

            except:pass
            

            # Đặt flag holding là h - 1, để qua vòng for mới bỏ qua ngày hold, tránh bị tính lặp lại
            if df['rolling returns'].iloc[x-1] != tolerance: flag_h = h - 1

        signal = pd.concat([signal, df['signal']], axis=1)

    signal.columns = data.columns
    
    return signal

In [15]:
def backtest(data,signal,k,h,  vol_flag = 1, target_vol = 0.2, ignore_na = False, adjust = True, com = 60, min_periods = 0):
    
    pnl = pd.DataFrame(index=data.index)
    leverage = pd.DataFrame(index = data.index)

    # gọi hàm Volatility scale
    daily_index = Volatility_scale(data,ignore_na=ignore_na,
                          adjust=adjust,
                          com=com,   
                          min_periods = min_periods)

    company = data.columns

    # Volatility settings
    vol_flag = vol_flag    # Set flag to 1 for vol targeting
    if vol_flag == 1:
        target_vol = target_vol 
    else:
        target_vol = 'no target vol'
    

    for oo in company:
        flag_h = 0
        flag_k = k+1
        df = pd.concat([daily_index[oo], daily_index[oo+"_Vol"]], axis=1)

        df['pnl'] = 0. 
        df['leverage'] = 0.
        for x, v in enumerate(df['pnl']):
            if flag_h != 0:
                # Bỏ qua giai đoạn hold, tránh bị tính lặp lại
                flag_h = flag_h - 1
                continue
            # Bỏ qua thời gian cty chưa được lên sàn (nêu có)
            if df[oo].isnull().iloc[x] == False:
                # bỏ qua k ngày đầu vì chưa đủ k lookback
                if flag_k != 0:
                    flag_k = flag_k - 1
                    continue
            else: continue
            try:
                if signal[oo].iloc[x] == -1:
                    for h_period in range(0,h):
                        if vol_flag == 1:
                            df['pnl'].iloc[x + h_period] = (1 - df[oo].iloc[x + h_period] / df[oo].iloc[x - 1 + h_period]) * \
                                target_vol / df[oo+"_Vol"].iloc[x -1] 
                            df['leverage'].iloc[x + h_period] = target_vol / df[oo+"_Vol"].iloc[x -1]
                        else:
                            df['pnl'].iloc[x + h_period] = (1 - df[oo].iloc[x + h_period] / df[oo].iloc[x - 1 + h_period])
                            df['leverage'].iloc[x+h_period] = 1
                elif signal[oo].iloc[x] == 1:
                    for h_period in range(0,h):
                        if vol_flag == 1:
                            df['pnl'].iloc[x + h_period] = (df[oo].iloc[x + h_period] / df[oo].iloc[x - 1 + h_period] - 1) * \
                                    target_vol / df[oo+"_Vol"].iloc[x - 1]
                            df['leverage'].iloc[x+h_period] = target_vol / df[oo+"_Vol"].iloc[x -1]
                        else:
                            df['pnl'].iloc[x + h_period] = (df[oo].iloc[x + h_period] / df[oo].iloc[x - 1 + h_period] - 1)
                            df['leverage'].iloc[x+h_period] = 1
            except:pass
            
            if signal[oo].iloc[x] == 1 or signal[oo].iloc[x] == -1 : flag_h = h - 1


        leverage = pd.concat([leverage, df['leverage']], axis = 1)
        pnl = pd.concat([pnl, df['pnl']], axis=1)

    pnl.columns = data.columns
    leverage.columns = data.columns

    return [pnl,leverage]

In [16]:
def strategy_daily_return(pnl):
    
    return pnl.mean(skipna = False, axis=1)

In [17]:
train_data = pd.read_csv('Main File/TSMOM/Data/data_close.csv' , index_col= 'Date')
test_data = EU_Stock_data()

k = 10
h = 3

model_name = ['classic_TSMOM','train_decision_tree','train_xgboost','train_MLP_supervised','train_lasso_supervised','train_MLP_sharpeLoss','train_Lasso_sharpeLoss']

for model in model_name:
    func = globals()[model]
    if model == 'classic_TSMOM':
        signal = func(test_data,k,h)
        # signal.to_csv("signal_" + str(model) + ".csv")
        pnl = strategy_daily_return(backtest(test_data,signal,k,h)[0])
    else:
        temp_model = func(train_data,k,h)
        signal = test_model_TSMOM(test_data,temp_model,k,h)
        # signal.to_csv("signal_" + str(model) + ".csv")
        pnl = strategy_daily_return(backtest(test_data,signal,k,h)[0])
    try:
        temp = pnl.to_list()
        temp.insert(0,model)
        stats.loc[len(stats.index)] = temp
    except:
        index = pnl.index.to_list()
        index.insert(0,'Model')
        stats = pd.DataFrame(columns = index)
        temp = pnl.to_list()
        temp.insert(0,model)
        stats.loc[len(stats.index)] = temp

Epoch 1/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.6863 - val_loss: 0.6780
Epoch 2/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6806 - val_loss: 0.6765
Epoch 3/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6800 - val_loss: 0.6751
Epoch 4/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6777 - val_loss: 0.6740
Epoch 5/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.6763 - val_loss: 0.6720
Epoch 6/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6747 - val_loss: 0.6701
Epoch 7/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6729 - val_loss: 0.6678
Epoch 8/100
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.6703 - val_loss: 0.6662
Epoch 9/100
[1m172/172[0m [32

In [18]:
stats

Unnamed: 0,Model,2024-02-01 00:00:00+01:00,2024-02-02 00:00:00+01:00,2024-02-05 00:00:00+01:00,2024-02-06 00:00:00+01:00,2024-02-07 00:00:00+01:00,2024-02-08 00:00:00+01:00,2024-02-09 00:00:00+01:00,2024-02-12 00:00:00+01:00,2024-02-13 00:00:00+01:00,...,2024-07-18 00:00:00+02:00,2024-07-19 00:00:00+02:00,2024-07-22 00:00:00+02:00,2024-07-23 00:00:00+02:00,2024-07-24 00:00:00+02:00,2024-07-25 00:00:00+02:00,2024-07-26 00:00:00+02:00,2024-07-29 00:00:00+02:00,2024-07-30 00:00:00+02:00,2024-07-31 00:00:00+02:00
0,classic_TSMOM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000981,-0.001922,0.004395,0.00224,-0.001821,-0.003493,0.00017,-0.002985,0.003105,-0.000299
1,train_decision_tree,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.001602,0.000123,0.000605,0.000959,0.000364,0.001001,-0.002191,-0.00202,0.000899,0.000356
2,train_xgboost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.001753,-0.001822,0.004502,0.001206,-0.004014,-0.003401,0.004312,-0.000858,0.003915,0.000627
3,train_MLP_supervised,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00038,-0.007188,0.010859,-0.000303,-0.005182,-0.005574,0.008977,-0.005604,0.005657,0.000217
4,train_lasso_supervised,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001188,-0.00996,0.013647,-0.000745,-0.007363,-0.006722,0.009322,-0.008151,0.005958,0.001912
5,train_MLP_sharpeLoss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000532,-0.008064,0.011719,-0.000438,-0.005842,-0.00523,0.009574,-0.006512,0.004798,0.000726
6,train_Lasso_sharpeLoss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001188,-0.00996,0.013647,-0.000745,-0.007363,-0.006722,0.009322,-0.008151,0.005958,0.001912


In [19]:
stats.to_csv("result.csv")