In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%run utils.py

import math
import json

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV

In [None]:
# Report the physical devices TensorFlow can see, GPU first and then CPU.
# NOTE(review): `tf` is not imported in this notebook's import cell; presumably
# it is brought into scope by `%run utils.py` — confirm.
for device_kind in ('GPU', 'CPU'):
    print(tf.config.list_physical_devices(device_kind))

# Auxiliary functions

In [None]:
def invest(day, day20, day21, day22, total, total_asset, position_dict, top_stocks,
           returns=None, initial_run=None):
    '''
    Rebalances the portfolio into `top_stocks` and applies one day of returns.

    Positions not in `top_stocks` are sold, the proceeds (plus the starting
    capital on the very first day) are split equally among the newly bought
    stocks, and every stock in `top_stocks` is then marked to market with its
    `ret_d` on `day20`.

    Inputs:
        day: index of the day inside the current test window (0 = first day)
        day20: date used to look up `ret_d` for each held stock
        day21: buy date; kept for interface compatibility, not used in the computation
        day22: date printed as the day on which the new total applies
        total: total asset value before this step
        total_asset: list of historical totals; the new total is appended in place
        position_dict: {ticker: position value}; mutated in place
        top_stocks: tickers to hold after rebalancing
        returns: DataFrame with 'datadate', 'tic' and 'ret_d' columns;
                 defaults to the notebook-level `x_test`
        initial_run: True when the portfolio starts from cash only;
                     defaults to the notebook-level `first_run`
    Output:
        (total_asset, total, position_dict) after the step
    Raises:
        IndexError: if `returns` has no row for a ticker in `top_stocks` on `day20`
    '''

    # Fall back to the notebook globals so existing call sites keep working
    if returns is None:
        returns = x_test
    if initial_run is None:
        initial_run = first_run

    # Only the very first day of the very first run starts from cash. All of the
    # capital is uninvested then (the original hard-coded 1, the initial total).
    uninvested = total if (day == 0 and initial_run) else 0

    wanted = set(top_stocks)
    to_hold = [key for key in position_dict if key in wanted]
    to_delete = [key for key in position_dict if key not in wanted]

    # Sell: free up the value of every position that dropped out of the top list
    for key in to_delete:
        uninvested += position_dict.pop(key)

    # Buy: split the freed cash equally among the stocks actually being bought.
    # Dividing by the number of purchases (instead of a hard-coded 10) keeps the
    # portfolio fully invested for any portfolio size.
    held = set(to_hold)
    new_buys = [tic for tic in dict.fromkeys(top_stocks) if tic not in held]
    if new_buys:
        allocation = uninvested / len(new_buys)
        for tic in new_buys:
            position_dict[tic] = allocation

    # Calculate return right away; filter the day once instead of per ticker
    day_rows = returns[returns['datadate'] == day20]
    for key in top_stocks:
        matches = day_rows.loc[day_rows['tic'] == key, 'ret_d'].values
        if len(matches) == 0:
            raise IndexError(f'No ret_d found for {key} on {day20}')
        percent_change = matches[0]
        total += position_dict[key] * percent_change
        # This calculates the position on the next day
        position_dict[key] = position_dict[key] * (1 + percent_change)

    total_asset.append(total)
    print(f'Total asset on {day22} will be {total}')
    # `total` should equal sum(position_dict.values()) at this point

    return total_asset, total, position_dict

# Baseline for 2019-2024 data

In [None]:
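# For importing processed data: define `processed_data_path` (path to the
# processed CSV) before running the next cell, which passes it to `pd.read_csv`.
# processed_data_path = file_path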

In [None]:
# Full candidate lists, before any pruning
basic_candidates = [
    'cshtrd', 'prccd', 'prchd', 'prcld', 'prcod', 'dol_vol',
    'Mom_2day', 'Mom_3day', 'Mom_5day',
    'MA_10day', 'MA_50day', 'open/MA10', 'open/MA50',
    'STD_10day', 'H-L', 'RSI', 'MACD', 'MACD_Signal_Line',
]

TA_candidates = [
    # Momentum indicators
    'momentum_stoch_rsi', 'momentum_stoch', 'momentum_ao', 'momentum_pvo', 'momentum_kama', 'momentum_wr',
    # Volume indicators
    'volume_adi', 'volume_em', 'volume_fi', 'volume_cmf', 'volume_vpt',
    # Volatility indicators
    'volatility_atr', 'volatility_bbh', 'volatility_dcw', 'volatility_ui',
    # Trend indicators
    'trend_adx', 'trend_aroon_up', 'trend_aroon_down', 'trend_ichimoku_a',
    # Other indicators
    'others_dr',
]

# Factors excluded from each family, grouped by the reason for dropping them
basic_dropped = {
    # low variance
    'dol_vol',
    # high correlation with the opening price
    'prccd', 'prcld', 'prchd', 'prcod', 'MA_10day',
}
TA_dropped = {
    # high correlation with the opening price
    'trend_ichimoku_a', 'volatility_bbh', 'momentum_kama',
    # high correlation with momentum_stoch
    'momentum_wr',
}

# Filter while keeping the original ordering of the surviving factors
basic_factors = [factor for factor in basic_candidates if factor not in basic_dropped]
TA_factors = [factor for factor in TA_candidates if factor not in TA_dropped]

factors = basic_factors + TA_factors
print(f'There are {len(basic_factors)} basic factors')
print(f'There are {len(TA_factors)} TA factors')
print(f'There are {len(factors)} factors')

# Load the processed dataset from CSV
data = pd.read_csv(processed_data_path)
all_days = list(data['datadate'].unique())
num_of_tokens = data['sector'].nunique()

# Ticker <-> number lookup tables, built from the full frame.
# NOTE(review): these are created before remove_dead_stocks() filters the data;
# confirm the numbering is still what downstream code expects after filtering.
num_to_tic_dict, tic_to_num_dict = num_tic_dicts(data)

# Drop dead stocks, then attach class labels using the 'fixed_thres' scheme
data = assign_class_labels(remove_dead_stocks(data), 'fixed_thres')

# Keep only identifiers, the selected factors, and the return/label columns
kept_columns = ['datadate', 'tic'] + factors + ['ret_d', 'TBill1y', 'rel_ret_d', 'rank', 'sector']
data = data[kept_columns]

data

In [None]:
# Distinct trading days in the (filtered) dataset
all_days = list(data['datadate'].unique())
num_of_ts = len(all_days)
print(f'There are {num_of_ts} days in the dataset')

# Distinct tickers in the (filtered) dataset
tickers = list(data['tic'].unique())
nt = len(tickers)
print(f'There are {nt} tickers')
# The row count must equal tickers x days, i.e. one row per ticker per day
assert nt * num_of_ts == len(data.index)

# Window configuration
seq_length = 20                   # Length of time-series
train_length = 200                # Length of training data
ftd = train_length                # First train day
ltd = ftd + train_length - 1      # Last train day
num_stocks = 10                   # Choose the top {num_stocks} each day
num_of_models = 3

# Number of whole test windows that fit after reserving 2 * train_length days
num_iters = (num_of_ts - 2 * train_length) // seq_length
print(f'There are {num_iters} iterations')

In [None]:
# Map each date to its 'TBill1y' value (risk-free rate for the SR calculation).
# Dates repeat once per ticker, so later rows overwrite earlier ones for the same date.
data_to_rfr_dict = dict(zip(data['datadate'], data['TBill1y']))

## Do regression once

In [None]:
# Set factors: four hand-picked factor pairs, then the full basic-factor list
# and the complete factor list (both reused by reference, not copied)
factor_pairs = [
    ['Mom_3day', 'cshtrd'],
    ['Mom_3day', 'RSI'],
    ['Mom_3day', 'MA_50day'],
    ['Mom_2day', 'Mom_5day'],
]
factors_list = [*factor_pairs, basic_factors, factors]

In [None]:
# Define regression methods
# Candidate regularization strengths searched by the cross-validated estimators
alphas = [1e-3, 5e-3, 1e-2, 5e-2, 0.1, 0.5, 1, 10, 100, 500, 1000]
print(f'Set of regularization parameters: {alphas}')

# Name -> estimator; ridge and lasso pick alpha by 10-fold cross-validation
regression_methods = {
    'OLS': LinearRegression(),
    'ridge': RidgeCV(alphas=alphas, cv=10),
    'lasso': LassoCV(alphas=alphas, cv=10),
}
names_list = list(regression_methods.keys())
print(f'Regression methods used are {names_list}')
methods_list = list(regression_methods.values())
assert len(names_list) == len(methods_list)

In [None]:
# Window boundaries (indices into all_days) for the first train/test split
ftd = 200        # first train day
ltd = 399        # last train day
ftestd = 400     # first test day
ltestd = 419     # last test day
seq_length = 20
graph = True
num_stocks = 10
first_run = True  # read by invest(): the portfolio starts from cash on day 0

# One simulation per (method, factor set), keyed as '<method>_<factor set index>'
model_keys = [f'{name}_{i}' for name in names_list for i in range(len(factors_list))]

# Every simulation starts with a total asset of 1 and no open positions
total_dict = {model_key: 1 for model_key in model_keys}
total_asset_dict = {model_key: [1] for model_key in model_keys}
position_dict_all = {model_key: {} for model_key in model_keys}

# Initialize linear regression dictionaries.
# Every factor set of one method points at the same estimator object.
regressor_dict = {
    f'{name}_{j}': method
    for name, method in zip(names_list, methods_list)
    for j in range(len(factors_list))
}

# Get data for this train/test period
# y_train and y_test are always the same
data_train, y_train, data_test, y_test = prep_train_test_data_regression(data, seq_length, ftd, ltd, all_days, factors)

In [None]:
# Fit, predict, simulate
for j in range(len(factors_list)):

    # obtain the correct train/test sets
    reg_factors = factors_list[j]
    x_train = np.array(data_train[reg_factors])
    # x_test is also read as a global by invest() to look up realized returns
    x_test = data_test[reg_factors + ['datadate', 'tic', 'ret_d']]

    for i in range(len(names_list)):
        name = names_list[i]
        model_key = f'{name}_{j}'

        # fit OLS, ridge, lasso
        # NOTE: fit() returns the estimator itself, so every factor set of one
        # method shares a single object that is refitted here right before use
        regressor_dict[model_key] = methods_list[i].fit(x_train, y_train)

        for day in range(seq_length):

            test_day = all_days[ftestd + day]
            buy_day = all_days[ftestd + day + 1]
            simul_day = all_days[ftestd + day + 2]
            day_rows = x_test[x_test['datadate'] == test_day]
            x_test_temp = np.array(day_rows[reg_factors])

            # do regression with {name} on {factors_list[j]}
            y_pred = np.ravel(regressor_dict[model_key].predict(x_test_temp))
            top_indices = np.argsort(y_pred)[-num_stocks:]
            # argsort positions index the rows of day_rows, so read the tickers
            # from those same rows rather than from a separately built lookup
            # table that may not line up with the filtered data
            top_stocks = day_rows['tic'].iloc[top_indices].tolist()
            print(f'top_stocks by {name} factors {j} to buy on {buy_day} are {top_stocks}')
            # Simulate investment
            total_asset, total, position_dict = invest(day, test_day, buy_day, simul_day, total_dict[model_key],
                                                        total_asset_dict[model_key], position_dict_all[model_key], top_stocks)
            total_dict[model_key] = total
            total_asset_dict[model_key] = total_asset
            position_dict_all[model_key] = position_dict

# Plot return graphs: one figure per factor set, one line per regression method
plot_days = all_days[ftestd-1:ftestd+20]   # day before the test window through its last day
tick_positions = range(0, 22, 4)
tick_labels = [all_days[day_idx] for day_idx in range(ftestd-1, ftestd+20, 4)]

for j in range(len(factors_list)):

    plt.figure(figsize=(16, 6))

    for name in names_list:
        series_key = f'{name}_{j}'
        plt.plot(plot_days, total_asset_dict[series_key], label=series_key)

    plt.xticks(rotation=45)
    # Show every 4th date only, to keep the axis readable
    plt.xticks(ticks=tick_positions, labels=tick_labels)
    plt.legend()
    plt.grid()
    plt.show()


## Refit all regressors (OLS, ridge, lasso) every `seq_length` days

In [None]:
# Walk-forward simulation: slide the train/test windows forward by seq_length
# days per iteration, refit every regressor, then trade the next test window.
# NOTE: this continues from the state (total_dict, total_asset_dict,
# position_dict_all) left by the first iteration above; re-running this cell
# on its own keeps appending to that state.
ftd = 200
ltd = 399
seq_length = 20
num_stocks = 10
first_run = False  # read by invest(): positions carry over, no fresh cash on day 0
num_days = len(all_days)

for num_iter in range(1, num_iters):

    print(f'Running {num_iter + 1} of {num_iters} iterations. First iteration already finished')

    # Advance both windows by one test window
    ftd += seq_length
    ltd += seq_length
    ftestd = ltd + 1
    ltestd = ftestd + seq_length - 1

    data_train, y_train, data_test, y_test = prep_train_test_data_regression(data, seq_length, ftd, ltd, all_days, factors)

    # Fit, predict, simulate
    for j in range(len(factors_list)):

        # obtain the correct train/test sets
        reg_factors = factors_list[j]
        x_train = np.array(data_train[reg_factors])
        # x_test is also read as a global by invest() to look up realized returns
        x_test = data_test[reg_factors + ['datadate', 'tic', 'ret_d']]

        for i in range(len(names_list)):
            name = names_list[i]
            model_key = f'{name}_{j}'

            # fit OLS, ridge, lasso
            regressor_dict[model_key] = methods_list[i].fit(x_train, y_train)

            for day in range(seq_length):

                test_day = all_days[ftestd + day]
                # The buy/simulation dates are only used in printed messages. On
                # the final window they can fall past the end of the dataset, so
                # guard the lookups instead of raising IndexError.
                buy_idx = ftestd + day + 1
                simul_idx = ftestd + day + 2
                buy_day = all_days[buy_idx] if buy_idx < num_days else None
                simul_day = all_days[simul_idx] if simul_idx < num_days else None
                day_rows = x_test[x_test['datadate'] == test_day]
                x_test_temp = np.array(day_rows[reg_factors])

                # do regression with {name} on {factors_list[j]}
                y_pred = np.ravel(regressor_dict[model_key].predict(x_test_temp))
                top_indices = np.argsort(y_pred)[-num_stocks:]
                # argsort positions index the rows of day_rows, so read the
                # tickers from those same rows rather than from a separately
                # built lookup table that may not line up with the filtered data
                top_stocks = day_rows['tic'].iloc[top_indices].tolist()
                print(f'top_stocks by {name} factors {j} to buy on {buy_day} are {top_stocks}')
                # Simulate investment
                total_asset, total, position_dict = invest(day, test_day, buy_day, simul_day, total_dict[model_key],
                                                            total_asset_dict[model_key], position_dict_all[model_key], top_stocks)
                total_dict[model_key] = total
                total_asset_dict[model_key] = total_asset
                position_dict_all[model_key] = position_dict

    # Plot return graphs every 10th iteration and after the last one
    if num_iter % 10 == 0 or num_iter == num_iters-1:
        for j in range(len(factors_list)):

            plt.figure(figsize=(16, 6))
            # Dates covering the whole asset history accumulated so far
            x_axis = all_days[ltestd+1-len(total_asset_dict['OLS_0']):ltestd+1]

            for name in names_list:

                plt.plot(x_axis, total_asset_dict[f'{name}_{j}'], label=f'{name}_{j}')

            # Label 15 evenly spaced dates to keep the axis readable
            indices_to_display = np.linspace(0, len(x_axis)-1, 15, dtype=int)
            plt.xticks(indices_to_display, [x_axis[i] for i in indices_to_display], rotation=45)
            plt.ylabel('Total Asset')
            plt.xlabel('Date')
            plt.legend()
            plt.grid()
            plt.show()


In [None]:
# Save results
# with open('filename.json', 'w') as f:
#     json.dump(total_asset_dict, f)

In [None]:
# Risk-free rate series aligned with the simulated daily returns
last_simul_day = "2024-12-03"
# One 'TBill1y' value per date, up to and including the last simulated day
rfr = list(data[data['datadate'] <= last_simul_day][['datadate', 'TBill1y']].drop_duplicates(subset=['datadate'])['TBill1y'])
# total_asset_dict is keyed by strings such as 'OLS_0' (there is no key 0), and
# every simulation has the same history length, so take the length from any entry.
# An asset history of n points yields n - 1 daily returns.
num_returns = len(next(iter(total_asset_dict.values()))) - 1
# Guard the zero case: rfr[-0:] would return the whole list instead of nothing
rfr = rfr[-num_returns:] if num_returns > 0 else []
# NOTE(review): the values are used as per-day rates in calculate_SR; confirm
# 'TBill1y' is expressed as a daily rate and not an annualized yield.

def calculate_SR(total_asset, rfr):
    '''
    Computes the annualized Sharpe Ratio of a simulated strategy.

    Inputs:
        total_asset: list of total asset values, one per day
        rfr: list of risk-free rates, one per daily return
             (i.e. one element fewer than total_asset)
    Output:
        SR: mean excess return divided by the standard deviation of the raw
            daily returns, scaled by sqrt(252)
    '''

    # Simple return between each pair of consecutive days
    consecutive_days = zip(total_asset[:-1], total_asset[1:])
    daily_ret = [(today - yesterday) / yesterday for yesterday, today in consecutive_days]
    assert len(daily_ret) == len(rfr)

    # Return earned above the risk-free rate on each day
    excess_ret = [ret - rate for ret, rate in zip(daily_ret, rfr)]

    annualization = np.sqrt(252)
    SR = np.mean(excess_ret) / np.std(daily_ret) * annualization

    return SR

# Report the Sharpe Ratio of every simulated (method, factor set) combination
for model_key, asset_history in total_asset_dict.items():
    SR = calculate_SR(asset_history, rfr)
    print(f'SR for model {model_key} is {SR}')