In [1]:
import pandas as pd
import numpy as np
import torch
import pandas as pd
import random
import os
import glob


# Small constant added to both numerator and denominator when each technical
# indicator is divided by its previous-day value in return_numpy_data().
eps = 1e-8 #needed for numerical stability
# Minimum number of rows a ticker must have inside the date range to be kept by splitdata().
L = 756 #length of dataframe for date range of interest

import warnings
# Hide every RuntimeWarning for the rest of the notebook
# (presumably the divide-by-zero / invalid-value warnings raised while building the features).
warnings.filterwarnings('ignore', category=RuntimeWarning)
# Switch off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None 

In [2]:
def splitdata():
    """
    Splits data into train, val, test for model development

    CSV files left in the three output folders by a previous run are removed
    first (the folders are created if they do not exist), then one CSV per
    accepted ticker is written to model_data/{val,test,train}_data/.

    Only rows dated 2014-01-01 .. 2016-12-31 are considered. A ticker is
    accepted when, inside that range, min(Low) >= 1.0, min(Volume) >= 500.0
    and it has at least L rows.

    - acl18 tickers (tick_lst/acl18.txt): rows with missing values are dropped
      before the checks. 44 tickers are sampled with seed 45 as validation
      candidates; every other accepted acl18 ticker goes to the test set.
    - all other tickers found in tick_data/ go to the training set.
      NOTE(review): rows with missing values are NOT dropped for training
      tickers - confirm this difference is intended.

    Parameters:
    None

    Returns:
    None
    """

    date_start, date_end = '2014-01-01', '2016-12-31'

    def _rows_in_range(raw_df):
        # Dates are compared as text - assumes the Date column holds 'YYYY-MM-DD' strings
        return raw_df[(raw_df['Date'] >= date_start) & (raw_df['Date'] <= date_end)]

    def _is_accepted(df):
        # An empty frame yields NaN minima, so the comparisons are False and the ticker is rejected
        return df.Low.min() >= 1.0 and df.Volume.min() >= 500.0 and len(df) >= L

    # Build the output folders portably (os.path.join instead of hard-coded '\\' separators)
    out_dirs = {
        split: os.path.join('model_data', f'{split}_data')
        for split in ('test', 'val', 'train')
    }

    # Delete CSV files in the output folders so stale tickers from an earlier run cannot leak in
    for folder in out_dirs.values():
        os.makedirs(folder, exist_ok=True)
        for stale_file in glob.glob(os.path.join(folder, '*.csv')):
            os.remove(stale_file)

    random.seed(45)

    #Get tickers in acl18 dataset; 44 are sampled as validation candidates, the rest are test candidates
    table_df = pd.read_csv('tick_lst/acl18.txt', sep='\t')
    table_lst = table_df.Symbol.to_list()
    val_ticks = set(random.sample(table_lst, 44))
    acl18_ticks = set(table_lst)

    #Save files into test and validation folders
    for tick in table_lst:
        symbol = tick[1:]  # ticker symbols in acl18.txt carry a one-character prefix that the file names lack
        raw_df = pd.read_csv(f'acl18/raw/{symbol}.csv')
        #drop rows that have missing data within date range (a new frame, not an in-place edit of a slice)
        df = _rows_in_range(raw_df).dropna()
        if not _is_accepted(df):
            #Only select tickers that have data covering the entire date range
            continue
        target = 'val' if tick in val_ticks else 'test'
        df.to_csv(os.path.join(out_dirs[target], f'{symbol}.csv'))

    #Save files into training folder
    #Added additional preprocessing
    #1. Eliminate tickers in validation and test sets (i.e. every acl18 ticker)
    #2. Eliminate tickers if daily trading volume falls below 500
    #3. Eliminate tickers if low price falls below $1
    #4. Eliminate tickers that have less than L datapoints
    for fle in glob.glob(os.path.join('tick_data', '*.csv')):
        # basename/splitext work with both '/' and '\\' separators, unlike splitting on '\\'
        symbol = os.path.splitext(os.path.basename(fle))[0]
        tick = '$' + symbol

        if tick in acl18_ticks:
            continue

        df = _rows_in_range(pd.read_csv(fle))
        if _is_accepted(df):
            df.to_csv(os.path.join(out_dirs['train'], f'{symbol}.csv'))

def create_lag(arry, lag = 5):
    """
    Create lagged features

    Every pass prepends the previous time step's features to each row and
    drops the first time step, so after `lag` passes a row holds
    [features(t-lag), ..., features(t-1), features(t), label(t)].

    The number of feature columns is taken from the input (all columns except
    the last one, which is the truth label) instead of being fixed at 16, so
    the label column is never copied into the lagged features when the feature
    set changes. For a 17-column input the result is unchanged.

    Parameters:
    arry (numpy array): Array of shape (N_tickers, N_days, N_features + 1) from which to create features
    lag (int, optional): Lag value for features. Default is 5.

    Returns:
    Numpy array of shape (N_tickers, N_days - lag, N_features * (lag + 1) + 1).
    The last element in the features is the truth label
    (arry[:,:,-1] is truth label)
    """

    # Width of one feature group, measured once on the un-lagged input
    n_features = arry.shape[2] - 1

    for _ in range(lag):
        # The leading n_features columns always hold the oldest feature group added so far;
        # shifting them by one row yields the next-older group.
        arry = np.concatenate((arry[:, :-1, :n_features], arry[:, 1:, :]), axis=2)
    return arry


In [3]:
def calculate_rsi(data, period=14):
    """
    Calculates the Relative Strength Index (RSI) for a given Pandas Series of closing prices.

    Gains and losses are averaged with a plain rolling mean over `period`
    observations, so the first `period - 1` values of the result are NaN.

    Parameters:
    data (pd.Series): A Pandas Series representing the closing prices of an asset.
    period (int, optional): The period over which to calculate the RSI. Default is 14.

    Returns:
    pd.Series: A Pandas Series containing the RSI values.
    """
    def _window_mean(series):
        # Full windows only: fewer than `period` observations gives NaN
        return series.rolling(window=period, min_periods=period).mean()

    change = data.diff()
    # Positive moves count as gains, negative moves (sign flipped) as losses;
    # everything else, including the leading NaN from diff(), becomes 0.
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    relative_strength = _window_mean(ups) / _window_mean(downs)
    return 100 - (100 / (1 + relative_strength))


def calculate_bollinger_bands_percent(data, period=20, sigma=2):
    """
    Calculates the Bollinger Bands Percentage (%B) for a given Pandas Series of closing prices.

    %B is the position of the price inside the band: 0 on the lower band,
    1 on the upper band. The first `period - 1` values are NaN.

    Parameters:
    data (pd.Series): A Pandas Series representing the closing prices of an asset.
    period (int, optional): The rolling window used for the mean and standard deviation. Default is 20.
    sigma (int or float, optional): Number of rolling standard deviations between the mean and each band. Default is 2.

    Returns:
    pd.Series: A Pandas Series containing the %B values.
    """
    window = data.rolling(period)
    centre = window.mean()
    spread = window.std()

    band_top = centre + sigma * spread
    band_bottom = centre - sigma * spread
    band_width = band_top - band_bottom

    return (data - band_bottom) / band_width


def calculate_stochastic_momentum_indicator(data, k_period=5, d_period=10, signal_period=5):
    """
    Calculates the Stochastic Momentum Indicator (SMI) for a given Pandas Series of closing prices.

    The distance of the price from the midpoint of its rolling high/low range,
    and the range itself, are each smoothed twice with an EMA; the SMI is
    100 times their ratio. The signal line is an EMA of the SMI.

    Parameters:
    data (pd.Series): A Pandas Series representing the closing prices of an asset.
    k_period (int, optional): Rolling window for the highest/lowest price. Default is 5.
    d_period (int, optional): Span of the EMA applied twice to numerator and denominator. Default is 10.
    signal_period (int, optional): Span of the EMA that produces the signal line. Default is 5.

    Returns:
    tuple of pd.Series: (smi, signal) - the SMI values and their signal line.
    """
    # https://www.tradingview.com/support/solutions/43000707882-stochastic-momentum-index-smi/
    # https://www.amcharts.com/stochastic-momentum-index-indicator/
    def _smooth_twice(series):
        once = series.ewm(span=d_period, adjust=False).mean()
        return once.ewm(span=d_period, adjust=False).mean()

    highest = data.rolling(k_period).max()
    lowest = data.rolling(k_period).min()

    price_range = highest - lowest
    offset_from_mid = data - (highest + lowest) / 2

    smoothed_range = _smooth_twice(price_range)
    smoothed_offset = _smooth_twice(offset_from_mid)

    smi = 100 * (smoothed_offset / smoothed_range)
    signal = smi.ewm(span=signal_period, adjust=False).mean()
    return smi, signal


In [4]:
#This code does several things
# 1. It reads every ticker CSV file found in the given folder
# 2. It calculates the following indicators for each ticker (20 and 100 day EMA, MACD, MACD signal, RSI, CCI, BOLL, MA5/MA10, MTM6/MTM12, ROC, SMI)
# 3. In order to scale all ticker indicators to the same range, it takes the ratio between consecutive days (e.g., (RSI(t) + eps) / (RSI(t-1) + eps))
# 4. Note: no log transform is applied to these ratios in the code below; the ratios are used as they are
# 5. It normalizes the open, high, low, and close price by the previous day's close price, ensuring the normalized share prices for all tickers are in the same range for training stability
# 6. It normalizes the volume by the previous day's volume
# 7. It calculates the label (Truth_lbl), which is the next day's close price divided by today's close price (a value above 1 means the close price increases the next day)
# 8. It converts each ticker's table to a numpy array and combines the arrays along a new first axis
# Final numpy array is shape (Ntickers, Ndays, Nfeatures)
# Ntickers = number of tickers
# Ndays = number of days
# Nfeatures = number of features + 1 truth label
# Features+label are EMA20dif, EMA100dif, RSIdif, MACDdif, MACDSignaldif, CCIdif, BOLLdif, MA5_MA10dif, MTM6_MTM12dif, ROCdif, SMIdif, Opendif, Highdif, Lowdif, Closedif, Volumedif, Truth_lbl
# 'dif' indicates the normalized value (e.g., EMA20dif)
# Ignore runtime warning

def _ticker_features(ticker_df):
    """
    Builds the normalized feature/label table for a single ticker.

    Parameters:
    ticker_df (pd.DataFrame): Raw rows of one ticker; the Open, High, Low, Close and Volume columns are read.

    Returns:
    pd.DataFrame: One row per day with the 11 '<indicator>dif' columns, Opendif, Highdif,
    Lowdif, Closedif, Volumedif and Truth_lbl (last column). Rows containing NaN are dropped.
    """
    ticker_temp2 = ticker_df.copy()

    #calculate the technical indicators
    indicators = ['EMA20','EMA100', 'RSI', 'MACD','MACDSignal', 'CCI', 'BOLL', 'MA5_MA10', 'MTM6_MTM12', 'ROC', 'SMI']

    # EMAs and MACD
    ticker_temp2['EMA12'] = ticker_temp2['Close'].ewm(span=12, adjust=False).mean()
    ticker_temp2['EMA26'] = ticker_temp2['Close'].ewm(span=26, adjust=False).mean()
    ticker_temp2['EMA20'] = ticker_temp2['Close'].ewm(span=20, adjust=False).mean()
    ticker_temp2['EMA100'] = ticker_temp2['Close'].ewm(span=100, adjust=False).mean()
    ticker_temp2['MACD'] = ticker_temp2['EMA12'] - ticker_temp2['EMA26']
    ticker_temp2['MACDSignal'] = ticker_temp2['MACD'].ewm(span=9, adjust=False).mean()

    # RSI
    ticker_temp2['RSI'] = calculate_rsi(ticker_temp2['Close'])

    # CCI
    # NOTE(review): p_t_MAD is a single number averaged over the WHOLE series (not a rolling
    # mean deviation), so it includes future days; it largely cancels in the day-over-day
    # ratio taken below, but confirm this is the intended definition.
    ticker_temp2['p_t'] = (ticker_temp2['High'] + ticker_temp2['Low'] + ticker_temp2['Close']) / 3
    ticker_temp2['p_t_MAD'] = (ticker_temp2['p_t'] - ticker_temp2['p_t'].rolling(20).mean()).abs().mean()
    ticker_temp2['CCI'] = (ticker_temp2['p_t'] - ticker_temp2['p_t'].rolling(20).mean()) / (0.015 * ticker_temp2['p_t_MAD'])

    # ATR (computed but not listed in `indicators`, so it does not reach the output)
    ticker_temp2['Close_shift1'] = ticker_temp2['Close'].shift(1)
    ticker_temp2['TR'] = ticker_temp2[['High', 'Close_shift1']].max(axis=1) - ticker_temp2[['Low', 'Close_shift1']].min(axis=1)
    ticker_temp2['ATR'] = ticker_temp2['TR'].ewm(span=10, adjust=False).mean()

    # BOLL
    ticker_temp2['BOLL'] = calculate_bollinger_bands_percent(ticker_temp2['Close'])

    # MA5/MA10
    ticker_temp2['MA5'] = ticker_temp2['Close'].rolling(5).mean()
    ticker_temp2['MA10'] = ticker_temp2['Close'].rolling(10).mean()
    ticker_temp2['MA5_MA10'] = ticker_temp2['MA5'] / ticker_temp2['MA10']

    # MTM6/MTM12
    ticker_temp2['MTM6'] = ticker_temp2['Close'] / ticker_temp2['Close'].shift(6)
    ticker_temp2['MTM12'] = ticker_temp2['Close'] / ticker_temp2['Close'].shift(12)
    ticker_temp2['MTM6_MTM12'] = ticker_temp2['MTM6'] / ticker_temp2['MTM12']

    # ROC
    ticker_temp2['ROC'] = 100 * (ticker_temp2['Close'] - ticker_temp2['Close'].shift(10)) / ticker_temp2['Close'].shift(10)

    # SMI (the signal line is stored but not used as a feature)
    ticker_temp2['SMI'], ticker_temp2['SMI_signal'] = calculate_stochastic_momentum_indicator(ticker_temp2['Close'])

    # normalize the technical indicators: ratio to the previous day's value, eps on both sides
    for i in indicators:
        ticker_temp2[i+'dif'] = (ticker_temp2[i] + eps) / (ticker_temp2[i].shift(1) + eps)
    # .copy() so the columns added below go to an independent frame rather than a slice
    ticker_log = ticker_temp2[[i+'dif' for i in indicators]].copy()

    #normalize the share prices and volume (OHLCV)
    ticker_log['Opendif'] = ticker_temp2['Open']/ticker_temp2['Close'].shift(1)
    ticker_log['Highdif'] = ticker_temp2['High']/ticker_temp2['Close'].shift(1)
    ticker_log['Lowdif'] = ticker_temp2['Low']/ticker_temp2['Close'].shift(1)
    ticker_log['Closedif'] = ticker_temp2['Close']/ticker_temp2['Close'].shift(1)
    ticker_log['Volumedif']  = ticker_temp2['Volume']/ticker_temp2['Volume'].shift(1)

    #Calculate the label for each day (next day's close relative to today's close)
    ticker_log['Truth_lbl'] = ticker_temp2['Close'].shift(-1)/ticker_temp2['Close']

    # Rolling windows, shift(1) and shift(-1) leave NaN at both ends of the series
    return ticker_log.dropna()


def return_numpy_data(flepth):
    """
    Calculates the features and truth label for the model

    Every CSV file in `flepth` is turned into a (N_days, N_features + 1) table and
    the tables are stacked along a new first axis. Files are processed in sorted
    order so the ticker order of the result is reproducible.

    Parameters:
    flepth (str): file path to location of csv data files

    Returns:
    numpy array: Array of shape (N_tickers, N_days, N_features + 1) containing the
    features and the truth label. The truth label is index [-1]

    Raises:
    FileNotFoundError: if the folder contains no CSV files.
    ValueError: if a ticker produces a table whose shape differs from the first
    ticker's. Previously this case was swallowed by a bare `except` and silently
    discarded every ticker accumulated so far.
    """
    csv_files = sorted(glob.glob(f"{flepth}/*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in '{flepth}'")

    #loop through all the tickers, collecting one 2-D array per ticker
    per_ticker = []
    for fle in csv_files:
        ticker_values = _ticker_features(pd.read_csv(fle)).values

        if per_ticker and ticker_values.shape != per_ticker[0].shape:
            raise ValueError(
                f"Feature table for '{fle}' has shape {ticker_values.shape}, "
                f"expected {per_ticker[0].shape} (taken from '{csv_files[0]}')"
            )
        per_ticker.append(ticker_values)

    # Stack once at the end instead of re-concatenating the growing array on every iteration
    return np.stack(per_ticker, axis=0)

    


In [5]:
#Split the data into train, val, test
splitdata()

#Create the features for the train, val, test datasets
#(paths are built with os.path.join so the cell also runs outside Windows)
train_ary = return_numpy_data(os.path.join('model_data', 'train_data'))
val_ary = return_numpy_data(os.path.join('model_data', 'val_data'))
test_ary = return_numpy_data(os.path.join('model_data', 'test_data'))

#Create lag features
train_ary = create_lag(train_ary, lag = 5)
val_ary = create_lag(val_ary, lag = 5)
test_ary = create_lag(test_ary, lag = 5)

#Feature count is read from the array (last column is the label) so the message cannot go stale
n_features = train_ary.shape[2] - 1

print("training, test, and val arrays are created")
print('---------------\n')

print("Array shape is (N_tickers, N_trading_days, N_features + label)")
print(f"There are {n_features} features and 1 label")
print('Label is array[:,:,-1], N_features are array[:,:,:-1]')
print('---------------\n')

print("train data shape:", train_ary.shape)
print("test data shape:", test_ary.shape)
print("val data shape:", val_ary.shape)


training, test, and val arrays are created
---------------

Array shape is (N_tickers, N_trading_days, N_features + label)
There are 96 features and 1 label
Label is array[:,:,-1], N_features are array[:,:,:-1]
---------------

train data shape: (333, 730, 97)
test data shape: (40, 730, 97)
val data shape: (40, 730, 97)
