In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
def calculate_rsi(data, period=14):
    """
    Compute the Relative Strength Index (RSI) of a closing-price series using
    simple rolling averages of the gains and losses.

    Parameters:
    data (pd.Series): Closing prices of an asset, in chronological order.
    period (int, optional): Rolling window length used for the average gain
        and average loss. Default is 14.

    Returns:
    pd.Series: RSI values aligned with ``data``. Positions without a full
        window are NaN.

    Notes:
    - The NaN produced by the first price difference is counted as a zero
      change, so the first non-NaN RSI appears at position ``period - 1``.
    - A window with gains but no losses yields 100; a window with neither
      gains nor losses yields NaN (0 / 0).
    """
    def _window_mean(series):
        # Plain (unweighted) rolling mean that requires a complete window.
        return series.rolling(window=period, min_periods=period).mean()

    change = data.diff()

    # Split the day-over-day change into its upward and downward parts;
    # anything that is not strictly up (or down) contributes zero.
    ups = change.where(change.gt(0), 0)
    downs = -change.where(change.lt(0), 0)

    mean_up, mean_down = _window_mean(ups), _window_mean(downs)

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

In [3]:
# Read the per-ticker report and keep only the tickers whose history has
# 2767 datapoints (noted as the maximum available); tickers with fewer
# datapoints are left out of the analysis.
rep_df = pd.read_csv("report.csv")
has_full_history = rep_df["Datapoints"].eq(2767)
repuse_df = rep_df.loc[has_full_history]
print("Number of tickers to use in analysis: ", len(repuse_df))

Number of tickers to use in analysis:  383


In [None]:
# Build the feature/label tensor for every ticker selected in `repuse_df`.
#
# For each ticker this cell:
# 1. Loads tick_data/<ticker>.csv (uses the Open, High, Low, Close and Volume columns).
# 2. Calculates the technical indicators: 20- and 100-day EMA, MACD (EMA12 - EMA26) and RSI.
# 3. Scales every indicator to a day-over-day ratio, value(t) / value(t-1), so that all
#    tickers end up in a comparable range regardless of their share price.
# 4. Takes the natural log of those ratios.
# 5. Normalizes Open, High, Low and Close by the previous day's Close and takes the log,
#    again so that all tickers share the same range (training stability).
# 6. Normalizes Volume by the previous day's Volume and takes the log.
# 7. Calculates the label (Change): 1 if the Close increases the next day, otherwise 0.
#    The last day has no next-day Close, so that row is dropped instead of being labelled 0.
# 8. Drops every row that contains a NaN and converts the rest to a numpy array.
#
# After the loop the per-ticker arrays are stacked into `ticker_ary`,
# shape (Ntickers, Ndays, Nfeatures):
#   Ntickers  = number of tickers in `repuse_df`
#   Ndays     = rows left per ticker after the NaN rows are dropped
#   Nfeatures = 9 features + 1 label, in this column order:
#               EMA20dif, EMA100dif, RSIdif, MACDdif, Opendif, Highdif, Lowdif,
#               Closedif, Volumedif, Change
# The 'dif' suffix marks the normalized (log-ratio) value (e.g., EMA20dif).
#
# np.log emits a RuntimeWarning and returns NaN when a ratio is negative (e.g. MACD
# changing sign between two days); those rows are removed by dropna().
# NOTE(review): dropna() does not remove +/-inf (e.g. from a zero Volume) - confirm
# whether such rows should be filtered as well.

# Clear any result left over from a previous run so that a failure below cannot be
# mistaken for a fresh result.
ticker_ary = None

# get the tickers for the analysis
tickers = repuse_df.Ticker.values

# One (Ndays, Nfeatures) array per ticker. They are stacked once after the loop, which
# avoids re-copying the growing array on every iteration.
ticker_arrays = []

indicator_cols = ['EMA20', 'EMA100', 'RSI', 'MACD']

# loop through all the tickers
for tick in tickers:
    # get the ticker file
    ticker_df = pd.read_csv(f'tick_data/{tick}.csv')
    ticker_temp2 = ticker_df.copy()

    # calculate the technical indicators
    ticker_temp2['EMA12'] = ticker_temp2['Close'].ewm(span=12, adjust=False).mean()
    ticker_temp2['EMA26'] = ticker_temp2['Close'].ewm(span=26, adjust=False).mean()
    ticker_temp2['EMA20'] = ticker_temp2['Close'].ewm(span=20, adjust=False).mean()
    ticker_temp2['EMA100'] = ticker_temp2['Close'].ewm(span=100, adjust=False).mean()
    ticker_temp2['MACD'] = ticker_temp2['EMA12'] - ticker_temp2['EMA26']
    ticker_temp2['RSI'] = calculate_rsi(ticker_temp2['Close'])

    # normalize the technical indicators: day-over-day ratio, then log
    for col in indicator_cols:
        ticker_temp2[f'{col}dif'] = ticker_temp2[col] / ticker_temp2[col].shift(1)
    ticker_log = np.log(ticker_temp2[[f'{col}dif' for col in indicator_cols]])

    # normalize the share prices and volume (OHLCV)
    prev_close = ticker_temp2['Close'].shift(1)
    for col in ['Open', 'High', 'Low', 'Close']:
        ticker_log[f'{col}dif'] = np.log(ticker_temp2[col] / prev_close)
    ticker_log['Volumedif'] = np.log(ticker_temp2['Volume'] / ticker_temp2['Volume'].shift(1))

    # Calculate the label for each day
    ticker_log['Change'] = ((ticker_temp2['Close'].shift(-1) / ticker_temp2['Close']) > 1) * 1

    # The last row has no next-day close, so its label would be a spurious 0
    # (NaN > 1 is False): drop it, then drop every row that still holds a NaN.
    ticker_log = ticker_log.iloc[:-1].dropna()

    # Convert to a numpy array and keep it for stacking
    ticker_arrays.append(ticker_log.values)

if not ticker_arrays:
    raise ValueError("No tickers selected in repuse_df; nothing to stack.")

# All tickers must keep the same number of rows to form one 3-D array. Fail loudly
# instead of silently discarding already-processed tickers; `ticker_arrays` stays
# available for inspection after the error.
row_counts = pd.Series([arr.shape[0] for arr in ticker_arrays], index=tickers)
if row_counts.nunique() > 1:
    raise ValueError(
        "Tickers have different numbers of rows after dropna(), so they cannot be "
        "stacked into a single (Ntickers, Ndays, Nfeatures) array. "
        f"Row counts range from {row_counts.min()} to {row_counts.max()} "
        f"across {len(row_counts)} tickers; inspect `row_counts` for details."
    )

ticker_ary = np.stack(ticker_arrays, axis=0)


In [6]:
# Example of the data: first rows of `ticker_log`, the feature frame of the last
# ticker processed by the loop above (it is overwritten on every iteration).
# TODO: the features are not standardised yet; a batch normalization will be needed.
ticker_log.head()

Unnamed: 0,EMA20dif,EMA100dif,RSIdif,MACDdif,Opendif,Highdif,Lowdif,Closedif,Volumedif,Change
14,-0.000621,-0.00031,0.111858,-0.045194,7.105406e-08,0.011692,-0.001909,0.008861,-0.556451,0
15,-0.001399,-0.000476,0.007682,0.067569,-0.007272816,-0.002208,-0.01396,-0.008861,0.325819,0
16,-0.001566,-0.000528,-0.007684,0.076056,0.002222592,0.005073,-0.006378,-0.003184,0.427726,1
17,-0.000963,-0.000424,0.011201,-0.009451,0.001022203,0.010555,-0.003137,0.004846,-0.500543,0
18,-0.002857,-0.000823,-0.073517,0.202081,-0.005421521,0.000318,-0.023167,-0.021216,-0.287857,0
