This notebook demonstrates how to obtain the data for the 2018-2024 period deposited at Mendeley Data (https://data.mendeley.com/datasets/czwwfgcgz7/1). The data for the other period are obtained in the same way.

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from ta import add_all_ta_features
import time

%run utils.py

# Download data

In [None]:
# Load the WRDS ticker universe: one symbol per line, blank lines skipped.
with open("2019-2024-tics-WRDS.txt", "r") as f:
    tickers = [tic.strip() for tic in f if tic.strip()]

# Tickers whose download fails under their WRDS symbol. The list is defined
# explicitly here so that this cell works on a fresh kernel instead of relying
# on a `failed_tics` variable left over from a later cell.
known_failed_tics = ['BAC.PL', 'BF.B', 'BIG', 'BRK.A', 'BRK.B', 'DCPH', 'EBIXQ', 'ETRN', 'EURN', 'EVBG', 'GPS', 'GTHX', 'HA', 'HEI.A',
                     'HIBB', 'LGF.A', 'LSXMA', 'LSXMK', 'MOG.A', 'NSTGQ', 'PBR.A', 'PNM', 'PRFT', 'SIX', 'SLCA', 'SPWR', 'SWN', 'TUP',
                     'TWOU', 'UCBI', 'VGR', 'WIRE', 'WRK']

# Share-class symbols from the list above, re-added with "-" in place of "."
# (presumably the spelling Yahoo Finance expects -- confirm if the list changes).
yahoo_share_class_tics = ['BAC-PL', 'BF-B', 'BRK-A', 'BRK-B', 'HEI-A', 'LGF-A', 'MOG-A', 'PBR-A']

# Filter with a set instead of list.remove(): removal is O(1) per ticker and a
# symbol that is absent from the input file no longer raises ValueError.
excluded_tics = set(known_failed_tics)
tickers = [tic for tic in tickers if tic not in excluded_tics] + yahoo_share_class_tics
print(f"There are {len(tickers)} tickers")

There are 2270 tickers


Yahoo Finance (accessed through yfinance) accepts at most 2,000 requests per hour per IP address (reference: https://stackoverflow.com/questions/5888662/does-yahoo-finance-have-data-request-upper-limit-is-there-an-alternative-or-wor/32913242#32913242), so the download loops below pause between requests.

In [None]:
# Download daily price history for every ticker and tag each row with the
# ticker's sector. Per-ticker frames are collected in a list and concatenated
# once at the end, which avoids re-copying the growing frame on every iteration.
ticker_frames = []
# Tickers for which no sector could be retrieved (their rows get sector 'other').
missing_sector = []

for ticker in tickers:
    tic_data = yf.download(ticker, start="2018-01-03", end="2024-12-06", group_by='ticker')
    # Move the ticker level of the column MultiIndex into the rows so that every
    # row carries its own Date and Ticker after reset_index().
    tic_data = tic_data.stack(level=0, future_stack=True).reset_index()
    # Sleep for 3 seconds so there should be no more than 1200 requests per hour
    # NOTE(review): each iteration also performs an `.info` lookup below, so the
    # effective request rate may be higher than 1200/hour -- confirm.
    time.sleep(3)

    try:
        # Sector information; the lookup itself is inside the try so that a
        # failed request for one ticker does not abort the whole download.
        tic_data['sector'] = yf.Ticker(ticker).info['sector']
    except Exception:
        tic_data['sector'] = 'other'
        # Record the ticker only when the sector really is unavailable.
        missing_sector.append(ticker)
        print(f'{ticker} has missing sector')
    ticker_frames.append(tic_data)

data = pd.concat(ticker_frames) if ticker_frames else pd.DataFrame()

In [None]:
# Keep track of failed requests in the previous cell
failed_tics = set(tickers) - set(data.Ticker.unique())

# Upper bound on retry rounds: a ticker that never returns any rows would
# otherwise keep this loop running forever.
MAX_RETRY_ROUNDS = 5
retry_round = 0

while failed_tics and retry_round < MAX_RETRY_ROUNDS:
    retry_round += 1
    retry_frames = []
    # sorted() makes the retry order reproducible (set iteration order is not).
    for ticker in sorted(failed_tics):
        tic_data = yf.download(ticker, start="2018-01-03", end="2024-12-06", group_by='ticker')
        tic_data = tic_data.stack(level=0, future_stack=True).reset_index()
        # Same throttling as the main download loop.
        time.sleep(3)

        try:
            # Sector information; the lookup is inside the try so that one
            # failed request does not abort the retry round.
            tic_data['sector'] = yf.Ticker(ticker).info['sector']
        except Exception:
            tic_data['sector'] = 'other'
            # Record the ticker only when the sector is unavailable, and only once
            # even if it is retried in several rounds.
            if ticker not in missing_sector:
                missing_sector.append(ticker)
            print(f'{ticker} has missing info')
        retry_frames.append(tic_data)

    other_data = pd.concat(retry_frames) if retry_frames else pd.DataFrame()
    data = pd.concat([data, other_data])
    # A ticker counts as recovered once at least one of its rows is in `data`.
    failed_tics = set(tickers) - set(data.Ticker.unique())

if failed_tics:
    print(f'Giving up on {len(failed_tics)} tickers after {MAX_RETRY_ROUNDS} retry rounds: {sorted(failed_tics)}')

In [None]:
# Map the downloaded column labels onto the short names used in the rest of the notebook.
column_names = dict(
    Date='datadate',
    Ticker='tic',
    Open='prcod',
    High='prchd',
    Low='prcld',
    Close='prccd',
    Volume='cshtrd',
)
# Columns kept after renaming, in their final order.
kept_columns = list(column_names.values()) + ['sector']

data = (
    data
    .rename(columns=column_names)
    [kept_columns]
    .reset_index(drop=True)
)
# Clear the name left on the column index.
data.columns.name = None

In [None]:
# AAPL's dates are compared with the dates of the whole panel, and the number
# of distinct AAPL dates becomes the reference timestamp count.
aapl_datadates = data.loc[data['tic'] == 'AAPL', 'datadate']

total_dates = set(data['datadate'].unique())
apple_dates = set(aapl_datadates.unique())
print(len(apple_dates))
# Number of dates that appear for some ticker but not for AAPL.
print(len(total_dates.difference(apple_dates)))

num_of_ts = aapl_datadates.nunique()
print(f'There are {num_of_ts} timestamps')

In [None]:
# `remove_tic` comes from utils.py (not shown here); presumably it drops tickers
# that do not have all `num_of_ts` dates -- confirm against utils.py.
data = remove_tic(data, num_of_ts)

# Sanity check: the panel must be rectangular, i.e. every remaining ticker
# contributes exactly `num_of_ts` rows.
tickers = list(data['tic'].unique())
assert len(data) == len(tickers) * num_of_ts

# Threshold is 10 million; the helper is defined in utils.py (not shown here).
data = remove_low_dollar_vol(data, dol_vol_thres=10_000_000)
data

# Save tickers
# with open("2018-2024-tics-yh.txt", "w") as f:
#     for tic in tickers:
#         f.write(f"{tic}\n")

# Process Data

In [None]:
# Define TBill path (for calculating risk-free rates; optional).
# This path is handed to `compute_ret` in the next cell.
# NOTE(review): the path is relative -- presumably resolved against the
# notebook's working directory; confirm the CSV is shipped alongside it.
TBill_path = '1yearTBill_all_times.csv'

In [None]:
# Make sure the date column is a real datetime before the helpers from utils.py use it.
data['datadate'] = pd.to_datetime(data['datadate'])
# The helpers below are defined in utils.py (not shown here); the comments only
# restate what their names and call signatures suggest.
data = compute_ret(data, TBill_path)
data = remove_dead_stocks(data)

data = assign_class_labels(data, 'fixed_size')
# Make sector column
data, num_of_tokens, num_to_sector_dict = make_sector_column(data)
# Create dictionary that associate each ticker with a numerical label and vice versa for easier reference
num_to_tic_dict, tic_to_num_dict = num_tic_dicts(data)

# Use logical `not` rather than bitwise `~`: on a plain Python bool `~` yields
# -1 / -2 instead of False / True (and is deprecated since Python 3.12).
print(f'Confirm that data has no NAs: {not data.isna().any().any()}')

In [None]:
# Base price/volume columns handed to `feature_engineer` (defined in utils.py,
# not shown here), which returns the updated frame and the updated factor list.
factors = ['cshtrd', 'prccd', 'prchd', 'prcld', 'prcod', 'dol_vol']

data, factors = feature_engineer(data, factors)
# Logical `not` instead of bitwise `~`: on a plain Python bool `~` yields
# -1 / -2 instead of False / True (and is deprecated since Python 3.12).
print(f'Confirm that feature engineering did not create NaNs: {not data.isna().any().any()}')
print(f'All factors: {factors}')

# The panel must still be rectangular: one row per (date, ticker) pair.
assert data.shape[0] == data.datadate.nunique() * data.tic.nunique()

# `all_features_ta` is defined in utils.py (not shown here); presumably it adds
# the technical-analysis columns selected below -- confirm against utils.py.
data = all_features_ta(data)
print(f'Confirm that data has no NaNs: {not data.isna().any().any()}')

TA_factors = [# Momentum indicators
              'momentum_stoch_rsi', 'momentum_stoch', 'momentum_ao', 'momentum_pvo', 'momentum_kama', 'momentum_wr',
              # Volume indicators
              'volume_adi', 'volume_em', 'volume_fi', 'volume_cmf', 'volume_vpt',
              # Volatility indicators
              'volatility_atr', 'volatility_bbh', 'volatility_dcw', 'volatility_ui',
              # Trend indicators
              'trend_adx', 'trend_aroon_up', 'trend_aroon_down', 'trend_ichimoku_a',
              # Other indicators
              'others_dr'
]

factors.extend(TA_factors)
print(factors)

# Keep identifiers, all factors, and the return / label / sector columns, in that order.
data = data[['datadate', 'tic'] + factors + ['ret_d', 'TBill1y', 'rel_ret_d', 'DistinctRank', 'rank', 'sector']]
data

In [None]:
# Save data
# data.to_csv('YH_processed_20180103-20241206.csv', index=False)