In [21]:
import json

import numpy as np
import pandas as pd
import yfinance as yf

from ray.utils.file_utilities import data_dir, s3_upload

# Key (relative path) of the cleaned CRSP training parquet, presumably in S3.
# A later cell takes the file name from this key and reads that file from data_dir.
train_s3_path = "CRSP/crsp_2018-2023_clean_3.parquet"

In [2]:
# Load the training panel from the local data directory.
# Only the final component of the key is used as the local file name, so the
# lookup does not depend on how many prefix levels the key has.
file_name = train_s3_path.rsplit("/", 1)[-1]
train_path = data_dir.joinpath(file_name).absolute()

train_df = pd.read_parquet(train_path)
# Reshape long -> wide: one row per date, one column per ticker, cells hold "return".
train = train_df.pivot(index="date", columns="ticker", values="return")
print(f"Train data shape: {train.shape}, from {train.index.min()} to {train.index.max()}")

Train data shape: (1509, 585), from 2018-01-02 00:00:00 to 2023-12-29 00:00:00
There are missing values in the training data.


In [4]:
# Show the number of missing returns per ticker, then keep only the tickers
# whose series has no gaps at all.
missing_per_ticker = train.isnull().sum()
if missing_per_ticker.sum() > 0:
    print(missing_per_ticker)
    complete_tickers = train.columns[missing_per_ticker == 0]
    train = train[complete_tickers]

ticker
A         0
AAL       0
AAPL      0
ABBV      0
ABC      85
       ... 
ZBRA      0
ZION      0
ZM      325
ZS       51
ZTS       0
Length: 585, dtype: int64


In [16]:
# Save the column labels of `train` (the tickers) as a JSON array in data_dir.
ticker_list_path = data_dir.joinpath("ticker_list.json")
ticker_list = train.columns.tolist()
with open(ticker_list_path, mode="w") as ticker_file:
    ticker_file.write(json.dumps(ticker_list))

In [17]:
# Download daily prices for the training tickers plus SPY from Yahoo Finance
# and convert the adjusted closes into daily log returns.
test_path = data_dir.joinpath("test.parquet").absolute()
print("Downloading test data from Yahoo Finance...")
test_tickers = ticker_list + ["SPY"]
# auto_adjust=False is passed explicitly: yfinance releases that default to
# auto_adjust=True return no "Adj Close" column, which would make the lookup
# below raise KeyError.
# NOTE(review): no `end` is given, so the downloaded window grows with the run date.
test_df = yf.download(test_tickers, start="2023-12-29", auto_adjust=False)
test_close = pd.DataFrame(test_df["Adj Close"])

# log(P_t / P_{t-1}); the first row has no previous close, so it is all NaN.
test = pd.DataFrame(np.log(test_close / test_close.shift()))
test = test.drop(test.index[0])  # drop the first day with NaN

[*                      2%%                      ]  10 of 483 completed

Downloading test data from Yahoo Finance...


[***********           22%%                      ]  108 of 483 completed

$HCN: possibly delisted; No price data found  (1d 2023-12-29 -> 2024-07-14)


[************          25%%                      ]  123 of 483 completed

$BF: possibly delisted; No price data found  (1d 2023-12-29 -> 2024-07-14)


[********************  41%%                      ]  200 of 483 completed

$BRK: possibly delisted; No price data found  (1d 2023-12-29 -> 2024-07-14)


[*********************100%%**********************]  483 of 483 completed

7 Failed downloads:
['HCN', 'BF', 'BRK']: YFPricesMissingError('$%ticker%: possibly delisted; No price data found  (1d 2023-12-29 -> 2024-07-14)')
['FLT', 'PXD', 'DISH', 'SPLK']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


In [18]:
# Percentage of missing values per column (ticker) of `test`
missing_mask = test.isnull()
null_counts = missing_mask.sum()
total_rows = len(test)
null_percentages = null_counts.div(total_rows).mul(100)
above_one_percent = null_percentages > 1
print(f"Tickers with >1% null: {null_percentages[above_one_percent].index}")

Tickers with >1% null: Index(['BF', 'BRK', 'DISH', 'FLT', 'HCN', 'PXD', 'SPLK', 'WRK'], dtype='object', name='Ticker')


In [19]:
# Keep only the tickers with at most 1% missing values.
test = test[null_percentages[null_percentages <= 1].index]

# Report the kept tickers that still contain NaN before they are zero-filled.
# The upper bound is inclusive so that it matches the filter above; with a
# strict "< 1" a ticker at exactly 1% would be filled without being listed.
print(f"Tickers with <=1% null but none 0: {null_percentages[(null_percentages <= 1)&(null_percentages > 0)].index}")
test = test.fillna(0)
test.to_parquet(data_dir.joinpath("test.parquet").absolute())

Tickers with <1% null but none 0: Index([], dtype='object', name='Ticker')


In [22]:
# Call the project's s3_upload helper with the key "CRSP/test.parquet".
# NOTE(review): presumably this uploads the test.parquet written to data_dir by
# the previous cell — confirm how s3_upload maps the key to a local file.
s3_upload("CRSP/test.parquet")