In [2]:
import torch
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import random

device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

'cuda'

In [3]:
news_data_file_path = 'TickerTick-stock-news-dataset.2023-11-23.json'

sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = sp500_data['Symbol'].tolist()

columns = ["timestamp", "url", "title", "description"] + sp500_tickers
column_idx = {col: i for i, col in enumerate(columns)}

HOUR = 60 * 60

In [37]:
dataset = []
errors = []
pbar = tqdm()
limit_rem = 10000

with open(news_data_file_path, "r") as f:
    l = 0
    while (line := f.readline()):
    # for line in tqdm(f):
        try:
            story = json.loads(line)
            if "description" not in story:
                continue

            row = [None] * len(columns)
            timestamp = story["unix_timestamp"] - 60*60*16
            row[column_idx["timestamp"]] = timestamp
            row[column_idx["url"]] = story["url"]
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in sp500_tickers]

            if len(tickers) == 0:
                continue

            for ticker in tickers:
                yfTicker = yf.Ticker(ticker)
                hist = yfTicker.history(start=timestamp-24*2*HOUR, end=timestamp+24*2*HOUR, interval='1h', raise_errors=True)
                if len(hist) < 12:
                    hist = yfTicker.history(start=timestamp-24*7*HOUR, end=timestamp+24*7*HOUR, interval='1h', raise_errors=True)
                hist_times = hist.index.values.astype('int64') // 1e9

                i = 0
                while i+2 < len(hist_times) and hist_times[i+1] < timestamp:
                    i += 1
                init_idx = i
                
                i += 1
                while i < len(hist_times) and hist_times[i] - 3*HOUR  < timestamp:
                    i += 1
                final_idx = i

                init_price = hist.iloc[init_idx]["Open"]
                final_price = hist.iloc[final_idx]["Close"]
                change_pct = (final_price - init_price) / init_price * 100
                row[column_idx[ticker]] = change_pct

            dataset.append(row)

        except Exception as e:
            errors.append((e, story["id"]))

        # tqdm stuff to keep track of progress
        l += (jump := random.randint(1, 90))
        pbar.update(jump)

        if limit_rem is not None:
            limit_rem -= 1
            if limit_rem <= 0:
                break

pbar.close()
print(f"{len(errors)} errors: {errors}")
    
df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset-one-hot-enc.csv", index=False)
df

0it [00:00, ?it/s]

460000it [07:32, 1017.61it/s]


36 errors: [(Exception('BRK.B: No timezone found, symbol may be delisted'), '-2146268629891649901'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-2876896092783107358'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-5602237677975798316'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '8253319829633350123'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-6056503350793612874'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '8058031600686423630'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '826602545318281906'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-1801203470565253377'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-4811615795256040667'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-7972420577751134448'), (Exception('BRK.B: No timezone found, symbol may be delisted'), '-3466383762346591439'), (Exception('B

Unnamed: 0,timestamp,url,title,description,MMM,AOS,ABT,ABBV,ACN,ADBE,...,WMB,WTW,GWW,WYNN,XEL,XYL,YUM,ZBRA,ZBH,ZTS
0,1700761052,https://www.marketbeat.com/instant-alerts/nyse...,Principal Financial Group Inc. Lowers Stock Po...,Principal Financial Group Inc. trimmed its sta...,,,,,,,...,,,,,,,,,,
1,1700760974,https://www.openpr.com/news/3301861/images-con...,Images Content Moderation Solution Market Size...,Images Content Moderation Solution replace wit...,,,,,,,...,,,,,,,,,,
2,1700760916,https://www.androidpolice.com/linkedin-ai-prof...,LinkedIn: How to create an AI profile photo,A professional photo is essential for job hunt...,,,,,,,...,,,,,,,,,,
3,1700760858,https://retailtimes.co.uk/jd-announces-its-hug...,JD announces its huge Black Friday Sale with u...,Retail Times publishes international & UK reta...,,,,,,,...,,,,,,,,,,
4,1700760780,https://www.globenewswire.com/en/news-release/...,Commercial Aircraft Global Market Outlook 2023...,"Dublin, Nov. 24, 2023 (GLOBE NEWSWIRE) -- The ...",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9959,1700546344,https://news.google.com/rss/articles/CBMicGh0d...,Nvidia Earnings Soar Past Analyst Predictions ...,Nvidia Earnings Soar Past Analyst Predictions ...,,,,,,,...,,,,,,,,,,
9960,1700546340,https://finance.yahoo.com/m/b02bc82e-703d-39ba...,Nvidia Calls the Shots After the Bell,Response to the quarterly report is likely to ...,,,,,,,...,,,,,,,,,,
9961,1700546339,https://www.mercurynews.com/2023/11/21/police-...,Police shoot Oakland man who allegedly tried t...,The incident happened Sunday morning at a Best...,,,,,,,...,,,,,,,,,,
9962,1700546282,https://www.marketscreener.com/quote/stock/AUT...,Autodesk Raises Full-Year Guidance After 3Q Re...,(marketscreener.com) By Ben Glickman Autodesk ...,,,,,,,...,,,,,,,,,,
