In [None]:
import torch
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import random

# Newline-delimited JSON dump of news stories that the cells below read line by line.
news_data_file_path = 'TickerTick-stock-news-dataset.2023-11-23.json'

# First table on the Wikipedia page; its 'Symbol' column supplies the ticker universe.
sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Symbols deliberately left out of the ticker universe.
ignored_tickers = ["BRK.B"]
sp500_tickers = sp500_data.loc[~sp500_data['Symbol'].isin(ignored_tickers), 'Symbol'].tolist()


# Seconds per hour; timestamps in this notebook are handled as unix seconds.
HOUR = 3600

# Pick the torch backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

In [None]:
# Build a long-format dataset: one row per (sampled news story, S&P 500 ticker),
# labelled with the ticker's percentage price change around the story's timestamp.
# The news file is randomly sub-sampled (1..90 lines consumed per processed story)
# because every ticker lookup is a network call to Yahoo Finance.
columns = ["datetime", "url", "title", "description", "ticker", "change_pct"]
column_idx = {col: i for i, col in enumerate(columns)}

dataset = []
errors = []       # (exception, story id or None) for every story that failed
limit_rem = None  # set to an int to stop after that many processed stories

# Set membership is O(1); scanning the list would be repeated for every ticker of every story.
sp500_ticker_set = set(sp500_tickers)

# The context managers close both the progress bar and the file even if the loop raises.
with tqdm() as pbar, open(news_data_file_path, "r") as f:
    l = 0  # number of lines consumed from the file so far
    while (line := f.readline()):
        # Reset per line so the error handler can never report a previous story's id.
        story = None
        try:
            story = json.loads(line)
            if "description" not in story:
                # Unusable story: try the very next line, but still count this one.
                l += 1
                pbar.update(1)
                continue

            # NOTE(review): "unix_timestamp" is treated as seconds since the epoch -- confirm.
            timestamp_int = story["unix_timestamp"]
            timestamp = pd.Timestamp(timestamp_int, unit='s')

            row = [None] * len(columns)
            row[column_idx["datetime"]] = timestamp.strftime('%a %d %b %Y, %I:%M%p')
            row[column_idx["url"]] = story["url"]
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            # Keep only the mentioned tickers (direct first, then indirect) that are in the S&P 500 list.
            mentioned = story.get("tickers_direct", []) + story.get("tickers_indirect", [])
            tickers = [t.upper() for t in mentioned if t.upper() in sp500_ticker_set]

            if len(tickers) == 0:
                # No ticker of interest: try the very next line, but still count this one.
                l += 1
                pbar.update(1)
                continue

            for ticker in tickers:
                yfTicker = yf.Ticker(ticker)
                # Hourly bars for +/-2 days around the story; widen to +/-7 days when
                # fewer than 12 bars come back.
                hist = yfTicker.history(start=timestamp_int - 2 * 24 * HOUR, end=timestamp_int + 2 * 24 * HOUR, interval='1h', raise_errors=True)
                if len(hist) < 12:
                    hist = yfTicker.history(start=timestamp_int - 7 * 24 * HOUR, end=timestamp_int + 7 * 24 * HOUR, interval='1h', raise_errors=True)
                # Bar start times as unix seconds.
                # NOTE(review): assumes the index has nanosecond resolution -- confirm.
                hist_times = hist.index.values.astype('int64') // 1e9
                n_bars = len(hist_times)

                # Reference bar: advance while the *next* bar still starts before the story,
                # always leaving at least one bar after it.
                init_idx = 0
                while init_idx + 2 < n_bars and hist_times[init_idx + 1] < timestamp_int:
                    init_idx += 1

                # Outcome bar: first later bar that starts at least 3 hours after the story.
                final_idx = init_idx + 1
                while final_idx < n_bars and hist_times[final_idx] - 3 * HOUR < timestamp_int:
                    final_idx += 1
                if final_idx >= n_bars:
                    # Covers an empty history too; previously this surfaced as an opaque
                    # out-of-bounds IndexError from .iloc.
                    raise ValueError(
                        f"{ticker}: no hourly bar starting at least 3 hours after "
                        f"timestamp {timestamp_int} ({n_bars} bars fetched)"
                    )

                init_price = hist.iloc[init_idx]["Open"]
                final_price = hist.iloc[final_idx]["Close"]
                change_pct = (final_price - init_price) / init_price * 100

                row_copy = row.copy()
                row_copy[column_idx["ticker"]] = ticker
                row_copy[column_idx["change_pct"]] = change_pct
                dataset.append(row_copy)

        except Exception as e:
            # Best effort: log and keep going. `story` may be None (invalid JSON), not a
            # dict, or lack an "id", so the id is looked up defensively.
            story_id = story.get("id") if isinstance(story, dict) else None
            errors.append((e, story_id))

        # Random sub-sampling: consume `jump` lines in total (this one plus jump-1 skipped).
        # readline() returns '' at EOF instead of raising, so running off the end of the
        # file no longer aborts the cell before the CSV is written.
        jump = random.randint(1, 90)
        consumed = 1
        for _ in range(jump - 1):
            if not f.readline():
                break
            consumed += 1
        l += consumed
        pbar.update(consumed)

        if limit_rem is not None:
            limit_rem -= 1
            if limit_rem <= 0:
                break

print(f"{len(errors)} errors: {errors}")

df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset.csv", index=False)
df

In [None]:
# "One hot" style (wide) encoding: one row per sampled news story with one column per
# S&P 500 ticker. A ticker column holds that ticker's percentage price change around the
# story's timestamp when the story mentions it, and stays None otherwise.
# The news file is randomly sub-sampled (1..90 lines consumed per processed story)
# because every ticker lookup is a network call to Yahoo Finance.

columns = ["timestamp", "url", "title", "description"] + sp500_tickers
column_idx = {col: i for i, col in enumerate(columns)}

dataset = []
errors = []       # (exception, story id or None) for every story that failed
limit_rem = None  # set to an int to stop after that many processed stories

# Set membership is O(1); scanning the list would be repeated for every ticker of every story.
sp500_ticker_set = set(sp500_tickers)

# The context managers close both the progress bar and the file even if the loop raises.
with tqdm() as pbar, open(news_data_file_path, "r") as f:
    l = 0  # number of lines consumed from the file so far
    while (line := f.readline()):
        # Reset per line so the error handler can never report a previous story's id.
        story = None
        try:
            story = json.loads(line)
            if "description" not in story:
                # Unusable story: try the very next line, but still count this one.
                l += 1
                pbar.update(1)
                continue

            row = [None] * len(columns)
            # NOTE(review): "unix_timestamp" is treated as seconds since the epoch -- confirm.
            timestamp_int = story["unix_timestamp"]
            row[column_idx["timestamp"]] = timestamp_int
            row[column_idx["url"]] = story["url"]
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            # Keep only the mentioned tickers (direct first, then indirect) that are in the S&P 500 list.
            mentioned = story.get("tickers_direct", []) + story.get("tickers_indirect", [])
            tickers = [t.upper() for t in mentioned if t.upper() in sp500_ticker_set]

            if len(tickers) == 0:
                # No ticker of interest: try the very next line, but still count this one.
                l += 1
                pbar.update(1)
                continue

            for ticker in tickers:
                yfTicker = yf.Ticker(ticker)
                # Hourly bars for +/-2 days around the story; widen to +/-7 days when
                # fewer than 12 bars come back.
                hist = yfTicker.history(start=timestamp_int - 2 * 24 * HOUR, end=timestamp_int + 2 * 24 * HOUR, interval='1h', raise_errors=True)
                if len(hist) < 12:
                    hist = yfTicker.history(start=timestamp_int - 7 * 24 * HOUR, end=timestamp_int + 7 * 24 * HOUR, interval='1h', raise_errors=True)
                # Bar start times as unix seconds.
                # NOTE(review): assumes the index has nanosecond resolution -- confirm.
                hist_times = hist.index.values.astype('int64') // 1e9
                n_bars = len(hist_times)

                # Reference bar: advance while the *next* bar still starts before the story,
                # always leaving at least one bar after it.
                init_idx = 0
                while init_idx + 2 < n_bars and hist_times[init_idx + 1] < timestamp_int:
                    init_idx += 1

                # Outcome bar: first later bar that starts at least 3 hours after the story.
                final_idx = init_idx + 1
                while final_idx < n_bars and hist_times[final_idx] - 3 * HOUR < timestamp_int:
                    final_idx += 1
                if final_idx >= n_bars:
                    # Covers an empty history too; previously this surfaced as an opaque
                    # out-of-bounds IndexError from .iloc.
                    raise ValueError(
                        f"{ticker}: no hourly bar starting at least 3 hours after "
                        f"timestamp {timestamp_int} ({n_bars} bars fetched)"
                    )

                init_price = hist.iloc[init_idx]["Open"]
                final_price = hist.iloc[final_idx]["Close"]
                change_pct = (final_price - init_price) / init_price * 100
                row[column_idx[ticker]] = change_pct

            # Only stories whose tickers were all priced successfully are kept.
            dataset.append(row)

        except Exception as e:
            # Best effort: log and keep going. `story` may be None (invalid JSON), not a
            # dict, or lack an "id", so the id is looked up defensively.
            story_id = story.get("id") if isinstance(story, dict) else None
            errors.append((e, story_id))

        # Random sub-sampling: consume `jump` lines in total (this one plus jump-1 skipped).
        # readline() returns '' at EOF instead of raising, so running off the end of the
        # file no longer aborts the cell before the CSV is written.
        jump = random.randint(1, 90)
        consumed = 1
        for _ in range(jump - 1):
            if not f.readline():
                break
            consumed += 1
        l += consumed
        pbar.update(consumed)

        if limit_rem is not None:
            limit_rem -= 1
            if limit_rem <= 0:
                break

print(f"{len(errors)} errors: {errors}")

df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset-one-hot-enc.csv", index=False)
df