In [189]:
import torch
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import random

device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

'mps'

In [185]:
news_data_file_path = 'stock-news-dataset-sample.txt'

sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = sp500_data['Symbol'].tolist()

columns = ["timestamp", "url", "title", "description"] + sp500_tickers
column_idx = {col: i for i, col in enumerate(columns)}

HOUR = 60 * 60

In [202]:
dataset = []
errors = []
pbar = tqdm()
limit_rem = 1000

with open(news_data_file_path, "r") as f:
    l = 0
    while (line := f.readline()):
    # for line in tqdm(f):
        try:
            story = json.loads(line)
            row = [None] * len(columns)
            timestamp = story["unix_timestamp"] - 60*60*16
            row[column_idx["timestamp"]] = timestamp
            row[column_idx["url"]] = story["url"]
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in sp500_tickers]

            for ticker in tickers:
                yfTicker = yf.Ticker(ticker)
                hist = yfTicker.history(start=timestamp-24*5*HOUR, end=timestamp+24*5*HOUR, interval='1h')
                hist_times = hist.index.values.astype('int') // 1e9

                i = 0
                while i < len(hist_times) and hist_times[i+1] < timestamp:
                    i += 1
                init_idx = i
                
                i += 1
                while i < len(hist_times) and hist_times[i] - 3*HOUR  < timestamp:
                    i += 1
                final_idx = i

                initial_price = hist.iloc[init_idx]["Open"]
                final_price = hist.iloc[final_idx]["Close"]
                row[column_idx[ticker]] = (final_price - initial_price) / initial_price * 100

            dataset.append(row)

        except Exception as e:
            errors.append(e)

        # tqdm stuff to keep track of progress
        
        l += (jump := random.randint(1, 10))
        pbar.update(jump)

        if limit_rem is not None:
            limit_rem -= 1
            if limit_rem <= 0:
                break

pbar.close()
print(f"Errors: {errors}")
    
df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset-one-hot-enc.csv", index=False)
df

2223it [00:08, 258.82it/s]

Errors: [KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description'), KeyError('description')]





Unnamed: 0,timestamp,url,title,description,MMM,AOS,ABT,ABBV,ACN,ADBE,...,WMB,WTW,GWW,WYNN,XEL,XYL,YUM,ZBRA,ZBH,ZTS
0,1679065123,https://www.etfdailynews.com/2023/03/18/adapti...,Adaptimmune Therapeutics (NASDAQ:ADAP) Coverag...,Equities research analysts at StockNews.com st...,,,,,,,...,,,,,,,,,,
1,1679065121,https://www.etfdailynews.com/2023/03/18/adicet...,Adicet Bio (NASDAQ:ACET) Rating Reiterated by ...,Adicet Bio (NASDAQ:ACET – Get Rating)‘s stock ...,,,,,,,...,,,,,,,,,,
2,1679065121,https://www.tickerreport.com/banking-finance/1...,Abeona Therapeutics (NASDAQ:ABEO) Earns Hold R...,Equities research analysts at StockNews.com be...,,,,,,,...,,,,,,,,,,
3,1679065121,https://www.tickerreport.com/banking-finance/1...,Graybug Vision Stock Set to Reverse Split on M...,"Graybug Vision, Inc. (NASDAQ:GRAY – Get Rating...",,,,,,,...,,,,,,,,,,
4,1679065121,https://www.etfdailynews.com/2023/03/18/arbutu...,Arbutus Biopharma (NASDAQ:ABUS) Coverage Initi...,Equities researchers at StockNews.com began co...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1679060058,https://leadership.ng/terra-unveils-shrimp-sea...,Terra Unveils Shrimp Seasoning Cube,Tropical General Investments (TGI) Group has u...,,,,,,,...,,,,,,,,,,
381,1679060034,https://www.teamblind.com/post/integration-of-...,Integration of ChatGPT with Microsoft’s Azure ...,Do you think that Azure's implementation of a ...,,,,,,,...,,,,,,,,,,
382,1679060034,https://www.teamblind.com/post/is-amazon-going...,Is Amazon going to do more layoff?,"After meta second round of layoff, the next qu...",,,,,,,...,,,,,,,,,,
383,1679060034,https://www.teamblind.com/post/good-time-to-bu...,Good time to buy first republic bank?,,,,,,,,...,,,,,,,,,,


In [183]:
yf.Ticker('MSFT').history(start=1700811374, end=1700836574, interval='1h')


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
# previously used code

            # while timestamp > (init_timestamp := hist.index.values[0].astype('int') // 1e9):
            #     init_timestamp += HOUR
            #     hist = yfTicker.history(start=init_timestamp-24*5*HOUR, end=datetime.fromtimestamp(init_timestamp+24*5*HOUR), interval='1h')

            # start = timestamp - 60*60*2
            # end = timestamp + 60*60*6
            # hist = yfTicker.history(start=start, end=end, interval='1h')

            # for i in range(24*5):
            #     # print(datetime.datetime.fromtimestamp(hist.index.values[0].astype('int') // 1e9), datetime.datetime.fromtimestamp(timestamp))
            #     # print(hist.index.values[0].astype('int') // 1e9, timestamp)
            #     if len(hist) == 0 or hist.index.values[0].astype('int') // 1e9 > timestamp:
            #         start -= 60*60
            #         hist = yfTicker.history(start=start, end=end, interval='1h')
            #         # print(datetime.fromtimestamp(start), hist.index[0] if len(hist) > 0 else None)
            #     else:
            #         break

            # for i in range(24*4):
            #     if len(hist) < 2:
            #         end += 60*60
            #         # print(timestamp, ticker, start, end)
            #         hist = yfTicker.history(start=start, end=end, interval='1h')
            #     else:
            #         break

            # initial_price = hist["Open"].values[0]
            # final_price = hist["Close"].values[-1]