In [51]:
import torch
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import random
from urllib.parse import urlparse
import bisect

# Path of the TickerTick news dump; the cells below read it line by line.
news_data_file_path = 'TickerTick-stock-news-dataset.2023-11-23.json'

# S&P 500 constituents, taken from the first table on the Wikipedia page.
sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Symbols deliberately excluded from the ticker universe.
ignored_tickers = ["BRK.B"]

# De-duplicate through a set, then sort. Iterating a set of strings directly
# can yield a different order on each kernel start, which made the resulting
# list non-reproducible between runs.
sp500_tickers = sorted(set(sp500_data['Symbol'].tolist()) - set(ignored_tickers))

HOUR = 60 * 60  # seconds in one hour
# Bounds of the price window around a story, in hours relative to its timestamp.
STORY_PERIOD_START_DELTA_HRS = - 1
STORY_PERIOD_END_DELTA_HRS = 5

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

'cuda'

In [65]:
# Schema of the rows collected in `dataset`, in output order.
columns = [
    "datetime",
    "title",
    "description",
    "ticker",
    "company",
    "sector",
    "industry",
    "change_pct",
]
# Column name -> position inside a row list.
column_idx = dict(zip(columns, range(len(columns))))

dataset = []      # collected rows, each a list laid out like `columns`
errors = []       # (exception, story id) pairs recorded while processing
limit_rem = 100   # countdown used by the sampling loop; None disables the limit

def process_valid_story(story, tickers):
    """Append one row per ticker to ``dataset`` with the price move around a story.

    For every ticker, hourly price history is fetched through yfinance. The
    percentage change is measured from the ``Open`` of the first bar that
    starts after ``timestamp + STORY_PERIOD_START_DELTA_HRS`` to the ``Close``
    of the first bar that starts at or after
    ``timestamp + STORY_PERIOD_END_DELTA_HRS``.

    Args:
        story: Parsed story record. ``"unix_timestamp"`` (epoch seconds) and
            ``"title"`` are required; ``"description"`` is optional and
            defaults to an empty string.
        tickers: Iterable of ticker symbols to evaluate for this story.

    Raises:
        IndexError: If the fetched history does not reach far enough to cover
            the price window.
        KeyError: If the story or the ticker metadata lacks a required key.
        Exception: Whatever yfinance raises while downloading
            (``raise_errors=True`` is passed through).

    Side effects:
        Appends to the module-level ``dataset`` list. Rows appended for
        earlier tickers are kept if a later ticker fails.
    """
    timestamp_int = story["unix_timestamp"]
    # The epoch value is rendered as US/Eastern wall-clock time for display.
    timestamp = pd.Timestamp(timestamp_int, unit='s', tz='US/Eastern')
    timestamp_str = timestamp.strftime('%a %d %b %Y, %I:%M%p')

    # Story-level fields shared by every ticker row.
    row = [None] * len(columns)
    row[column_idx["datetime"]] = timestamp_str
    row[column_idx["title"]] = story["title"]
    row[column_idx["description"]] = story.get("description", "")

    # Loop-invariant window bounds (epoch seconds) and minimum bar count.
    window_start = timestamp_int + STORY_PERIOD_START_DELTA_HRS * HOUR
    window_end = timestamp_int + STORY_PERIOD_END_DELTA_HRS * HOUR
    min_bars = STORY_PERIOD_END_DELTA_HRS - STORY_PERIOD_START_DELTA_HRS + 1

    for ticker in tickers:
        yfTicker = yf.Ticker(ticker)
        hist = yfTicker.history(start=timestamp_int-24*2*HOUR, end=timestamp_int+24*2*HOUR, interval='1h', raise_errors=True)
        if len(hist) < min_bars:
            # Too few bars in the +/-2 day range: retry with +/-5 days.
            hist = yfTicker.history(start=timestamp_int-24*5*HOUR, end=timestamp_int+24*5*HOUR, interval='1h', raise_errors=True)

        # Bar start times as epoch seconds.
        # NOTE(review): assumes the index holds nanosecond-resolution datetimes - confirm.
        hist_times = hist.index.values.astype('int64') // 1e9
        init_idx = bisect.bisect_right(hist_times, window_start)
        final_idx = bisect.bisect_left(hist_times, window_end)

        # bisect returns len(hist) when no bar qualifies. Fail with a clear
        # message instead of pandas' generic out-of-bounds IndexError.
        if init_idx >= len(hist) or final_idx >= len(hist):
            raise IndexError(
                f"price history for {ticker} ({len(hist)} bars) does not cover "
                f"the window [{window_start}, {window_end}] around story time {timestamp_int}"
            )

        init_price = hist.iloc[init_idx]["Open"]
        final_price = hist.iloc[final_idx]["Close"]
        change_pct = (final_price - init_price) / init_price * 100

        # Read the metadata mapping once and reuse it for all three fields.
        info = yfTicker.info

        row_copy = row.copy()
        row_copy[column_idx["ticker"]] = ticker
        row_copy[column_idx["company"]] = info["longName"]
        row_copy[column_idx["sector"]] = info["sector"]
        row_copy[column_idx["industry"]] = info["industry"]
        row_copy[column_idx["change_pct"]] = change_pct
        dataset.append(row_copy)


# Randomly sample stories from the news dump and turn the usable ones into
# dataset rows. Both the file and the progress bar are context-managed so they
# are released even if the loop is interrupted.
with open(news_data_file_path, "r") as f, tqdm() as pbar:
    while (line := f.readline()):
        pbar.update(1)

        # Reset per iteration so the error handler never reports the id of a
        # previous story (or hits a NameError) when parsing this line fails.
        story = None
        try:
            story = json.loads(line)

            # Only Yahoo Finance stories are used.
            url = story["url"]
            if urlparse(url).hostname != "finance.yahoo.com":
                continue

            # Keep the mentioned tickers (direct and indirect) that are in the S&P 500 list.
            tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in sp500_tickers]
            if len(tickers) == 0:
                continue

            process_valid_story(story, tickers)

        except Exception as e:
            # Record the failure and keep going; the id is None when the line
            # could not be parsed or the record carries no "id".
            story_id = story.get("id") if isinstance(story, dict) else None
            errors.append((e, story_id))

        # Jump over a random number of lines so the sample spans the file.
        n_skip = random.randint(1, 100)
        n_skipped = 0
        for _ in range(n_skip):
            if not f.readline():  # empty string means end of file
                break
            n_skipped += 1
        pbar.update(n_skipped)
        if n_skipped < n_skip:
            break

        # Optional cap on the number of sampled stories (None = unlimited).
        if limit_rem is not None:
            limit_rem -= 1
            if limit_rem <= 0:
                break

print(f"{len(errors)} errors: {errors}")

df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset.csv", index=False)
df

299190it [11:13, 444.22it/s]
12835it [00:10, 1213.18it/s]

0 errors: []





Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct
0,"Fri 24 Nov 2023, 12:00AM",OpenAI turmoil exposes threat to Microsoft’s i...,Microsoft chief executive Satya Nadella’s deci...,MSFT,Microsoft Corporation,Technology,Software—Infrastructure,-0.355126
1,"Thu 23 Nov 2023, 07:00PM",10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.262256
2,"Thu 23 Nov 2023, 05:43PM",UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.812065
3,"Thu 23 Nov 2023, 04:47PM",Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers—General,-0.281406
4,"Thu 23 Nov 2023, 04:20PM",Algoma Steel Announces Retirement of Vice Pres...,"SAULT STE. MARIE, Ontario, Nov. 23, 2023 (GLOB...",NDAQ,"Nasdaq, Inc.",Financial Services,Financial Data & Stock Exchanges,0.117837
...,...,...,...,...,...,...,...,...
117,"Wed 22 Nov 2023, 02:19PM",It's Probably Less Likely That Campbell Soup C...,Key Insights Campbell Soup will host its Annua...,CPB,Campbell Soup Company,Consumer Defensive,Packaged Foods,0.807838
118,"Wed 22 Nov 2023, 02:19PM",It's Probably Less Likely That Campbell Soup C...,Key Insights Campbell Soup will host its Annua...,ICE,"Intercontinental Exchange, Inc.",Financial Services,Financial Data & Stock Exchanges,-0.484082
119,"Wed 22 Nov 2023, 02:11PM",Boeing Stock Rises After FAA Clears 737 MAX 10...,Boeing stock was rising on Wednesday after the...,BA,The Boeing Company,Industrials,Aerospace & Defense,0.520960
120,"Wed 22 Nov 2023, 02:01PM",Here's Why We're A Bit Worried About NGM Bioph...,There's no doubt that money can be made by own...,NDAQ,"Nasdaq, Inc.",Financial Services,Financial Data & Stock Exchanges,0.308779


In [57]:
# https://stackoverflow.com/questions/5543651/computing-standard-deviation-in-a-stream

class OnlineStats:
    """Running mean and variance of a stream of numbers (Welford's update).

    Values are folded in one at a time, so the whole stream never has to be
    held in memory.

    Attributes:
        ddof: Delta degrees of freedom; the variance divisor is ``n - ddof``.
        n: Number of values included so far.
        mean: Running mean of the included values.
        M2: Running sum of squared deviations from the mean.
    """

    def __init__(self, iterable=None, ddof=1):
        """Create an empty accumulator, optionally seeded from ``iterable``.

        Args:
            iterable: Optional iterable of numbers to include immediately.
            ddof: Delta degrees of freedom used by ``variance``.
        """
        self.ddof, self.n, self.mean, self.M2 = ddof, 0, 0.0, 0.0
        if iterable is not None:
            for datum in iterable:
                self.include(datum)

    def include(self, datum):
        """Fold one more value into the running statistics."""
        self.n += 1
        # Deviation from the mean *before* this value is included; kept on the
        # instance, as in the original, in case other code reads it.
        self.delta = datum - self.mean
        self.mean += self.delta / self.n
        # Uses the old deviation times the new deviation.
        self.M2 += self.delta * (datum - self.mean)

    def variance(self):
        """Return the variance of the values seen so far.

        Returns:
            ``M2 / (n - ddof)``, or NaN when ``n <= ddof`` because the
            variance is undefined with that few observations.
        """
        dof = self.n - self.ddof
        if dof <= 0:
            return float("nan")
        return self.M2 / dof
    
from collections import defaultdict

# Running description-length statistics, keyed by the hostname of each story URL.
domain_desc_stats_map = defaultdict(OnlineStats)

with open(news_data_file_path, "r") as f:
    pbar = tqdm()
    # readline() returns "" only at end of file, which stops the iteration.
    for line in iter(f.readline, ""):
        story = json.loads(line)
        if "url" in story:
            url = story["url"]
            domain = urlparse(url).hostname
            # A story without a description counts as length 0.
            if "description" in story:
                desc_len = len(story["description"])
            else:
                desc_len = 0
            domain_desc_stats_map[domain].include(desc_len)
        pbar.update(1)
    pbar.close()

len(domain_desc_stats_map)

9867339it [02:37, 62648.72it/s]


32479

In [59]:
# One summary record per domain, restricted to domains with more than 1000
# stories.
domain_desc_stats = [
    {"domain": domain, "n": stats.n, "mean": stats.mean, "std": stats.variance() ** 0.5}
    for domain, stats in domain_desc_stats_map.items()
    if stats.n > 1000
]

# Naming the columns explicitly keeps the frame well-formed when no domain
# passes the filter; otherwise sort_values(by="n") would raise a KeyError on
# a column-less frame.
desc_stats_df = pd.DataFrame(domain_desc_stats, columns=["domain", "n", "mean", "std"])
desc_stats_df = desc_stats_df.sort_values(by="n", ascending=False)
desc_stats_df

Unnamed: 0,domain,n,mean,std
30,www.tickerreport.com,782351,355.701036,20.414411
137,finance.yahoo.com,762210,234.421086,342.909783
35,seekingalpha.com,446532,0.030849,2.914709
29,www.etfdailynews.com,263621,354.313712,20.617613
28,www.reddit.com,229657,791.079379,1187.396293
...,...,...,...,...
652,news.cgtn.com,1023,5203.140762,1989.244992
176,www.gq.com,1019,95.922473,52.830705
618,www.datadoghq.com,1018,178.916503,270.155249
381,www.anandtech.com,1008,1510.179563,1832.775524
