In [1]:
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import concurrent.futures
from urllib.parse import urlparse
import bisect
import time
from requests import Session
from requests_cache import CacheMixin, SQLiteCache
from requests_ratelimiter import LimiterMixin, MemoryQueueBucket
from pyrate_limiter import Duration, RequestRate, Limiter

class CachedLimiterSession(CacheMixin, LimiterMixin, Session):
    pass

HOUR = 60 * 60
DAY = HOUR * 24
YF_HISTORY_LIMIT = 730 * DAY      # limit by the yfinance package

news_data_file_path = 'TickerTick-stock-news-dataset.2023-11-23.json'

sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = set(sp500_data['Symbol'].tolist())

sp100_data = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')[2]
sp100_tickers = set(sp100_data['Symbol'].tolist())

In [2]:
# https://stackoverflow.com/questions/5543651/computing-standard-deviation-in-a-stream
class OnlineStats:
    def __init__(self, iterable=None, ddof=1):
        self.ddof, self.n, self.mean, self.M2 = ddof, 0, 0.0, 0.0
        if iterable is not None:
            for datum in iterable:
                self.include(datum)

    def include(self, datum):
        self.n += 1
        self.delta = datum - self.mean
        self.mean += self.delta / self.n
        self.M2 += self.delta * (datum - self.mean)

    def variance(self):
        return self.M2 / (self.n - self.ddof)

In [3]:
from collections import defaultdict

hostname_desc_stats_map = defaultdict(OnlineStats)

with open('TickerTick-stock-news-dataset.2023-11-23.json', "r") as f:
    pbar = tqdm()
    while (line := f.readline()):
        story = json.loads(line)
        if "url" in story:
            url = story["url"]
            hostname = urlparse(url).hostname
            hostname_desc_stats_map[hostname].include(len(story["description"]) if "description" in story else 0)
        pbar.update(1)
    pbar.close()

len(hostname_desc_stats_map)

9867339it [02:35, 63630.18it/s]


32479

In [4]:
domain_desc_stats = []

for hostname, stats in hostname_desc_stats_map.items():
    if stats.n > 1000:
        domain_desc_stats.append({"domain": hostname, "n": stats.n, "mean": stats.mean, "std": stats.variance() ** 0.5})

desc_stats_df = pd.DataFrame(domain_desc_stats)
desc_stats_df = desc_stats_df.sort_values(by="n", ascending=False)
desc_stats_df

Unnamed: 0,domain,n,mean,std
30,www.tickerreport.com,782351,355.701036,20.414411
137,finance.yahoo.com,762210,234.421086,342.909783
35,seekingalpha.com,446532,0.030849,2.914709
29,www.etfdailynews.com,263621,354.313712,20.617613
28,www.reddit.com,229657,791.079379,1187.396293
...,...,...,...,...
652,news.cgtn.com,1023,5203.140762,1989.244992
176,www.gq.com,1019,95.922473,52.830705
618,www.datadoghq.com,1018,178.916503,270.155249
381,www.anandtech.com,1008,1510.179563,1832.775524


In [5]:
# Filter the original dataset to only include Yahoo Finance stories

yf_news_stories = []
with open(news_data_file_path, "r") as f:
   pbar = tqdm()
   while (line := f.readline()):
      story = json.loads(line)
      if "url" in story and urlparse(story["url"]).hostname == "finance.yahoo.com":
         yf_news_stories.append(story)
      pbar.update(1)
   pbar.close()

9867339it [02:26, 67162.58it/s] 


In [6]:
session = CachedLimiterSession(
    limiter=Limiter(RequestRate(2, Duration.SECOND*5)),  # max 2 requests per 5 seconds
    bucket_class=MemoryQueueBucket,
    backend=SQLiteCache("yfinance.cache"),
)
# session = requests_cache.CachedSession('yfinance.cache')
session.headers['User-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'

query_startime = int(time.time()) // DAY * DAY  - (YF_HISTORY_LIMIT - DAY)

for story in yf_news_stories:
    if "unix_timestamp" in story and type(story["unix_timestamp"]) == int:
        query_endtime = story["unix_timestamp"] + 5 * DAY
        break
else:
    raise Exception("Data appears to be empty or missing unix_timestamp field.")

print(f"Data query range: ({pd.Timestamp(query_startime, unit='s', tz='US/Eastern')}, {pd.Timestamp(query_endtime, unit='s', tz='US/Eastern')})")

ticker_history = {}
ticker_info = {}
fetch_errors = []

for ticker in tqdm(sp500_tickers):
    for retry in range(4):
      try:
          yfTicker = yf.Ticker(ticker, session=session)
          ticker_info[ticker] = yfTicker.info
          hist_df = yfTicker.history(start=query_startime, end=query_endtime, interval='1h', raise_errors=True)
          hist_df['ts_int'] = hist_df.index.values.astype('int64') // 1e9
          ticker_history[ticker] = hist_df
          break
      except Exception as e:
          fetch_errors.append(e)
    else:
      print(f"Skipping {ticker}")

print(f"Fetch errors: {fetch_errors}")
len(ticker_history), len(ticker_info)

Data query range: (2022-05-03 20:00:00-04:00, 2023-11-29 01:45:18-05:00)


  0%|          | 0/503 [00:00<?, ?it/s]



 19%|█▉        | 95/503 [00:05<00:54,  7.47it/s]

Skipping SOLV


 40%|████      | 202/503 [00:10<00:33,  9.02it/s]

Skipping BRK.B
Skipping CPAY


 64%|██████▍   | 323/503 [00:14<00:05, 31.68it/s]

Skipping BF.B


 66%|██████▌   | 331/503 [00:15<00:12, 13.27it/s]

Skipping GEV
Skipping DAY


100%|██████████| 503/503 [00:20<00:00, 24.21it/s]

Fetch errors: [Exception("SOLV: Data doesn't exist for startDate = 1651622400, endDate = 1701240318"), Exception("SOLV: Data doesn't exist for startDate = 1651622400, endDate = 1701240318"), Exception("SOLV: Data doesn't exist for startDate = 1651622400, endDate = 1701240318"), Exception("SOLV: Data doesn't exist for startDate = 1651622400, endDate = 1701240318"), Exception('BRK.B: No timezone found, symbol may be delisted'), Exception('BRK.B: No timezone found, symbol may be delisted'), Exception('BRK.B: No timezone found, symbol may be delisted'), Exception('BRK.B: No timezone found, symbol may be delisted'), Exception('CPAY: No price data found, symbol may be delisted (1h 1651622400 -> 1701240318)'), Exception('CPAY: No price data found, symbol may be delisted (1h 1651622400 -> 1701240318)'), Exception('CPAY: No price data found, symbol may be delisted (1h 1651622400 -> 1701240318)'), Exception('CPAY: No price data found, symbol may be delisted (1h 1651622400 -> 1701240318)'), Excep




(497, 503)

In [7]:
columns = ["datetime", "title", "description", "ticker", "company", "sector", "industry", "change_pct"]
column_idx = {col: i for i, col in enumerate(columns)}

def process_valid_story(story, tickers):
    rows = []

    timestamp_int = story["unix_timestamp"]
    timestamp = pd.Timestamp(timestamp_int, unit='s', tz='US/Eastern')

    row = [None] * len(columns)
    row[column_idx["datetime"]] = timestamp
    row[column_idx["title"]] = story["title"]
    row[column_idx["description"]] = story["description"] if "description" in story else ""

    for ticker in tickers:
        hist_df = ticker_history[ticker]
        info = ticker_info[ticker]

        ts_ints = hist_df["ts_int"].values
        init_ts_int = timestamp_int - 2 * HOUR
        final_ts_int = timestamp_int + 6 * HOUR

        init_idx = bisect.bisect_right(ts_ints, init_ts_int) - 1                 # bisect_right - 1: rightmost value less than or equal to x
        final_idx = bisect.bisect_left(ts_ints, final_ts_int, lo=init_idx+1)     # bisect_left: leftmost item greater than or equal to x
        
        if init_idx == 0 and ts_ints[init_idx] > init_ts_int:
            continue

        init_price = hist_df.iloc[init_idx]["Close"]
        final_price = hist_df.iloc[final_idx]["Close"]
        change_pct = (final_price - init_price) / init_price * 100
        
        row_copy = row.copy()
        row_copy[column_idx["ticker"]] = ticker
        row_copy[column_idx["company"]] = info["longName"]
        row_copy[column_idx["sector"]] = info["sector"]
        row_copy[column_idx["industry"]] = info["industry"]
        row_copy[column_idx["change_pct"]] = change_pct
        rows.append(row_copy)

    return rows

In [8]:
dataset = []
parse_errors = []
future_story_map = {}
now = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    for story in yf_news_stories:
        tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in ticker_history]
        if len(tickers) == 0:
            continue

        if "unix_timestamp" not in story and type(story["unix_timestamp"]) != int:
            continue

        if now - story["unix_timestamp"] >= (YF_HISTORY_LIMIT - DAY):
            break

        if "description" in story and len(story["description"]) > 0:
            future_story_map[executor.submit(process_valid_story, story, tickers)] = story
    
len(future_story_map)

180775

In [9]:
for future in tqdm(concurrent.futures.as_completed(future_story_map.keys()), total=len(future_story_map)):
    try:
        rows = future.result()
        dataset.extend(rows)
            
    except Exception as e:
        story = future_story_map[future]
        if "id" in story:
            parse_errors.append((story["id"], e))

print(f"{len(parse_errors)} parse errors: {parse_errors}")

df = pd.DataFrame(dataset, columns=columns)
df.sort_values(by="datetime", inplace=True, ascending=True)
df.reset_index(drop=True, inplace=True)
df['datetime'] = df['datetime'].dt.strftime('%a %d %b %Y, %I:%M%p')
df

100%|██████████| 180775/180775 [00:00<00:00, 317759.22it/s]


0 parse errors: []


Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct
0,"Wed 04 May 2022, 12:00AM",BlackRock targets ‘industrial renaissance’ wit...,The two iShares ETFs are the manager’s first r...,BLK,"BlackRock, Inc.",Financial Services,Asset Management,-13.482159
1,"Wed 04 May 2022, 12:00AM",Big 5 Sporting Goods (BGFV) Q1 2022 Earnings C...,Image source: The Motley Fool. Big 5 Sporting ...,NDAQ,"Nasdaq, Inc.",Financial Services,Financial Data & Stock Exchanges,-6.146185
2,"Wed 04 May 2022, 12:00AM",Advanced Micro Devices (AMD) Q1 2022 Earnings ...,It's now my pleasure to turn the call over to ...,AMD,"Advanced Micro Devices, Inc.",Technology,Semiconductors,-23.202964
3,"Wed 04 May 2022, 12:05AM",Elon Musk Has a Fascinating Idea to Make Money...,Elon Musk hears criticism and never hesitates ...,TSLA,"Tesla, Inc.",Consumer Cyclical,Auto Manufacturers,20.651853
4,"Wed 04 May 2022, 12:30AM","Airbnb, Inc. (ABNB) Q1 2022 Earnings Call Tran...","Good afternoon, and thank you for joining Airb...",ABNB,"Airbnb, Inc.",Consumer Cyclical,Travel Services,18.279398
...,...,...,...,...,...,...,...,...
227502,"Thu 23 Nov 2023, 04:47PM",Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers - General,0.323559
227503,"Thu 23 Nov 2023, 05:40PM",Crews Extinguish Fire After CSX Train Derailme...,Hundreds of emergency workers in Kentucky put ...,CSX,CSX Corporation,Industrials,Railroads,-0.199597
227504,"Thu 23 Nov 2023, 05:43PM",UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.825589
227505,"Thu 23 Nov 2023, 07:00PM",10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.592448


In [10]:
df.to_csv("dataset.csv", index=False)