In [17]:
import torch
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm
import random
import tldextract
from collections import Counter

news_data_file_path = 'TickerTick-stock-news-dataset.2023-11-23.json'

sp500_data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = sp500_data['Symbol'].tolist()

ignored_tickers = ["BRK.B"]
sp500_tickers = [ticker for ticker in sp500_tickers if ticker not in ignored_tickers]


HOUR = 60 * 60

device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
device

'mps'

In [24]:
domains = []

with open(news_data_file_path, "r") as f:
    while (line := f.readline()):
        story = json.loads(line)
        if "url" in story:
            url = story["url"]
            domain = tldextract.extract(url).domain
            domains.append(domain)

domain_counts = Counter(domains)
sorted_domains = sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)

with open("domains.txt", "w") as f:
    for domain, count in sorted_domains:
        f.write(f"Domain: {domain}, Count: {count}\n")

In [47]:
import pandas_market_calendars as mcal
from datetime import timedelta

def get_percent_change(ticker, publish_date):
    # print(publish_date)
    nyse = mcal.get_calendar('NYSE')
    yf_ticker = yf.Ticker(ticker)

    # Get the trading days within a range
    trading_days = nyse.valid_days(start_date=publish_date - timedelta(days=5), end_date=publish_date + timedelta(days=5))
    # print(trading_days)

    # Check if publish_date is a trading day
    if publish_date.date() in trading_days:
        # Get the market close time
        market_close = publish_date.replace(hour=16, minute=0, second=0)

        if publish_date <= market_close:
            # Published before market close
            previous_day = trading_days[trading_days.get_loc(publish_date) - 1]
            current_day = trading_days[trading_days.get_loc(publish_date)]
        else:
            # Published after market close
            previous_day = trading_days[trading_days.get_loc(publish_date)]
            current_day = trading_days[trading_days.get_loc(publish_date) + 1]
    else:
        # Not published on a trading day
        previous_day = trading_days[trading_days.asof(publish_date)]
        current_day = trading_days[trading_days.searchsort(publish_date)]

    # Fetch historical data
    hist = yf_ticker.history(start=previous_day, end=current_day + timedelta(days=1))

    # Calculate price change
    percent_change = (hist['Close'].iloc[-1] - hist['Close'].iloc[0]) / hist['Close'].iloc[0] * 100

    return percent_change


In [49]:
import pytz

# Create a timezone object for Eastern Time
eastern = pytz.timezone('US/Eastern')

columns = ["datetime", "url", "title", "description", "ticker", "change_pct"]
column_idx = {col: i for i, col in enumerate(columns)}

dataset = []
errors = []
pbar = tqdm()
limit_rem = None

with open(news_data_file_path, "r") as f:
    while (line := f.readline()):
    # for line in tqdm(f):
        try:
            story = json.loads(line)
            if "description" not in story:
                continue

            timestamp_int = story["unix_timestamp"]
            timestamp = pd.Timestamp(timestamp_int, unit='s')
            timestamp_utc = timestamp.tz_localize('UTC')
            timestamp_eastern = timestamp.astimezone(eastern)

            row = [None] * len(columns)
            # row[column_idx["datetime"]] = timestamp.strftime('%a %d %b %Y, %I:%M%p')
            row[column_idx["datetime"]] = timestamp_eastern.strftime('%a %d %b %Y, %I:%M%p')
            print('hi')
            # print(row[column_idx["datetime"]])
            row[column_idx["url"]] = story["url"]
            # print(story["url"])
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in sp500_tickers]

            if len(tickers) == 0:
                continue

            for ticker in tickers:
                # yfTicker = yf.Ticker(ticker)
                # hist = yfTicker.history(start=timestamp_int-24*2*HOUR, end=timestamp_int+24*2*HOUR, interval='1h', raise_errors=True)
                # if len(hist) < 12:
                #     hist = yfTicker.history(start=timestamp_int-24*7*HOUR, end=timestamp_int+24*7*HOUR, interval='1h', raise_errors=True)
                # hist_times = hist.index.values.astype('int64') // 1e9

                # i = 0
                # while i+2 < len(hist_times) and hist_times[i+1] < timestamp_int:
                #     i += 1
                # init_idx = i
                
                # i += 1
                # while i < len(hist_times) and hist_times[i] - 3*HOUR  < timestamp_int:
                #     i += 1
                # final_idx = i

                # init_price = hist.iloc[init_idx]["Open"]
                # final_price = hist.iloc[final_idx]["Close"]
                # change_pct = (final_price - init_price) / init_price * 100

                change_pct = get_percent_change(ticker, timestamp)
                
                row_copy = row.copy()
                row_copy[column_idx["ticker"]] = ticker
                row_copy[column_idx["change_pct"]] = change_pct
                print(row_copy)
                dataset.append(row_copy)

        except Exception as e:
            errors.append((e, story["id"]))
            print(e)

        jump = random.randint(1, 10000)
        for _ in range(jump-1):
            next(f, None)
        pbar.update(jump)
        # pbar.update(1)

        # if limit_rem is not None:
        #     limit_rem -= 1
        #     if limit_rem <= 0:
        #         break

pbar.close()
print(f"{len(errors)} errors: {errors}")
    
df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset.csv", index=False)
df

6090704it [00:11, 550961.14it/s]


Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l


2534915it [00:03, 651648.48it/s]

Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l

[A

Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l



Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to localize
Cannot convert tz-naive Timestamp, use tz_localize to l

KeyboardInterrupt: 

In [None]:
# One hot encoding

columns = ["timestamp", "url", "title", "description"] + sp500_tickers
column_idx = {col: i for i, col in enumerate(columns)}

dataset = []
errors = []
pbar = tqdm()
limit_rem = None

with open(news_data_file_path, "r") as f:
    while (line := f.readline()):
    # for line in tqdm(f):
        try:
            story = json.loads(line)
            if "description" not in story:
                continue

            row = [None] * len(columns)
            timestamp_int = story["unix_timestamp"]
            row[column_idx["timestamp"]] = timestamp_int
            row[column_idx["url"]] = story["url"]
            row[column_idx["title"]] = story["title"]
            row[column_idx["description"]] = story["description"]

            tickers = [T for t in story.get("tickers_direct", []) + story.get("tickers_indirect", []) if (T := t.upper()) in sp500_tickers]

            if len(tickers) == 0:
                continue

            for ticker in tickers:
                yfTicker = yf.Ticker(ticker)
                hist = yfTicker.history(start=timestamp_int-24*2*HOUR, end=timestamp_int+24*2*HOUR, interval='1h', raise_errors=True)
                if len(hist) < 12:
                    hist = yfTicker.history(start=timestamp_int-24*7*HOUR, end=timestamp_int+24*7*HOUR, interval='1h', raise_errors=True)
                hist_times = hist.index.values.astype('int64') // 1e9

                i = 0
                while i+2 < len(hist_times) and hist_times[i+1] < timestamp_int:
                    i += 1
                init_idx = i
                
                i += 1
                while i < len(hist_times) and hist_times[i] - 3*HOUR  < timestamp_int:
                    i += 1
                final_idx = i

                init_price = hist.iloc[init_idx]["Open"]
                final_price = hist.iloc[final_idx]["Close"]
                change_pct = (final_price - init_price) / init_price * 100
                row[column_idx[ticker]] = change_pct

            dataset.append(row)

        except Exception as e:
            errors.append((e, story["id"]))

        jump = random.randint(1, 10000)
        for _ in range(jump-1):
            next(f)
        pbar.update(jump)
        # pbar.update(1)

        # if limit_rem is not None:
        #     limit_rem -= 1
        #     if limit_rem <= 0:
        #         break

pbar.close()
print(f"{len(errors)} errors: {errors}")
    
df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset-one-hot-enc.csv", index=False)
df