In [55]:
import pandas as pd
import yfinance as yf
import json
from tqdm import tqdm

In [92]:
# Location of the scraped news stories.
# NOTE(review): the extension says CSV but the content is parsed as JSON below —
# confirm the file really holds a JSON array of story objects.
news_data_file_path = 'stock-news-dataset.csv'

# Read the whole file and decode it as JSON in one go.
with open(news_data_file_path, "r") as f:
    news_data = json.loads(f.read())

# The first table on the Wikipedia page is used as the constituents list;
# only its "Symbol" column is needed here.
SP500_WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
sp500_data = pd.read_html(SP500_WIKI_URL)[0]
sp500_tickers = sp500_data['Symbol'].tolist()

# Output layout: four story-metadata columns followed by one column per ticker.
columns = ["timestamp", "url", "title", "description", *sp500_tickers]

# Column name -> position in a row list, so rows can be filled by name.
column_idx = {name: position for position, name in enumerate(columns)}

# Build one row per news story: the story metadata plus, for every S&P 500
# ticker the story mentions, the percentage price change between the first
# hourly "Open" and the last hourly "Close" in a window around the story.

HOUR = 60 * 60                    # seconds; assumes "unix_timestamp" is epoch seconds — TODO confirm
LOOKBACK = 2 * HOUR               # window starts this long before the story
LOOKAHEAD = 8 * HOUR              # initial window end after the story
MAX_WINDOW_EXTENSIONS = 24 * 7    # grow the window by at most one week, one hour at a time
MIN_BARS = 3                      # stop extending once more than 2 hourly bars are available

# Set membership is O(1); the list lookup was a linear scan per ticker per story.
sp500_ticker_set = set(sp500_tickers)

dataset = []

for story in tqdm(news_data):
    row = [None] * len(columns)
    timestamp = story["unix_timestamp"]
    row[column_idx["timestamp"]] = timestamp
    row[column_idx["url"]] = story["url"]
    row[column_idx["title"]] = story["title"]
    row[column_idx["description"]] = story["description"]

    # `or []` also covers keys that are present but hold null.
    mentioned = (story.get("tickers_direct") or []) + (story.get("tickers_indirect") or [])
    # Keep S&P 500 members only; dict.fromkeys drops duplicates while preserving
    # order so a ticker listed as both direct and indirect is fetched once.
    tickers = list(dict.fromkeys(
        t.upper() for t in mentioned if t.upper() in sp500_ticker_set
    ))

    for ticker in tickers:
        # Wikipedia writes class shares with a dot (BRK.B) while Yahoo Finance
        # uses a dash (BRK-B); the dataset column keeps the Wikipedia spelling.
        yfTicker = yf.Ticker(ticker.replace(".", "-"))
        start = timestamp - LOOKBACK
        end = timestamp + LOOKAHEAD
        hist = yfTicker.history(start=start, end=end, interval='1h')

        # The original condition was inverted: it extended the window only when
        # enough bars were already present. Extend while there are too FEW bars
        # (e.g. the story was published on a weekend or outside market hours).
        for _ in range(MAX_WINDOW_EXTENSIONS):
            if len(hist) >= MIN_BARS:
                break
            end += HOUR
            hist = yfTicker.history(start=start, end=end, interval='1h')

        # No price data at all (delisted symbol, data outside the provider's
        # intraday retention, ...): leave the cell empty instead of crashing.
        if hist.empty:
            continue

        initial_price = hist["Open"].values[0]
        final_price = hist["Close"].values[-1]
        row[column_idx[ticker]] = (final_price - initial_price) / initial_price * 100

    dataset.append(row)

df = pd.DataFrame(dataset, columns=columns)
df.to_csv("dataset-one-hot-encode.csv", index=False)
df

100%|██████████| 10/10 [00:00<00:00, 133.16it/s]


Unnamed: 0,timestamp,url,title,description,MMM,AOS,ABT,ABBV,ACN,ADBE,...,WMB,WTW,GWW,WYNN,XEL,XYL,YUM,ZBRA,ZBH,ZTS
0,1700818652,https://www.marketbeat.com/instant-alerts/nyse...,Principal Financial Group Inc. Lowers Stock Po...,Principal Financial Group Inc. trimmed its sta...,,,,,,,...,,,,,,,,,,
1,1700818640,https://www.openpr.com/news/3301865/ewing-s-sa...,"Ewing's Sarcoma Market Size, Trends, Industry ...",Market Overview: The Ewing's sarcoma market is...,,,,,,,...,,,,,,,,,,
2,1700818620,https://www.prnewswire.com/news-releases/globa...,Global CEOs Converge with United Nations Ambas...,Top business leaders will be interviewed durin...,,,,,,,...,,,,,,,,,,
3,1700818618,https://forums.collectors.com/discussion/10978...,Counterfeit Morgan dollars - 11/24/23 - Please...,Multiple counterfeit 1878-S Morgan dollars fro...,,,,,,,...,,,,,,,,,,
4,1700818590,https://www.rapidtvnews.com/2023112464421/worl...,World Rugby reaches fans beyond broadcast deal...,The RugbyPass TV streaming service – a partner...,,,,,,,...,,,,,,,,,,
5,1700818574,https://www.openpr.com/news/3301861/images-con...,Images Content Moderation Solution Market Size...,Images Content Moderation Solution replace wit...,,,,,,,...,,,,,,,,,,
6,1700818516,https://www.androidpolice.com/linkedin-ai-prof...,LinkedIn: How to create an AI profile photo,A professional photo is essential for job hunt...,,,,,,,...,,,,,,,,,,
7,1700818512,https://defence-blog.com/mbda-kai-alliance-wea...,"MBDA, KAI sign missile collaboration agreement",European missile maker MBDA and Korean Aerospa...,,,,,,,...,,,,,,,,,,
8,1700818458,https://retailtimes.co.uk/jd-announces-its-hug...,JD announces its huge Black Friday Sale with u...,Retail Times publishes international & UK reta...,,,,,,,...,,,,,,,,,,
9,1700818439,https://www.mobileworldlive.com/ai-cloud/the-f...,The Friday File: 5 days that shook OpenAI and ...,Mobile World Live brings you our top three pic...,,,,,,,...,,,,,,,,,,
