## Load Data from yfinance, NewsAPI, and Bloomberg

**Load Data (Yahoo Finance)**

In [3]:
import time
from datetime import datetime, timedelta, UTC
from pathlib import Path

import pandas as pd
import yfinance as yf

# Tickers to analyze
TICKERS = ["NKE", "LULU", "ATZ.TO"]
# Per-ticker cap applied when slicing the news list returned by yfinance.
YF_MAX_ITEMS = 10  # yfinance often returns ~10
# Articles whose publish time is older than this many days are filtered out
# of the final dataframe (rows with no parseable publish time are kept).
LOOKBACK_DAYS = 365



def extract_article(item: dict, ticker: str) -> dict:
    """Flatten a yfinance news item into a FinBERT-ready row."""
    content = item.get("content", {}) or {}

    # Published time: providerPublishTime (unix) or content pubDate/displayTime
    published_dt = None
    ts = item.get("providerPublishTime")
    if ts:
        try:
            published_dt = datetime.fromtimestamp(ts, tz=UTC)
        except Exception:
            published_dt = None
    if published_dt is None:
        pub_iso = content.get("pubDate") or content.get("displayTime")
        if pub_iso:
            published_dt = pd.to_datetime(pub_iso, errors="coerce", utc=True)

    link = (
        item.get("link")
        or (item.get("canonicalUrl") or {}).get("url")
        or (item.get("clickThroughUrl") or {}).get("url")
        or (content.get("canonicalUrl") or {}).get("url")
        or (content.get("clickThroughUrl") or {}).get("url")
    )

    return {
        "source": "yfinance",
        "ticker": ticker,
        "title": (content.get("title") or item.get("title") or "").strip(),
        "summary": (content.get("summary") or content.get("description") or item.get("summary") or "").strip(),
        "publisher": (item.get("publisher") or (content.get("provider") or {}).get("displayName") or "").strip(),
        "type": (item.get("type") or content.get("contentType") or "").strip(),
        "link": link,
        "published_utc": published_dt,
        "raw_id": item.get("id") or item.get("uuid"),
    }


# Fetch at most YF_MAX_ITEMS news entries per ticker and flatten each one
# into a row dict via extract_article.
rows = []
for tic in TICKERS:
    # `.news` may be empty/None, so fall back to an empty list before slicing.
    raw_news = (yf.Ticker(tic).news or [])[:YF_MAX_ITEMS]
    rows.extend(extract_article(item, tic) for item in raw_news)
    print(f"{tic}: fetched {len(raw_news)} raw items")
    time.sleep(1)  # one-second pause between tickers (presumably to be gentle on the endpoint)

# Build dataframe
df = pd.DataFrame(rows)
if not df.empty:
    # Normalise to tz-aware UTC timestamps; anything unparseable becomes NaT.
    df["published_utc"] = pd.to_datetime(df["published_utc"], errors="coerce", utc=True)
    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=LOOKBACK_DAYS)
    # Keep rows inside the lookback window, plus rows with no known publish time.
    df = df[df["published_utc"].isna() | (df["published_utc"] >= cutoff)]
    # NOTE(review): de-duplicating on title+link alone keeps a story under only
    # the first ticker it appeared for -- confirm this is intended.
    df = df.drop_duplicates(subset=["title", "link"])
    df = df.sort_values(["ticker", "published_utc"], ascending=[True, False]).reset_index(drop=True)

print(df.head())
if df.empty:
    # describe() raises ValueError on a frame with no columns (which is what
    # pd.DataFrame([]) produces), so skip the summary when nothing was fetched.
    print("No articles available; skipping summary statistics.")
else:
    print()
    print(df.describe(include="all"))


NKE: fetched 10 raw items
LULU: fetched 10 raw items
ATZ.TO: fetched 10 raw items
     source  ticker                                              title  \
0  yfinance  ATZ.TO  The investing winners and losers that made or ...   
1  yfinance  ATZ.TO  TSX Value Picks Including Aritzia And Two Othe...   
2  yfinance  ATZ.TO  Stifel Canada Names Gildan, KITS, and Couche-T...   
3  yfinance  ATZ.TO  3 TSX Growth Stocks With Up To 22% Insider Own...   
4  yfinance  ATZ.TO  Tech Tactics: Aritzia Taps Nedapâ€™s RFID Platfo...   

                                             summary         publisher   type  \
0  Despite turmoil from the trade war, most globa...    Financial Post  STORY   
1  As 2025 draws to a close, the Canadian market ...   Simply Wall St.  STORY   
2  Stifel Canada said Friday its best ideas for C...      MT Newswires  STORY   
3  As we approach the end of 2025, Canadian marke...   Simply Wall St.  STORY   
4  Aritzia utilizes Nedap's RFID platform to stre...  Sourcing Jou

In [4]:
OUTPUT_FILE = "./processed_data/financial_news_dataset.csv"  # optional export

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)

print(f"Data exported to {OUTPUT_FILE}")

Data exported to ./processed_data/financial_news_dataset.csv


**Import Data From NewsAPI**

In [8]:
from newsapi import NewsApiClient
from dotenv import load_dotenv
import os
from pathlib import Path

# Load environment variables (e.g. from a .env file) before the key is read.
load_dotenv()

# Search terms per company. Each list is joined with " OR " into a single
# NewsAPI query; the dict key is stored as the article's `keyword` label.
keywords = {
    "nike": ["nke","nike","Nike","NIKE","NKE"],
    "atz": ["ATZ", "atz", "Aritzia","aritzia"],
    "lulu": ["lulu", "LULU","lululemon","Lululemon"]
}

# Earliest publication date requested from NewsAPI (sent as `from_param`).
DATE = '2025-12-01'

api_key = os.getenv('NEWS_API_KEY')
if not api_key:
    # Fail here with a clear message instead of building a client with
    # api_key=None and only failing later on the first request.
    raise RuntimeError(
        "NEWS_API_KEY is not set; export it or add it to your .env file."
    )

# Init
newsapi = NewsApiClient(api_key=api_key)

**Export NewsAPI articles to CSV**


In [7]:
import pandas as pd
from pathlib import Path

def fetch_news_articles(keywords, start_date, client=None):
    """Fetch NewsAPI articles for each keyword group and return them as one frame.

    One request is issued per entry in ``keywords`` (first page only, up to
    100 articles, English, sorted by relevancy); the group's search terms are
    combined with ``" OR "``.

    Args:
        keywords: Mapping of label -> list of search terms. The label is
            stored in the ``keyword`` column of every article it produced.
        start_date: Earliest publication date, forwarded as ``from_param``.
        client: Object exposing ``get_everything(...)``. Defaults to the
            module-level ``newsapi`` client when omitted.

    Returns:
        A DataFrame with the columns ``keyword``, ``source``, ``author``,
        ``title``, ``description``, ``url``, ``urlToImage``, ``publishedAt``
        and ``content``. The columns are present even when no article was
        returned, so downstream column access does not fail on an empty result.
    """
    api = newsapi if client is None else client
    columns = [
        "keyword", "source", "author", "title", "description",
        "url", "urlToImage", "publishedAt", "content",
    ]

    rows = []
    for kw, words in keywords.items():
        # Single request per stock (no pagination)
        query = " OR ".join(words)
        resp = api.get_everything(
            q=query,
            from_param=start_date,
            language="en",
            sort_by="relevancy",
            page=1,
            page_size=100
        )
        # `or []` also covers a response whose "articles" value is None.
        for art in resp.get("articles") or []:
            rows.append({
                "keyword": kw,
                # "source" is a nested object; tolerate it being null.
                "source": (art.get("source") or {}).get("name"),
                "author": art.get("author"),
                "title": art.get("title"),
                "description": art.get("description"),
                "url": art.get("url"),
                "urlToImage": art.get("urlToImage"),
                "publishedAt": art.get("publishedAt"),
                "content": art.get("content"),
            })
    # Passing `columns` fixes the schema so an empty result still has headers.
    return pd.DataFrame(rows, columns=columns)

# Pull the articles, then parse `publishedAt` into tz-aware UTC timestamps
# (unparseable values become NaT).
df_news = fetch_news_articles(keywords, DATE).assign(
    publishedAt=lambda frame: pd.to_datetime(
        frame["publishedAt"], errors="coerce", utc=True
    )
)

# Ensure the output folder exists, then write the CSV without the index column.
output_path = "processed_data/newsapi_articles.csv"
Path("processed_data").mkdir(parents=True, exist_ok=True)
df_news.to_csv(output_path, index=False)

# Report the row count and show a small preview.
print(f"Saved {len(df_news)} rows to {output_path}")
print(df_news.head())


Saved 228 rows to processed_data/newsapi_articles.csv
  keyword            source                         author  \
0    nike         MacRumors               Hartley Charlton   
1    nike  Business Insider              Julia Pugachevsky   
2    nike  Business Insider                  Lara O'Reilly   
3    nike  Business Insider                   Mary Hanbury   
4    nike          BBC News  Russell Fuller, Amy Lofthouse   

                                               title  \
0  Apple CEO Tim Cook Buys $3 Million of Nike Shares   
1  Meet Gen Z's latest obsession: A sneaker so bo...   
2  Tim Cook just gave Nike a much-needed holiday ...   
3  Nike is struggling to stay culturally relevant...   
4  WTA signs with Mercedes in 'most significant d...   

                                         description  \
0  Apple CEO Tim Cook disclosed a roughly $3 mill...   
1  Asics Novablasts have become popular among Gen...   
2  Apple CEO Tim Cook bought $3 million in Nike s...   
3  Nike was 

**Analyzing Equity Research Reports from Bloomberg**

In [None]:
"""
This step is implemented in pdf_section_extractor.py, not in this notebook.

That script pulls the thesis/growth/risk/valuation/earnings blocks using simple
heading heuristics, then optionally chunks the text for BERT-friendly input.
"""


**SEC + TSEC Filings**

In [None]:
from secedgar import filings, FilingType

# Values shared by both downloads below.
SEC_TICKERS = ["nke", "lulu"]  # Nike and Lululemon
SEC_USER_AGENT = "Simon Kurono (simonkurono@gmail.com)"

# 8K filings for Nike and Lululemon (tickers "nke" and "lulu")
my_filings_8k = filings(cik_lookup=SEC_TICKERS,
                        filing_type=FilingType.FILING_8K,
                        user_agent=SEC_USER_AGENT)

my_filings_8k.save('./nlp/raw_data/sec_filings_8k')

# 10Q filings for Nike and Lululemon (tickers "nke" and "lulu")
# Stored under its own name so the 8-K object above is not overwritten by a
# value of a different filing type.
my_filings_10q = filings(cik_lookup=SEC_TICKERS,
                         filing_type=FilingType.FILING_10Q,
                         user_agent=SEC_USER_AGENT)

my_filings_10q.save('./nlp/raw_data/sec_filings_10q')

