## Load Data from yfinance, NewsAPI, Bloomberg, and SEC Filings

**Load Data (Yahoo Finance)**

In [1]:
import hashlib
import time
from datetime import datetime, timedelta, UTC
from pathlib import Path

import pandas as pd
import yfinance as yf

# Canonical column names, in output order. make_row() returns a dict with
# exactly these keys, and the DataFrame built further down in this cell is
# reindexed against this list.
CANONICAL_FIELDS = [
    "id",
    "report_id",
    "ticker",
    "company",
    "date",
    "source",
    "doc_type",
    "item",
    "section_type",
    "section_heading",
    "chunk_index",
    "page_start",
    "page_end",
    "text",
    "source_file",
]


def make_row(
    *,
    report_id,
    text,
    chunk_index,
    source,
    source_file,
    ticker=None,
    company=None,
    date=None,
    doc_type=None,
    item=None,
    section_type=None,
    section_heading=None,
    page_start=None,
    page_end=None,
):
    """Build one record (a plain dict) in the canonical column schema.

    All arguments are keyword-only. Optional descriptive fields that are
    missing or falsy are stored as an empty string.

    Args:
        report_id: Identifier of the parent document; also the prefix of ``id``.
        text: Body text; ``None`` becomes ``""`` and surrounding whitespace is
            stripped.
        chunk_index: Position of this chunk within the document; the suffix
            of ``id``.
        source: Name of the data source.
        source_file: File path or URL the record came from.
        ticker, company, date, doc_type, item, section_type, section_heading:
            Optional metadata, stored as ``""`` when not provided.
        page_start, page_end: Optional page numbers. Only ``None`` is replaced
            by ``""``, so a page number of ``0`` is preserved.

    Returns:
        dict keyed by the canonical field names, with ``id`` set to
        ``"{report_id}-{chunk_index}"``.
    """

    def _blank_if_none(value):
        # `value or ""` would also discard 0, which is a valid page number.
        return "" if value is None else value

    return {
        "id": f"{report_id}-{chunk_index}",
        "report_id": report_id,
        "ticker": ticker or "",
        "company": company or "",
        "date": date or "",
        "source": source,
        "doc_type": doc_type or "",
        "item": item or "",
        "section_type": section_type or "",
        "section_heading": section_heading or "",
        "chunk_index": chunk_index,
        "page_start": _blank_if_none(page_start),
        "page_end": _blank_if_none(page_end),
        "text": (text or "").strip(),
        "source_file": source_file,
    }

# Tickers to analyze
TICKERS = ["NKE", "LULU", "ATZ.TO"]
# Upper bound on news items kept per ticker (the fetch loop slices to this length).
YF_MAX_ITEMS = 10  # yfinance often returns ~10
# Articles whose parsed date is older than this many days are filtered out.
LOOKBACK_DAYS = 365


def extract_article(item: dict, ticker: str) -> dict:
    """Convert one raw yfinance news item into a canonical row.

    Each field is looked up on the item itself and/or under its ``content``
    sub-dict, so either placement of a key is accepted.

    Args:
        item: One entry of ``yf.Ticker(...).news``.
        ticker: Symbol the item was fetched for; stored in the row and used
            as the prefix of a generated ``report_id``.

    Returns:
        dict produced by ``make_row`` with ``source="yfinance"``,
        ``chunk_index=0`` and ``section_type="news"``. ``date`` is the string
        form of the publish time, or ``""`` when none could be parsed.
    """
    content = item.get("content", {}) or {}

    # Published time: providerPublishTime (unix) or content pubDate/displayTime
    published_dt = None
    ts = item.get("providerPublishTime")
    if ts:
        try:
            published_dt = datetime.fromtimestamp(ts, tz=UTC)
        except (TypeError, ValueError, OverflowError, OSError):
            # Malformed or out-of-range timestamp: fall through to the ISO fields.
            published_dt = None
    if published_dt is None:
        pub_iso = content.get("pubDate") or content.get("displayTime")
        if pub_iso:
            parsed = pd.to_datetime(pub_iso, errors="coerce", utc=True)
            # errors="coerce" yields NaT for unparseable input; treat that as
            # "no date" instead of emitting the literal string "NaT".
            if not pd.isna(parsed):
                published_dt = parsed

    link = (
        item.get("link")
        or (item.get("canonicalUrl") or {}).get("url")
        or (item.get("clickThroughUrl") or {}).get("url")
        or (content.get("canonicalUrl") or {}).get("url")
        or (content.get("clickThroughUrl") or {}).get("url")
    )
    doc_type = (item.get("type") or content.get("contentType") or "news").strip()
    heading = (content.get("title") or item.get("title") or "").strip()
    text = (content.get("summary") or content.get("description") or item.get("summary") or heading).strip()

    report_id = item.get("id") or item.get("uuid")
    if not report_id:
        # Derive the fallback id from the article itself so it is reproducible
        # across runs and distinct for different articles fetched in the same
        # second. The wall clock is used only when nothing identifies the item.
        basis = link or heading or text
        if basis:
            digest = hashlib.md5(basis.encode("utf-8"), usedforsecurity=False).hexdigest()
            report_id = f"{ticker}-{digest}"
        else:
            report_id = f"{ticker}-{int(time.time())}"

    return make_row(
        report_id=report_id,
        text=text,
        chunk_index=0,
        source="yfinance",
        source_file=link or "",
        ticker=ticker,
        company="",
        date=str(published_dt) if published_dt is not None else "",
        doc_type=doc_type,
        section_type="news",
        section_heading=heading,
        page_start="",
        page_end="",
    )


# Gather canonical rows for every ticker, pausing one second after each request.
rows = []
for tic in TICKERS:
    raw_news = (yf.Ticker(tic).news or [])[:YF_MAX_ITEMS]
    rows += [extract_article(item, tic) for item in raw_news]
    print(f"{tic}: fetched {len(raw_news)} raw items")
    time.sleep(1)

# Build dataframe
if not rows:
    df = pd.DataFrame(columns=CANONICAL_FIELDS)
else:
    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=LOOKBACK_DAYS)
    df = (
        pd.DataFrame(rows)
        .reindex(columns=CANONICAL_FIELDS)
        # Parse dates as UTC; unparseable values become NaT.
        .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce", utc=True))
        # Keep undated rows and rows inside the lookback window.
        .loc[lambda frame: frame["date"].isna() | (frame["date"] >= cutoff)]
        .drop_duplicates(subset=["section_heading", "source_file"])
        .sort_values(["ticker", "date"], ascending=[True, False])
        .reset_index(drop=True)
    )

print(df.head())
print()
print(df.describe(include="all"))


NKE: fetched 10 raw items
LULU: fetched 10 raw items
ATZ.TO: fetched 10 raw items
                                       id  \
0  4f8f8b55-7a15-3280-aefa-2ba5fe1c5d60-0   
1  44127e76-4d51-3341-91f7-75a151c327de-0   
2  d27b1692-49b5-37f9-8f7f-170607a64bc6-0   
3  adf56cf4-fc14-38e2-8b57-4bdc4485cefa-0   
4  45fdbd44-5798-37fc-9b78-dbf1bb62b6f0-0   

                              report_id  ticker company  \
0  4f8f8b55-7a15-3280-aefa-2ba5fe1c5d60  ATZ.TO           
1  44127e76-4d51-3341-91f7-75a151c327de  ATZ.TO           
2  d27b1692-49b5-37f9-8f7f-170607a64bc6  ATZ.TO           
3  adf56cf4-fc14-38e2-8b57-4bdc4485cefa  ATZ.TO           
4  45fdbd44-5798-37fc-9b78-dbf1bb62b6f0  ATZ.TO           

                       date    source doc_type item section_type  \
0 2025-12-22 13:58:54+00:00  yfinance    STORY              news   
1 2025-12-22 12:38:14+00:00  yfinance    STORY              news   
2 2025-12-19 19:23:11+00:00  yfinance    STORY              news   
3 2025-12-12 12:35:5

In [2]:
# Optional export of the Yahoo Finance frame to CSV. The parent directory is
# created first because DataFrame.to_csv does not create missing directories.
OUTPUT_FILE = "./processed_data/financial_news_dataset.csv"  # optional export
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)

print(f"Data exported to {OUTPUT_FILE}")

Data exported to ./processed_data/financial_news_dataset.csv


**Import Data From News API**

In [7]:
from newsapi import NewsApiClient
from dotenv import load_dotenv
import os
from pathlib import Path

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Search terms per company; each list is OR-joined into a single NewsAPI query.
# NOTE(review): "atz.co"/"ATZ.CO" does not match the "ATZ.TO" symbol used in
# the Yahoo Finance cell — confirm which spelling is intended.
keywords = {
    "nike": ["nke","NKE"],
    "atz": ["ATZ", "atz", "atz.co","ATZ.CO"],
    "lulu": ["lulu", "LULU"]
}

# Earliest publication date requested from NewsAPI.
# NOTE(review): fixed date — confirm it is still within the range the API plan allows.
DATE = '2025-12-01'

api_key = os.getenv('NEWS_API_KEY')
if not api_key:
    # Fail here with a clear message instead of on the first API request.
    raise RuntimeError(
        "NEWS_API_KEY is not set; add it to the environment or a .env file before running this cell."
    )

# Init
newsapi = NewsApiClient(api_key=api_key)

**Export NewsAPI articles to CSV**


In [8]:
import pandas as pd
from pathlib import Path
import hashlib

# Canonical column names, in output order, used as the columns of the NewsAPI
# DataFrame built below.
# NOTE(review): this duplicates the identical list defined in the Yahoo Finance
# cell; consider keeping a single definition.
CANONICAL_FIELDS = [
    "id",
    "report_id",
    "ticker",
    "company",
    "date",
    "source",
    "doc_type",
    "item",
    "section_type",
    "section_heading",
    "chunk_index",
    "page_start",
    "page_end",
    "text",
    "source_file",
]


def make_row(
    *,
    report_id,
    text,
    chunk_index,
    source,
    source_file,
    ticker=None,
    company=None,
    date=None,
    doc_type=None,
    item=None,
    section_type=None,
    section_heading=None,
    page_start=None,
    page_end=None,
):
    """Build one record (a plain dict) in the canonical column schema.

    All arguments are keyword-only. Optional descriptive fields that are
    missing or falsy are stored as an empty string.

    Args:
        report_id: Identifier of the parent document; also the prefix of ``id``.
        text: Body text; ``None`` becomes ``""`` and surrounding whitespace is
            stripped.
        chunk_index: Position of this chunk within the document; the suffix
            of ``id``.
        source: Name of the data source.
        source_file: File path or URL the record came from.
        ticker, company, date, doc_type, item, section_type, section_heading:
            Optional metadata, stored as ``""`` when not provided.
        page_start, page_end: Optional page numbers. Only ``None`` is replaced
            by ``""``, so a page number of ``0`` is preserved.

    Returns:
        dict keyed by the canonical field names, with ``id`` set to
        ``"{report_id}-{chunk_index}"``.
    """

    def _blank_if_none(value):
        # `value or ""` would also discard 0, which is a valid page number.
        return "" if value is None else value

    return {
        "id": f"{report_id}-{chunk_index}",
        "report_id": report_id,
        "ticker": ticker or "",
        "company": company or "",
        "date": date or "",
        "source": source,
        "doc_type": doc_type or "",
        "item": item or "",
        "section_type": section_type or "",
        "section_heading": section_heading or "",
        "chunk_index": chunk_index,
        "page_start": _blank_if_none(page_start),
        "page_end": _blank_if_none(page_end),
        "text": (text or "").strip(),
        "source_file": source_file,
    }


def stable_id(text: str, url: str) -> str:
    """Return the MD5 hex digest of ``url``, or of ``text`` when ``url`` is empty.

    If both are empty the digest of the empty string is returned. MD5 is used
    only as a fingerprint here, hence ``usedforsecurity=False``.
    """
    hasher = hashlib.md5(usedforsecurity=False)
    fingerprint_source = url or text or ""
    hasher.update(fingerprint_source.encode("utf-8"))
    return hasher.hexdigest()


def fetch_news_articles(keywords, start_date):
    """Query NewsAPI once per company and return the hits as canonical rows.

    Args:
        keywords: Mapping of company key to a list of search terms; the terms
            are joined with ``" OR "`` into one query per company.
        start_date: Passed to NewsAPI as ``from_param``.

    Returns:
        pandas.DataFrame with the ``CANONICAL_FIELDS`` columns, one row per
        article that has non-empty content, description or title. Only the
        first result page (up to 100 articles) is requested per company.
    """
    # Company key -> ticker label; keys not listed here get an empty ticker.
    ticker_by_company = {"nike": "NKE", "atz": "ATZ", "lulu": "LULU"}

    records = []
    for company_key, search_terms in keywords.items():
        response = newsapi.get_everything(
            q=" OR ".join(search_terms),
            from_param=start_date,
            language="en",
            sort_by="relevancy",
            page=1,
            page_size=100,
        )
        symbol = ticker_by_company.get(company_key, "")

        for article in response.get("articles", []):
            headline = article.get("title") or ""
            # Prefer the article content, then the description, then the title.
            body = (article.get("content") or article.get("description") or headline).strip()
            if body:
                link = article.get("url") or ""
                records.append(
                    make_row(
                        report_id=stable_id(headline, link),
                        text=body,
                        chunk_index=0,
                        source="newsapi",
                        source_file=link,
                        ticker=symbol,
                        company=company_key,
                        date=article.get("publishedAt") or "",
                        doc_type="news",
                        section_type="news",
                        section_heading=headline,
                    )
                )

    return pd.DataFrame(records, columns=CANONICAL_FIELDS)


# Fetch the NewsAPI articles for every configured company.
df_news = fetch_news_articles(keywords=keywords, start_date=DATE)

# Write them to CSV, creating the target directory when it is missing.
output_path = "processed_data/newsapi_articles.csv"
Path(output_path).parent.mkdir(exist_ok=True, parents=True)
df_news.to_csv(output_path, index=False)

print(f"Saved {len(df_news)} rows to {output_path}")
print(df_news.head())


Saved 157 rows to processed_data/newsapi_articles.csv
                                   id                         report_id  \
0  7ac88f377541dfa1a4647f81055416b1-0  7ac88f377541dfa1a4647f81055416b1   
1  d0e260880c6c12f6e784a2c545307eff-0  d0e260880c6c12f6e784a2c545307eff   
2  8298c16f954431dcb0da138467024fdd-0  8298c16f954431dcb0da138467024fdd   
3  7cd991eba9bad7579b584ca9daa2aad3-0  7cd991eba9bad7579b584ca9daa2aad3   
4  777093cbbccf2c9e9cba482a8cadb43d-0  777093cbbccf2c9e9cba482a8cadb43d   

  ticker company                  date   source doc_type item section_type  \
0    NKE    nike  2025-12-05T22:06:20Z  newsapi     news              news   
1    NKE    nike  2025-12-23T12:16:21Z  newsapi     news              news   
2    NKE    nike  2025-12-17T18:47:56Z  newsapi     news              news   
3    NKE    nike  2025-12-25T15:36:47Z  newsapi     news              news   
4    NKE    nike  2025-12-22T17:01:51Z  newsapi     news              news   

                          

**Analyzing Equity Research Reports from Bloomberg**

In [5]:
"""
Done in pdf_section_extractor.py

Pulls thesis/growth/risk/valuation/earnings blocks using simple
heading heuristics, then optionally chunks text for BERT-friendly input.
"""


'\nDone in pdf_section_extractor.py\n\nPulls thesis/growth/risk/valuation/earnings blocks using simple\nheading heuristics, then optionally chunks text for BERT-friendly input.\n'

**SEC + TSEC Filings**

In [6]:
from secedgar import filings, FilingType

# Disabled download snippet: everything below sits inside a bare string
# literal, so none of it executes; the cell only echoes the string as output.
# NOTE(review): the 10-Q part reuses the name `my_filings_8k` for 10-Q
# filings — rename it before re-enabling this code.
"""
# 8K filings for Nike and Lululemon (tickers "nke" and "lulu")
my_filings_8k = filings(cik_lookup=["nke","lulu"],
                     filing_type=FilingType.FILING_8K,
                     user_agent="Simon Kurono (simonkurono@gmail.com)")

my_filings_8k.save('./nlp/raw_data/sec_filings_8k')

# 10Q filings for Nike and Lululemon (tickers "nke" and "lulu")
my_filings_8k = filings(cik_lookup=["nke","lulu"],
                     filing_type=FilingType.FILING_10Q,
                     user_agent="Simon Kurono (simonkurono@gmail.com)")

my_filings_8k.save('./nlp/raw_data/sec_filings_10q')
"""


'\n# 8K filings for Nike and Lululemon (tickers "nke" and "lulu")\nmy_filings_8k = filings(cik_lookup=["nke","lulu"],\n                     filing_type=FilingType.FILING_8K,\n                     user_agent="Simon Kurono (simonkurono@gmail.com)")\n\nmy_filings_8k.save(\'./nlp/raw_data/sec_filings_8k\')\n\n# 10Q filings for Nike and Lululemon (tickers "nke" and "lulu")\nmy_filings_8k = filings(cik_lookup=["nke","lulu"],\n                     filing_type=FilingType.FILING_10Q,\n                     user_agent="Simon Kurono (simonkurono@gmail.com)")\n\nmy_filings_8k.save(\'./nlp/raw_data/sec_filings_10q\')\n'