In [1]:
!pip install requests pandas tqdm python-dotenv



In [2]:
import os
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any

import requests
import pandas as pd
from tqdm.notebook import tqdm

# ====== API credentials ======
# The keys are read from environment variables so that no secret is stored in
# the notebook file or its version history. Export FINNHUB_API_KEY and
# NEWS_API_KEY before starting the kernel. A missing variable yields "", which
# the fetch functions treat as "not configured".
# NOTE: any key that was ever pasted into this notebook should be considered
# exposed and be rotated at the provider.
FINNHUB_API_KEY = os.environ.get("FINNHUB_API_KEY", "").strip()
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "").strip()

# Optional: to keep the keys in a local .env file instead, run the two lines
# below before the assignments above (requires python-dotenv):
# from dotenv import load_dotenv
# load_dotenv()


In [3]:
def utc_now() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    current = datetime.now(tz=timezone.utc)
    return current


def cutoff_months_ago(months: int = 6) -> datetime:
    """
    Return the UTC datetime lying `months` months before the current moment.

    A month is approximated as 30 days, so the result is
    `utc_now() - 30 * months` days.
    """
    window = timedelta(days=30 * months)
    return utc_now() - window


In [4]:
# Companies whose news is collected: one {"name", "ticker"} dict per company,
# built from (display name, ticker symbol) pairs.
TARGET_COMPANIES = [
    {"name": company_name, "ticker": company_ticker}
    for company_name, company_ticker in (
        ("NVIDIA", "NVDA"),
        ("Tesla", "TSLA"),
        ("ASML Holdings", "ASML"),
        ("META", "META"),
        ("Amazon", "AMZN"),
    )
]

# Search keywords per ticker. Each keyword is sent to NewsAPI as its own query
# by collect_news_for_ticker; a ticker missing here falls back to the ticker
# symbol itself.
COMPANY_KEYWORDS = dict(
    NVDA=["NVIDIA", "NVDA"],
    TSLA=["Tesla", "TSLA"],
    ASML=["ASML", "ASML Holdings"],
    META=["Meta Platforms", "META", "Facebook"],
    AMZN=["Amazon", "AMZN", "Amazon.com"],
)


In [15]:
def fetch_finnhub_company_news(
    symbol: str,
    months_back: int = 6,
    max_items: int = 200,
) -> List[Dict[str, Any]]:
    """
    Fetch recent company news for one symbol from Finnhub's company-news endpoint.
    https://finnhub.io/docs/api/company-news

    Args:
        symbol: Ticker symbol to query, e.g. "NVDA".
        months_back: Size of the look-back window in months; the window start
            comes from cutoff_months_ago (30 days per month) and the end is
            today's UTC date.
        max_items: Maximum number of entries taken from the start of the
            response list.

    Returns:
        A list of normalized dicts, one per news item. An empty list is
        returned (after printing a message) when the API key is not
        configured, the request fails, the body is not valid JSON, or the
        payload is not a list.
    """
    if not FINNHUB_API_KEY:
        print("WARN: FINNHUB_API_KEY 未配置，Finnhub 不会返回数据。")
        return []

    base_url = "https://finnhub.io/api/v1/company-news"
    params = {
        "symbol": symbol,
        "from": cutoff_months_ago(months_back).date().isoformat(),
        "to": utc_now().date().isoformat(),
    }
    # The key is sent as a header rather than a `token` query parameter:
    # requests puts the full URL into HTTP error messages, and those messages
    # are printed below and saved with the notebook output.
    headers = {"X-Finnhub-Token": FINNHUB_API_KEY}

    try:
        resp = requests.get(base_url, params=params, headers=headers, timeout=20)
        resp.raise_for_status()
        # Decoding stays inside the try: a non-JSON body (e.g. an HTML error
        # page) raises ValueError and must not abort the whole collection run.
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        print(f"[Finnhub] 请求 {symbol} 失败: {e}")
        return []

    if not isinstance(data, list):
        print(f"[Finnhub] 非预期返回格式: {data}")
        return []

    # One collection timestamp for the whole batch.
    collected_at = utc_now().isoformat()
    results: List[Dict[str, Any]] = []

    for item in data[:max_items]:
        if not isinstance(item, dict):
            # Skip malformed entries instead of failing on `.get`.
            continue

        ts = item.get("datetime")  # Unix timestamp in seconds
        pub_dt_iso = None
        if isinstance(ts, (int, float)):
            try:
                # Timezone-aware replacement for the deprecated
                # datetime.utcfromtimestamp().
                pub_dt = datetime.fromtimestamp(ts, tz=timezone.utc)
            except (OverflowError, OSError, ValueError):
                pub_dt = None  # timestamp outside the representable range
            if pub_dt is not None:
                # Drop tzinfo before formatting so the text keeps the
                # "...Z" suffix form instead of "+00:00".
                pub_dt_iso = pub_dt.replace(tzinfo=None).isoformat() + "Z"

        results.append({
            "provider": "finnhub",
            "symbol": symbol,
            "headline": item.get("headline"),
            "summary": item.get("summary"),
            "url": item.get("url"),
            "image": item.get("image"),
            "news_source": item.get("source"),
            "category": item.get("category"),
            "related": item.get("related"),
            "published_at_utc": pub_dt_iso,
            "collected_at_utc": collected_at,
        })

    return results


In [11]:
def fetch_newsapi_for_keyword(
    keyword: str,
    months_back: int = 6,  # capped to 1 month below; kept for a uniform signature
    max_items: int = 50,
) -> List[Dict[str, Any]]:
    """
    Search news for one keyword via NewsAPI's /v2/everything endpoint.

    The free Developer plan only reaches back about one month (with a 24 hour
    delay), so the look-back window is capped at one month regardless of
    `months_back`.

    Args:
        keyword: Search expression sent as the `q` parameter.
        months_back: Requested look-back in months; values above 1 are
            treated as 1.
        max_items: Maximum number of articles to return. The request's
            `pageSize` is limited to 100, the most NewsAPI accepts per page.

    Returns:
        A list of normalized dicts, one per article. An empty list is returned
        when the API key is not configured, `max_items` is not positive, the
        request fails, the body is not valid JSON, or the payload has an
        unexpected shape.
    """
    if not NEWS_API_KEY:
        print("WARN: NEWS_API_KEY 未配置，NewsAPI 不会返回数据。")
        return []
    if max_items <= 0:
        # Nothing was asked for; avoid sending an invalid pageSize.
        return []

    base_url = "https://newsapi.org/v2/everything"
    from_date = cutoff_months_ago(min(months_back, 1)).date().isoformat()

    params = {
        "q": keyword,
        "language": "en",
        "from": from_date,
        "sortBy": "publishedAt",
        "pageSize": min(max_items, 100),
    }
    # The key is sent as a header rather than an `apiKey` query parameter:
    # requests puts the full URL into HTTP error messages, and those messages
    # are printed below and saved with the notebook output.
    headers = {"X-Api-Key": NEWS_API_KEY}

    try:
        resp = requests.get(base_url, params=params, headers=headers, timeout=20)
        resp.raise_for_status()
        # Decoding stays inside the try so a non-JSON body cannot abort the run.
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        print(f"[NewsAPI] 请求 {keyword} 失败: {e}")
        return []

    if not isinstance(data, dict):
        print(f"[NewsAPI] 非预期返回格式: {data}")
        return []

    articles = data.get("articles")
    if not isinstance(articles, list):
        # Covers both a missing key and an explicit `"articles": null`.
        articles = []

    # One collection timestamp for the whole batch.
    collected_at = utc_now().isoformat()
    results: List[Dict[str, Any]] = []

    for art in articles[:max_items]:
        if not isinstance(art, dict):
            continue  # skip malformed entries instead of failing on `.get`
        src = art.get("source") or {}
        results.append({
            "provider": "newsapi",
            "api_source_name": src.get("name"),
            "keyword": keyword,
            "title": art.get("title"),
            "description": art.get("description"),
            "url": art.get("url"),
            "image": art.get("urlToImage"),
            "published_at_utc": art.get("publishedAt"),
            "collected_at_utc": collected_at,
        })

    return results


In [7]:
def collect_news_for_ticker(
    ticker: str,
    months_back: int = 6,
    max_news_per_keyword: int = 50,
    max_finnhub_items: int = 200,
) -> Dict[str, Any]:
    """
    Collect news for one ticker from both providers.

    - Finnhub: company-news, queried by symbol.
    - NewsAPI: one keyword search per entry of COMPANY_KEYWORDS[ticker]
      (falling back to the ticker itself when it has no entry).

    Every item is tagged with "ticker" and "channel" ("news"); Finnhub items
    additionally get "title" copied from "headline" so both providers expose
    the same field. An item whose (provider, url) pair was already collected
    for this ticker is dropped, because several keywords of one company tend
    to return the same article. Items without a URL are always kept.

    Args:
        ticker: Ticker symbol, e.g. "NVDA".
        months_back: Look-back window in months, forwarded to both fetchers.
        max_news_per_keyword: Item cap for each NewsAPI keyword query.
        max_finnhub_items: Item cap for the Finnhub query.

    Returns:
        A dict with "ticker", "months_back", "news_items" (list of item dicts)
        and "collected_at_utc" (ISO-8601 string).
    """
    keywords = COMPANY_KEYWORDS.get(ticker, [ticker])

    news_items: List[Dict[str, Any]] = []
    seen_keys = set()

    def _is_new(item: Dict[str, Any]) -> bool:
        """Return False when this item's (provider, url) pair was already collected."""
        url = item.get("url")
        if not url:
            return True
        key = (item.get("provider"), url)
        if key in seen_keys:
            return False
        seen_keys.add(key)
        return True

    # 1) Finnhub
    finnhub_items = fetch_finnhub_company_news(
        symbol=ticker,
        months_back=months_back,
        max_items=max_finnhub_items,
    )
    for it in finnhub_items:
        if not _is_new(it):
            continue
        it["ticker"] = ticker
        it["channel"] = "news"
        it["title"] = it.get("headline")
        news_items.append(it)

    # 2) NewsAPI
    for kw in keywords:
        newsapi_items = fetch_newsapi_for_keyword(
            keyword=kw,
            months_back=months_back,
            max_items=max_news_per_keyword,
        )
        for it in newsapi_items:
            if not _is_new(it):
                continue
            it["ticker"] = ticker
            it["channel"] = "news"
            news_items.append(it)

    return {
        "ticker": ticker,
        "months_back": months_back,
        "news_items": news_items,
        "collected_at_utc": utc_now().isoformat(),
    }


In [8]:
# Column schema of the flattened news table. Passing it to pd.DataFrame keeps
# the columns (and therefore the CSV header) stable even when no item at all
# was collected, e.g. because no API key is configured.
NEWS_COLUMNS = [
    "ticker",
    "provider",
    "channel",
    "title",
    "url",
    "published_at_utc",
    "news_source",
    "keyword",
]

all_records: List[Dict[str, Any]] = []

for comp in tqdm(TARGET_COMPANIES, desc="Collecting 6-month news"):
    ticker = comp["ticker"]
    bundle = collect_news_for_ticker(
        ticker=ticker,
        months_back=6,
        max_news_per_keyword=50,
    )
    for item in bundle["news_items"]:
        # Flatten the provider-specific item into the common schema. Each
        # `or` chain takes the first non-empty field, because the providers
        # name their title and source fields differently.
        all_records.append({
            "ticker": ticker,
            "provider": item.get("provider"),
            "channel": item.get("channel"),
            "title": item.get("title")
                      or item.get("headline")
                      or item.get("description"),
            "url": item.get("url"),
            "published_at_utc": item.get("published_at_utc"),
            "news_source": item.get("news_source") or item.get("api_source_name"),
            "keyword": item.get("keyword"),
        })

df_news = pd.DataFrame(all_records, columns=NEWS_COLUMNS)
df_news.head()


Collecting 6-month news:   0%|          | 0/5 [00:00<?, ?it/s]

  dt = dt_mod.utcfromtimestamp(ts)


Unnamed: 0,ticker,provider,channel,title,url,published_at_utc,news_source,keyword
0,NVDA,finnhub,news,Why The U.S.-China AI Arms Race Is Entering A ...,https://finnhub.io/api/news?id=46d037f36c14729...,2025-11-28T04:58:03Z,SeekingAlpha,
1,NVDA,finnhub,news,BUZZ Investing: High-Valuation Tech Stocks Reset,https://finnhub.io/api/news?id=3427d5bdf3598db...,2025-11-27T13:14:00Z,SeekingAlpha,
2,NVDA,finnhub,news,Taiwan raids former TSMC exec's homes in trade...,https://finnhub.io/api/news?id=521519f82ab828c...,2025-11-27T13:11:09Z,Yahoo,
3,NVDA,finnhub,news,"Ray Dalio: AI Stocks Are In Bubble Territory, ...",https://finnhub.io/api/news?id=85fdfef2324ab90...,2025-11-27T13:01:15Z,Yahoo,
4,NVDA,finnhub,news,Sands Capital Select Growth Strategy’s Top Abs...,https://finnhub.io/api/news?id=702fe5d79f81c82...,2025-11-27T12:55:14Z,Yahoo,


In [9]:
# Write the collected news table to a CSV file in the current working
# directory; index=False leaves the DataFrame's row index out of the file.
output_path = "news_6m_finnhub_newsapi.csv"
df_news.to_csv(path_or_buf=output_path, index=False)

# Last expression of the cell: show the name of the file that was written.
output_path


'news_6m_finnhub_newsapi.csv'