
# 01_fetch_news.ipynb — News sammeln (Finnhub + NewsAPI)

Dieses Notebook sammelt Nachrichten für eine Auswahl von Zielunternehmen (Tickern), kombiniert Finnhub-Company-News mit der NewsAPI-Keyword-Suche und speichert die Rohdaten als `raw_news.csv` im Ausgabeverzeichnis `OUT_DIR_01` (`<BASE_DATA_DIR>/01/`).

**Wichtig:** Setze die Umgebungsvariablen `FINNHUB_API_KEY` und `NEWS_API_KEY`, bevor du das Notebook ausführst.

In [3]:
# =========================================================
# 0. Imports & Setup
# =========================================================
import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
from tqdm.notebook import tqdm
from typing import List, Dict, Any
from dotenv import load_dotenv

In [4]:
# =========================================================
# 1. Basic environment
# =========================================================
os.makedirs("Tools/data", exist_ok=True)

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# API keys are read from the environment so that no secret is stored in the
# notebook. Keys that were previously committed here should be treated as
# compromised and revoked/rotated at the respective provider.
FINNHUB_API_KEY = os.getenv("FINNHUB_API_KEY", "")
NEWS_API_KEY = os.getenv("NEWS_API_KEY", "")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")

# The fetch functions return an empty list when their key is missing, so warn
# early instead of silently collecting nothing.
if not FINNHUB_API_KEY:
    print("WARNUNG: FINNHUB_API_KEY fehlt")
if not NEWS_API_KEY:
    print("WARNUNG: NEWS_API_KEY fehlt")

In [None]:
# =========================================================
# 2. Output directory (kept exactly as in the original setup)
# =========================================================
# Root folder for pipeline data. The path is relative, so it resolves against
# the kernel's current working directory.
# NOTE(review): the "to delete" segment looks like a leftover from a local
# folder layout, and it differs from the "Tools/data" folder created during
# environment setup — confirm which location is intended.
BASE_DATA_DIR = os.path.join("../../../../to delete/aai_final", "Tools", "data")
# Sub-folder for the artefacts of this notebook (stage 01).
OUT_DIR_01 = os.path.join(BASE_DATA_DIR, "01")
# Creates the folder including missing parents; no error if it already exists.
os.makedirs(OUT_DIR_01, exist_ok=True)

# Print the resolved absolute path so a wrong working directory is noticed.
print("01 output dir:", os.path.abspath(OUT_DIR_01))

In [7]:
# NOTE(review): yfinance is not referenced in any cell visible here — confirm
# it is still needed. If it is, consider pinning the version so that a fresh
# "Restart & Run All" installs the same release.
%pip install yfinance


Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl (123 kB)
     -------------------------------------- 123.4/123.4 kB 3.7 MB/s eta 0:00:00
Collecting protobuf>=3.19.0
  Downloading protobuf-6.33.2-cp39-cp39-win_amd64.whl (436 kB)
     ------------------------------------- 436.9/436.9 kB 13.8 MB/s eta 0:00:00
Collecting requests>=2.31
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ---------------------------------------- 64.7/64.7 kB 3.4 MB/s eta 0:00:00
Collecting pytz>=2022.5
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     -------------------------------------- 509.2/509.2 kB 4.6 MB/s eta 0:00:00
Collecting multitasking>=0.0.7
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting websockets>=13.0
  Downloading websockets-15.0.1-cp39-cp39-win_amd64.whl (176 kB)
     -------------------------------------- 176.8/176.8 kB 5.4 MB/s eta 0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
conda-repo-cli 1.0.20 requires requests==2.28.1, but you have requests 2.32.5 which is incompatible.


In [10]:
import pandas as pd
import requests
from io import StringIO

def load_sp500_tickers():
    """Download the S&P 500 constituents from Wikipedia and return their symbols.

    The English Wikipedia article "List of S&P 500 companies" is fetched and
    parsed with pandas; the first HTML table on the page is taken to be the
    constituents table and its ``Symbol`` column is returned.

    Returns:
        list: The values of the ``Symbol`` column, in table order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeout.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # A browser-like User-Agent is sent (presumably because the default
    # python-requests agent is rejected — TODO confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/116.0.0.0 Safari/537.36"
    }
    # Bound the wait so a stalled connection cannot hang the notebook; 15 s
    # matches the timeout used by the other HTTP calls in this notebook.
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()  # make sure the request succeeded
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(resp.text))
    # The first table on the page is the S&P 500 constituents list.
    return tables[0]["Symbol"].tolist()

# All S&P 500 symbols, fetched when this cell runs.
ALL_TICKERS = load_sp500_tickers()

# Only the first UNIVERSE_SIZE symbols are processed further.
UNIVERSE_SIZE = 20
TARGET_COMPANIES = [
    dict(name=symbol, ticker=symbol)  # the symbol doubles as the display name
    for symbol in ALL_TICKERS[:UNIVERSE_SIZE]
]

print(f"Target companies: {len(TARGET_COMPANIES)}")


Target companies: 20


In [11]:
# =========================================================
# 4. Automatic keyword generation (replaces COMPANY_KEYWORDS)
# =========================================================
def keywords_for_ticker(ticker: str) -> List[str]:
    """Return the search keywords to use for one ticker.

    Minimal, extensible approach: no hand-written dictionary is maintained;
    the ticker symbol itself is the only keyword. Noise is expected to be
    filtered out by a later relevance step.

    Args:
        ticker: Ticker symbol of the company.

    Returns:
        A new single-element list containing ``ticker``.
    """
    keywords: List[str] = []
    keywords.append(ticker)
    return keywords

In [12]:
# =========================================================
# 5. Time helper functions
# =========================================================
def utc_now() -> datetime:
    """Return the current time as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)

def cutoff_months_ago(months: int = 6) -> datetime:
    """Return the UTC timestamp lying ``months`` months before now.

    A month is approximated as 30 days, so the result is
    ``utc_now() - 30 * months`` days.
    """
    lookback = timedelta(days=months * 30)
    return utc_now() - lookback

In [14]:
# =========================================================
# 6. Finnhub company news
# =========================================================
def fetch_finnhub_company_news(
    symbol: str,
    months_back: int = 6,
    max_items: int = 200
) -> List[Dict[str, Any]]:
    """Fetch company news for one ticker from the Finnhub ``company-news`` endpoint.

    Args:
        symbol: Ticker symbol to query.
        months_back: Size of the look-back window in (30-day) months; the
            window ends today (UTC).
        max_items: Only the first ``max_items`` entries of the response are
            converted.

    Returns:
        A list of row dicts with the keys ``provider``, ``ticker``, ``title``,
        ``summary``, ``url``, ``image``, ``source``, ``published_at_utc`` and
        ``collected_at_utc``. An empty list is returned when no API key is
        configured, when the request fails, or when the response is not a
        JSON list. Failures are printed, never raised.
    """
    if not FINNHUB_API_KEY:
        return []

    url = "https://finnhub.io/api/v1/company-news"
    params = {
        "symbol": symbol,
        "from": cutoff_months_ago(months_back).date().isoformat(),
        "to": utc_now().date().isoformat(),
    }
    # Authenticate via header instead of a ?token=... query parameter: the
    # error text printed below contains the full request URL, so a key in the
    # query string would end up in the notebook output.
    headers = {"X-Finnhub-Token": FINNHUB_API_KEY}

    try:
        resp = requests.get(url, params=params, headers=headers, timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[Finnhub] Fehler für {symbol}: {e}")
        return []

    # Anything other than a list (e.g. an error object) cannot be sliced.
    if not isinstance(data, list):
        print(f"[Finnhub] Unerwartete Antwort für {symbol}: {type(data).__name__}")
        return []

    # One collection timestamp per call; it does not change between items.
    collected_at = utc_now().isoformat()

    results = []
    for item in data[:max_items]:
        if not isinstance(item, dict):
            continue

        ts = item.get("datetime")
        pub_iso = None
        if isinstance(ts, (int, float)):
            try:
                # Same "YYYY-MM-DDTHH:MM:SSZ" text as before, but built with
                # the timezone-aware API (utcfromtimestamp is deprecated).
                pub_iso = (
                    datetime.fromtimestamp(ts, tz=timezone.utc)
                    .replace(tzinfo=None)
                    .isoformat()
                    + "Z"
                )
            except (OverflowError, OSError, ValueError):
                # Timestamp outside the platform's supported range.
                pub_iso = None

        results.append({
            "provider": "finnhub",
            "ticker": symbol,
            # Fall back to the summary when the headline is missing or empty.
            "title": item.get("headline") or item.get("summary"),
            "summary": item.get("summary"),
            "url": item.get("url"),
            "image": item.get("image"),
            "source": item.get("source"),
            "published_at_utc": pub_iso,
            "collected_at_utc": collected_at
        })

    return results

**Hinweise:** Teste zuerst mit wenigen Artikeln (`max_per_keyword` klein halten). Die API-Keys müssen gesetzt sein. Ausgabe: `raw_news.csv` im Verzeichnis `OUT_DIR_01`.


In [15]:
# =========================================================
# 7. NewsAPI keyword news
# =========================================================
def fetch_newsapi_for_keyword(
    keyword: str,
    months_back: int = 1,
    max_items: int = 50
) -> List[Dict[str, Any]]:
    """Search NewsAPI (``/v2/everything``) for English articles matching a keyword.

    Args:
        keyword: Search term sent as the ``q`` parameter.
        months_back: Requested look-back window in (30-day) months. It is
            capped at one month because of the free-tier limit; values above
            1 behave like 1.
        max_items: Requested page size, capped at 100.

    Returns:
        A list of row dicts with the keys ``provider``, ``ticker`` (always
        ``None`` here; the caller assigns it), ``title``, ``summary``, ``url``,
        ``image``, ``source``, ``published_at_utc`` and ``collected_at_utc``.
        An empty list is returned when no API key is configured, when the
        request fails, or when the response has an unexpected shape. Failures
        are printed, never raised.
    """
    if not NEWS_API_KEY:
        return []

    base_url = "https://newsapi.org/v2/everything"
    # Free-tier limit: at most one month back. Smaller windows requested by
    # the caller are honoured; larger ones are capped.
    window_months = min(max(months_back, 0), 1)
    from_date = cutoff_months_ago(window_months).date().isoformat()

    params = {
        "q": keyword,
        "language": "en",
        "from": from_date,
        "sortBy": "publishedAt",
        # The API accepts at most 100 results per page.
        "pageSize": min(max_items, 100),
    }
    # Authenticate via header instead of ?apiKey=...: the error text printed
    # below contains the full request URL, so a key in the query string would
    # end up in the notebook output.
    headers = {"X-Api-Key": NEWS_API_KEY}

    try:
        resp = requests.get(base_url, params=params, headers=headers, timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[NewsAPI] Fehler für {keyword}: {e}")
        return []

    if not isinstance(data, dict):
        print(f"[NewsAPI] Unerwartete Antwort für {keyword}: {type(data).__name__}")
        return []

    # One collection timestamp per call; it does not change between items.
    collected_at = utc_now().isoformat()

    results = []
    # "articles" may be absent or null; treat both as "no results".
    for art in data.get("articles") or []:
        if not isinstance(art, dict):
            continue
        src = art.get("source") or {}
        results.append({
            "provider": "newsapi",
            "ticker": None,
            "title": art.get("title"),
            # Prefer the description; fall back to the content snippet.
            "summary": art.get("description") or art.get("content"),
            "url": art.get("url"),
            "image": art.get("urlToImage"),
            "source": src.get("name"),
            "published_at_utc": art.get("publishedAt"),
            "collected_at_utc": collected_at
        })

    return results

In [16]:
# =========================================================
# 8. Core collection function
# =========================================================
def collect_all_news(
    months_back: int = 6,
    max_per_keyword: int = 20
) -> pd.DataFrame:
    """Collect Finnhub and NewsAPI articles for all target companies and save them.

    For every entry in ``TARGET_COMPANIES`` the Finnhub company news are
    fetched, followed by one NewsAPI search per keyword returned by
    ``keywords_for_ticker``. NewsAPI rows are tagged with the ticker they
    were searched for. The combined rows are written to
    ``raw_news.csv`` inside ``OUT_DIR_01``.

    Args:
        months_back: Look-back window in months, passed to both fetchers.
        max_per_keyword: Maximum number of items per Finnhub request and per
            NewsAPI keyword request.

    Returns:
        DataFrame with one row per collected article. The columns are fixed,
        so the frame and the CSV have a header even when nothing was
        collected.
    """
    # Fixed schema, in the order in which the fetchers emit their keys. With
    # an explicit column list an empty run still writes a CSV header, so the
    # file stays readable with pd.read_csv.
    columns = [
        "provider", "ticker", "title", "summary", "url",
        "image", "source", "published_at_utc", "collected_at_utc",
    ]

    all_rows = []

    for comp in tqdm(TARGET_COMPANIES):
        ticker = comp["ticker"]

        # Finnhub
        fh_news = fetch_finnhub_company_news(
            ticker,
            months_back=months_back,
            max_items=max_per_keyword
        )
        all_rows.extend(fh_news)

        # NewsAPI
        for kw in keywords_for_ticker(ticker):
            na_news = fetch_newsapi_for_keyword(
                kw,
                months_back=months_back,
                max_items=max_per_keyword
            )
            for item in na_news:
                # The keyword search does not know the ticker; attach it here.
                item["ticker"] = ticker
                all_rows.append(item)

    df = pd.DataFrame(all_rows, columns=columns)

    # Make sure the target folder exists even if it was removed after setup.
    os.makedirs(OUT_DIR_01, exist_ok=True)
    out_path = os.path.join(OUT_DIR_01, "raw_news.csv")
    df.to_csv(out_path, index=False)

    print("Gespeicherte Rohdaten:", os.path.abspath(out_path))
    return df


In [17]:
# =========================================================
# 9. Run (test)
# =========================================================
# Six months look-back, at most 20 items per request.
df = collect_all_news(max_per_keyword=20, months_back=6)
print("Gesammelte Artikel:", df.shape[0])
df.head()

  0%|          | 0/20 [00:00<?, ?it/s]

[NewsAPI] Fehler für MMM: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=MMM&language=en&from=2025-11-17&sortBy=publishedAt&pageSize=20&apiKey=***REDACTED***
[NewsAPI] Fehler für AOS: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=AOS&language=en&from=2025-11-17&sortBy=publishedAt&pageSize=20&apiKey=***REDACTED***
[NewsAPI] Fehler für ABT: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=ABT&language=en&from=2025-11-17&sortBy=publishedAt&pageSize=20&apiKey=***REDACTED***
[NewsAPI] Fehler für ABBV: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=ABBV&language=en&from=2025-11-17&sortBy=publishedAt&pageSize=20&apiKey=***REDACTED***
[NewsAPI] Fehler für ACN: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=ACN&language=en&from=2025-11-17&sortBy=publishedAt&pageSize=

Unnamed: 0,provider,ticker,title,summary,url,image,source,published_at_utc,collected_at_utc
0,finnhub,MMM,Jim Cramer Says “Industrials Always Work in Re...,3M Company (NYSE:MMM) is one of the stocks Jim...,https://finnhub.io/api/news?id=410a2b765b049f0...,https://s.yimg.com/rz/stage/p/yahoo_finance_en...,Yahoo,2025-12-13T16:17:39Z,2025-12-17T15:18:38.537678+00:00
1,finnhub,MMM,Barclays Maintains An Overweight Rating On 3M ...,3M Company (NYSE:MMM) is among the 13 Best Nan...,https://finnhub.io/api/news?id=8623b9110d61064...,https://s.yimg.com/rz/stage/p/yahoo_finance_en...,Yahoo,2025-12-12T13:49:02Z,2025-12-17T15:18:38.537678+00:00
2,finnhub,MMM,3M: Sales And Profits Are On The Rise,"3M gains with a 90% rebound, steady profits, n...",https://finnhub.io/api/news?id=0da7d2b2e6eb305...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-12-11T08:01:52Z,2025-12-17T15:18:38.537678+00:00
3,finnhub,MMM,3M: Operational Excellence And Turnaround Mome...,"3M drives a turnaround with restructuring, Q3 ...",https://finnhub.io/api/news?id=a2298ef5bc24c2b...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-12-11T02:07:33Z,2025-12-17T15:18:38.537678+00:00
4,finnhub,MMM,How Recent Developments Are Rewriting the Stor...,3M's fair value estimate has inched down to $1...,https://finnhub.io/api/news?id=e42ceee0cce24a1...,https://s.yimg.com/rz/stage/p/yahoo_finance_en...,Yahoo,2025-12-10T15:06:22Z,2025-12-17T15:18:38.537678+00:00
