
# 01_fetch_news.ipynb — News sammeln (Finnhub + NewsAPI)

Dieses Notebook sammelt Nachrichten für eine Auswahl von Zielunternehmen (Tickern), kombiniert Finnhub Company-News und NewsAPI-Keyword-Suche und speichert die Rohdaten in `agent_new/data/raw_news.csv`.

**Wichtig:** Setze die Umgebungsvariablen `FINNHUB_API_KEY` und `NEWS_API_KEY` bevor du ausführst.

In [None]:
# Falls nötig: Installiere Abhängigkeiten (nur falls noch nicht installiert)
# !pip install requests pandas tqdm python-dotenv

import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
from tqdm.notebook import tqdm
from typing import List, Dict, Any

# Zielordner anlegen
os.makedirs("Tools/data", exist_ok=True)


In [None]:
import os
from    dotenv import load_dotenv

# .env laden (optional, falls du auch eine .env Datei hast)
load_dotenv()

# API-Keys fest eingetragen
FINNHUB_API_KEY = "d4m6udpr01qjidhtuevgd4m6udpr01qjidhtuf00"
NEWS_API_KEY = "pub_97d3b41e381a468393a42810d780d265"
GEMINI_API_KEY = "AIzaSyDHRIpGIwaXjNFsUouUJf8r64AeRm18mBA"

# Optional: Warnungen, falls ein Key fehlt
if not FINNHUB_API_KEY:
    print("WARNUNG: FINNHUB_API_KEY nicht gefunden!")
if not NEWS_API_KEY:
    print("WARNUNG: NEWS_API_KEY nicht gefunden!")
if not GEMINI_API_KEY:
    print("WARNUNG: GEMINI_API_KEY nicht gefunden!")

# ---- Speicherstruktur (wird erzeugt) ----
# Standard-Base-Ordner (relativ vom Projekt-Root). Wenn Du absolute Pfade willst,
# ändere BASE_DATA_DIR z.B. auf r"C:\Users\hanac\Documents\Agentic Artificial Intelligence\aai_final\Tools\data"
BASE_DATA_DIR = os.path.join("../../../to delete/aai_final", "Tools", "data")

# Subfolder für Notebook 01 (nur hier speichern)
OUT_DIR_01 = os.path.join(BASE_DATA_DIR, "01")
os.makedirs(OUT_DIR_01, exist_ok=True)

print("01 output dir:", os.path.abspath(OUT_DIR_01))
# Default Einstellungen
DEFAULT_LLM_PROVIDER = os.getenv("DEFAULT_LLM_PROVIDER", "gemini")
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", 0.7))

print("Keys geladen. LLM Provider:", DEFAULT_LLM_PROVIDER, "Temperatur:", DEFAULT_TEMPERATURE)


01 output dir: c:\Users\Besitzer\Desktop\Neuer Ordner\Agentic_Artificial_inteligence\agent_new\notebooks\aai_final\agent_new\data\01
Keys geladen. LLM Provider: gemini Temperatur: 0.7


In [None]:
# Konfiguration: API-Keys aus Env oder config.py laden
FINNHUB_API_KEY = "d4m6udpr01qjidhtuevgd4m6udpr01qjidhtuf00"
NEWS_API_KEY = "pub_97d3b41e381a468393a42810d780d265"
try:
    import config
    if not FINNHUB_API_KEY:
        FINNHUB_API_KEY = getattr(config, "FINNHUB_API_KEY", None)
    if not NEWS_API_KEY:
        NEWS_API_KEY = getattr(config, "NEWS_API_KEY", None)
except Exception:
    pass

if not FINNHUB_API_KEY:
    print("WARNUNG: FINNHUB_API_KEY nicht gefunden. Setze FINNHUB_API_KEY als Umgebungsvariable oder in config.py.")
if not NEWS_API_KEY:
    print("WARNUNG: NEWS_API_KEY nicht gefunden. Setze NEWS_API_KEY als Umgebungsvariable oder in config.py.")

TARGET_COMPANIES = [
    {"name": "NVIDIA", "ticker": "NVDA"},
    {"name": "Tesla", "ticker": "TSLA"},
    {"name": "ASML Holdings", "ticker": "ASML"},
    {"name": "Meta Platforms", "ticker": "META"},
    {"name": "Amazon", "ticker": "AMZN"},
]

COMPANY_KEYWORDS = {
    "NVDA": ["NVIDIA", "NVDA"],
    "TSLA": ["Tesla", "TSLA"],
    "ASML": ["ASML", "ASML Holdings"],
    "META": ["Meta Platforms", "META", "Facebook"],
    "AMZN": ["Amazon", "AMZN", "Amazon.com"],
}


In [12]:
def utc_now() -> datetime:
    return datetime.now(timezone.utc)

def cutoff_months_ago(months: int = 6) -> datetime:
    return utc_now() - timedelta(days=30 * months)


In [13]:
def fetch_finnhub_company_news(symbol: str, months_back: int = 6, max_items: int = 200) -> List[Dict[str, Any]]:
    if not FINNHUB_API_KEY:
        return []
    url = "https://finnhub.io/api/v1/company-news"
    params = {
        "symbol": symbol,
        "from": cutoff_months_ago(months_back).date().isoformat(),
        "to": utc_now().date().isoformat(),
        "token": FINNHUB_API_KEY
    }
    try:
        resp = requests.get(url, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        print(f"[Finnhub] Fehler für {symbol}: {e}")
        return []
    results = []
    from datetime import datetime as dtmod
    for item in data[:max_items]:
        ts = item.get("datetime")
        pub_iso = None
        if isinstance(ts, (int, float)):
            pub_iso = dtmod.utcfromtimestamp(ts).isoformat() + "Z"
        results.append({
            "provider": "finnhub",
            "ticker": symbol,
            "title": item.get("headline") or item.get("summary"),
            "summary": item.get("summary"),
            "url": item.get("url"),
            "image": item.get("image"),
            "source": item.get("source"),
            "published_at_utc": pub_iso,
            "collected_at_utc": utc_now().isoformat()
        })
    return results


In [14]:
def fetch_newsapi_for_keyword(keyword: str, months_back: int = 1, max_items: int = 50) -> List[Dict[str, Any]]:
    if not NEWS_API_KEY:
        return []
    base_url = "https://newsapi.org/v2/everything"
    from_date = cutoff_months_ago(min(months_back, 1)).date().isoformat()  # NewsAPI free: max 1 month
    params = {
        "q": keyword,
        "language": "en",
        "from": from_date,
        "sortBy": "publishedAt",
        "pageSize": max_items,
        "apiKey": NEWS_API_KEY,
    }
    try:
        resp = requests.get(base_url, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        print(f"[NewsAPI] Fehler für {keyword}: {e}")
        return []
    results = []
    for art in data.get("articles", []):
        src = art.get("source") or {}
        results.append({
            "provider": "newsapi",
            "ticker": None,
            "title": art.get("title"),
            "summary": art.get("description") or art.get("content"),
            "url": art.get("url"),
            "image": art.get("urlToImage"),
            "source": src.get("name"),
            "published_at_utc": art.get("publishedAt"),
            "collected_at_utc": utc_now().isoformat()
        })
    return results


In [None]:
def collect_all_news(months_back: int = 6, max_per_keyword: int = 30) -> pd.DataFrame:
    all_rows = []
    for comp in TARGET_COMPANIES:
        t = comp["ticker"]
        # Finnhub company news
        fh = fetch_finnhub_company_news(t, months_back=months_back, max_items=max_per_keyword)
        for it in fh:
            all_rows.append(it)
        # NewsAPI keywords
        for kw in COMPANY_KEYWORDS.get(t, [t]):
            na = fetch_newsapi_for_keyword(kw, months_back=months_back, max_items=max_per_keyword)
            for it in na:
                # assign ticker if keyword matches company
                it["ticker"] = t
                all_rows.append(it)
    df = pd.DataFrame(all_rows)
    # normalize columns
    df = df.rename(columns={"title": "title", "summary": "summary"})
    out_path = os.path.join(OUT_DIR_01, "raw_news.csv")
    df.to_csv(out_path, index=False)
    print("Gespeicherte Rohdaten:", os.path.abspath(out_path))

    return df

# Run a small collection (test)
df = collect_all_news(months_back=6, max_per_keyword=20)
print("Gesammelte Artikel:", len(df))
df.head()


  pub_iso = dtmod.utcfromtimestamp(ts).isoformat() + "Z"


[NewsAPI] Fehler für NVIDIA: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=NVIDIA&language=en&from=2025-11-10&sortBy=publishedAt&pageSize=20&apiKey=pub_97d3b41e381a468393a42810d780d265
[NewsAPI] Fehler für NVDA: 401 Client Error: Unauthorized for url: https://newsapi.org/v2/everything?q=NVDA&language=en&from=2025-11-10&sortBy=publishedAt&pageSize=20&apiKey=pub_97d3b41e381a468393a42810d780d265


**Hinweise:** Teste zuerst mit wenigen Artikeln (max_per_keyword klein). API-Keys müssen gesetzt sein. Die Ausgabe: `agent_new/data/raw_news.csv`.
