# 02_evaluate_relevance.ipynb — Relevanzbewertung mit Gemini

Dieses Notebook bewertet jede News bezüglich Relevanz für den Aktienmarkt (inkl. Politik/Makro) mittels Google Gemini (via `google-genai`).
Setze `GEMINI_API_KEY` bevor Du ausführst.


In [1]:
# !pip install google-genai pandas tqdm

import os
import json
import pandas as pd
from typing import Dict, Any
from tqdm.notebook import tqdm

try:
    from google import genai
    _HAS_GEMINI = True
except Exception:
    genai = None
    _HAS_GEMINI = False

INPUT_CSV = "Tools/data/raw_news.csv"
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(f"{INPUT_CSV} nicht gefunden. Führe 01_fetch_news.ipynb aus.")

df = pd.read_csv(INPUT_CSV)
print("Geladene Artikel:", len(df))


Geladene Artikel: 100


In [2]:

from dotenv import load_dotenv
import os
import json
import pandas as pd
from typing import Dict, Any
from tqdm.notebook import tqdm

# .env laden (optional, falls du auch eine .env Datei hast)
load_dotenv()

# API-Keys fest eingetragen
FINNHUB_API_KEY = "d4m6udpr01qjidhtuevgd4m6udpr01qjidhtuf00"
NEWS_API_KEY = "pub_97d3b41e381a468393a42810d780d265"
GEMINI_API_KEY = "AIzaSyDHRIpGIwaXjNFsUouUJf8r64AeRm18mBA"

# Optional: Warnungen, falls ein Key fehlt
if not FINNHUB_API_KEY:
    print("WARNUNG: FINNHUB_API_KEY nicht gefunden!")
if not NEWS_API_KEY:
    print("WARNUNG: NEWS_API_KEY nicht gefunden!")
if not GEMINI_API_KEY:
    print("WARNUNG: GEMINI_API_KEY nicht gefunden!")

# Default Einstellungen
DEFAULT_LLM_PROVIDER = os.getenv("DEFAULT_LLM_PROVIDER", "gemini")
DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", 0.7))

print("Keys geladen. LLM Provider:", DEFAULT_LLM_PROVIDER, "Temperatur:", DEFAULT_TEMPERATURE)

# Zelle 2 — Imports & Setup (ersetzt)
# !pip install google-genai pandas tqdm



try:
    from google import genai
    _HAS_GEMINI = True
except Exception:
    genai = None
    _HAS_GEMINI = False

# Pfade: BASE_DATA_DIR muss mit dem in 01 übereinstimmen
BASE_DATA_DIR = os.path.join("../../../../to delete/aai_final", "Tools", "data")
IN_DIR_01 = os.path.join(BASE_DATA_DIR, "01")
OUT_DIR_02 = os.path.join(BASE_DATA_DIR, "02")
os.makedirs(OUT_DIR_02, exist_ok=True)

INPUT_CSV = os.path.join(IN_DIR_01, "raw_news.csv")
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(f"{INPUT_CSV} nicht gefunden. Führe 01_fetch_news.ipynb aus (erzeugt raw_news.csv im /01-Ordner).")

df = pd.read_csv(INPUT_CSV)
print("Geladene Artikel:", len(df))
print("Eingabe gelesen von:", os.path.abspath(INPUT_CSV))
print("02 output dir:", os.path.abspath(OUT_DIR_02))


Keys geladen. LLM Provider: gemini Temperatur: 0.7


In [3]:
# === GEMINI API KEY EINSTELLEN ===
import os
from google import genai

# HIER EINTRAGEN: ↓↓↓
GEMINI_API_KEY = "AIzaSyDHRIpGIwaXjNFsUouUJf8r64AeRm18mBA"

# Falls jemand lieber config.py nutzt – optional
try:
    import config
    if not GEMINI_API_KEY:
        GEMINI_API_KEY = getattr(config, "GEMINI_API_KEY", None)
except Exception:
    pass

# Sicherheitscheck
if not GEMINI_API_KEY:
    print("WARNUNG: GEMINI_API_KEY nicht gesetzt. Bitte eintragen!")
else:
    print("Gemini API Key erfolgreich geladen.")

# Gemini-Client Initialisierung
def get_gemini_client():
    if not getattr(get_gemini_client, "client", None):
        get_gemini_client.client = genai.Client(api_key=GEMINI_API_KEY)
    return get_gemini_client.client


Gemini API Key erfolgreich geladen.


In [4]:
SYSTEM_INSTRUCTION = (
    "Du bist ein Finanzanalyst. Bewerte, ob die folgende Nachricht Relevanz für Aktienmärkte hat. "
    "Gib ausschließlich ein JSON zurück mit: "
    "relevance_score (0.0-1.0), relevant (true/false), categories (liste aus 'company','policy','macro','geo','social','other'), explanation (kurz, sachlich)."
)

def call_gemini_relevance(text: str, model: str = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")) -> Dict[str, Any]:
    client = get_gemini_client()
    prompt = f"Bewerte die Relevanz für Aktienmärkte:\n\n{text}\n\nAntwort als JSON."
    try:
        resp = client.models.generate_content(
            model=model,
            contents=prompt,
            config=genai.types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                temperature=0.0,
                max_output_tokens=512
            )
        )
        raw = getattr(resp, "text", str(resp))
        s = raw.strip()
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end != -1 and end > start:
            json_text = s[start:end+1]
        else:
            json_text = s
        parsed = json.loads(json_text)
        # normalize
        parsed["relevance_score"] = float(parsed.get("relevance_score", 0.0))
        parsed["relevant"] = bool(parsed.get("relevant", parsed["relevance_score"] >= 0.5))
        parsed["categories"] = list(parsed.get("categories", []))
        parsed["explanation"] = str(parsed.get("explanation", ""))[:1000]
        return parsed
    except Exception as e:
        return {
            "relevance_score": 0.0,
            "relevant": False,
            "categories": [],
            "explanation": f"LLM Fehler: {e}"
        }


In [5]:
# Build evaluation text field
def build_eval_text(row):
    parts = []
    for c in ["title", "summary", "description", "headline"]:
        if c in row and pd.notna(row[c]) and str(row[c]).strip():
            parts.append(str(row[c]).strip())
    if "source" in row and pd.notna(row["source"]):
        parts.append(f"Quelle: {row['source']}")
    if "ticker" in row and pd.notna(row["ticker"]):
        parts.append(f"Ticker: {row['ticker']}")
    return "\n".join(parts)

df["eval_text"] = df.apply(build_eval_text, axis=1)

# Testlauf: nur erstes Sample N
N = 20
sample = df.head(N).copy()

records = []
for idx, row in tqdm(sample.iterrows(), total=len(sample)):
    out = call_gemini_relevance(row["eval_text"])
    records.append({
        "index": idx,
        "relevance_score": out["relevance_score"],
        "relevant": out["relevant"],
        "categories": out["categories"],
        "explanation": out["explanation"]
    })

res_df = pd.DataFrame(records).set_index("index")
df_eval = df.join(res_df, how="left")
df_eval.head()


  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,provider,ticker,title,summary,url,image,source,published_at_utc,collected_at_utc,eval_text,relevance_score,relevant,categories,explanation
0,finnhub,NVDA,MAGY Vs. YMAG: Magnificent Exposure And Magnif...,,https://finnhub.io/api/news?id=09efdd3732bc5d0...,,SeekingAlpha,2025-11-30T09:04:39Z,2025-11-30T17:34:44.350537+00:00,MAGY Vs. YMAG: Magnificent Exposure And Magnif...,0.0,False,[],LLM Fehler: 'NoneType' object has no attribute...
1,finnhub,NVDA,Potentially 12%-15% Consistent Income: Monthly...,Selling options can generate annual income but...,https://finnhub.io/api/news?id=f125926d5022926...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-30T08:30:00Z,2025-11-30T17:34:44.350550+00:00,Potentially 12%-15% Consistent Income: Monthly...,0.0,False,[],LLM Fehler: 'NoneType' object has no attribute...
2,finnhub,NVDA,It All Could Come Down To Depreciation,Discover key investment opportunities and hidd...,https://finnhub.io/api/news?id=cc85681e09368b3...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-29T08:42:34Z,2025-11-30T17:34:44.350555+00:00,It All Could Come Down To Depreciation\nDiscov...,0.0,False,[],LLM Fehler: 'NoneType' object has no attribute...
3,finnhub,NVDA,December will begin with investors owning litt...,"Wall Street thinks you don't own enough stock,...",https://finnhub.io/api/news?id=59af1ff69975059...,https://image.cnbcfm.com/api/v1/image/10822746...,CNBC,2025-11-29T08:34:55Z,2025-11-30T17:34:44.350560+00:00,December will begin with investors owning litt...,0.0,False,[],LLM Fehler: 'NoneType' object has no attribute...
4,finnhub,NVDA,How I'm Investing Ahead Of America's Next Manu...,"Read how AI, automation, and re-shoring could ...",https://finnhub.io/api/news?id=f9d937310770a40...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-29T07:30:00Z,2025-11-30T17:34:44.350563+00:00,How I'm Investing Ahead Of America's Next Manu...,0.0,False,[],LLM Fehler: 'NoneType' object has no attribute...


In [6]:
# Build evaluation text field
def build_eval_text(row):
    parts = []
    for c in ["title", "summary", "description", "headline"]:
        if c in row and pd.notna(row[c]) and str(row[c]).strip():
            parts.append(str(row[c]).strip())
    if "source" in row and pd.notna(row["source"]):
        parts.append(f"Quelle: {row['source']}")
    if "ticker" in row and pd.notna(row["ticker"]):
        parts.append(f"Ticker: {row['ticker']}")
    return "\n".join(parts)

df["eval_text"] = df.apply(build_eval_text, axis=1)

# Testlauf: nur erstes Sample N
N = 20
sample = df.head(N).copy()

records = []
for idx, row in tqdm(sample.iterrows(), total=len(sample)):
    out = call_gemini_relevance(row["eval_text"])
    records.append({
        "index": idx,
        "relevance_score": out["relevance_score"],
        "relevant": out["relevant"],
        "categories": out["categories"],
        "explanation": out["explanation"]
    })

res_df = pd.DataFrame(records).set_index("index")
df_eval = df.join(res_df, how="left")
df_eval.head()


  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,provider,ticker,title,summary,url,image,source,published_at_utc,collected_at_utc,eval_text,relevance_score,relevant,categories,explanation
0,finnhub,NVDA,MAGY Vs. YMAG: Magnificent Exposure And Magnif...,,https://finnhub.io/api/news?id=09efdd3732bc5d0...,,SeekingAlpha,2025-11-30T09:04:39Z,2025-11-30T17:34:44.350537+00:00,MAGY Vs. YMAG: Magnificent Exposure And Magnif...,0.0,False,[],LLM Fehler: 429 RESOURCE_EXHAUSTED. {'error': ...
1,finnhub,NVDA,Potentially 12%-15% Consistent Income: Monthly...,Selling options can generate annual income but...,https://finnhub.io/api/news?id=f125926d5022926...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-30T08:30:00Z,2025-11-30T17:34:44.350550+00:00,Potentially 12%-15% Consistent Income: Monthly...,0.0,False,[],LLM Fehler: 429 RESOURCE_EXHAUSTED. {'error': ...
2,finnhub,NVDA,It All Could Come Down To Depreciation,Discover key investment opportunities and hidd...,https://finnhub.io/api/news?id=cc85681e09368b3...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-29T08:42:34Z,2025-11-30T17:34:44.350555+00:00,It All Could Come Down To Depreciation\nDiscov...,0.0,False,[],LLM Fehler: 429 RESOURCE_EXHAUSTED. {'error': ...
3,finnhub,NVDA,December will begin with investors owning litt...,"Wall Street thinks you don't own enough stock,...",https://finnhub.io/api/news?id=59af1ff69975059...,https://image.cnbcfm.com/api/v1/image/10822746...,CNBC,2025-11-29T08:34:55Z,2025-11-30T17:34:44.350560+00:00,December will begin with investors owning litt...,0.0,False,[],LLM Fehler: 429 RESOURCE_EXHAUSTED. {'error': ...
4,finnhub,NVDA,How I'm Investing Ahead Of America's Next Manu...,"Read how AI, automation, and re-shoring could ...",https://finnhub.io/api/news?id=f9d937310770a40...,https://static.seekingalpha.com/cdn/s3/uploads...,SeekingAlpha,2025-11-29T07:30:00Z,2025-11-30T17:34:44.350563+00:00,How I'm Investing Ahead Of America's Next Manu...,0.0,False,[],LLM Fehler: 429 RESOURCE_EXHAUSTED. {'error': ...


In [7]:
# Full run (vorsichtig: API Kosten)
run_full = False  # set True only wenn Du bereit bist
if run_full:
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        out = call_gemini_relevance(row["eval_text"])
        results.append({
            "index": idx,
            "relevance_score": out["relevance_score"],
            "relevant": out["relevant"],
            "categories": out["categories"],
            "explanation": out["explanation"]
        })
    res_df = pd.DataFrame(results).set_index("index")
    df_eval = df.join(res_df, how="left")
else:
    print("Full run deaktiviert — teste mit Sample.")

# Speichern der Ergebnisse (Sample oder vollständige df_eval)
# Speichern der Ergebnisse exklusiv im /02-Ordner
out_path_eval = os.path.join(OUT_DIR_02, "evaluated_news.csv")
df_eval.to_csv(out_path_eval, index=False)
print("Gespeichert:", os.path.abspath(out_path_eval))

# Optional auch gefilterte (nicht-relevante) Artikel separat speichern
out_filtered = os.path.join(OUT_DIR_02, "filtered_out_02.csv")
filtered = df_eval[df_eval.get("relevant")==False] if "relevant" in df_eval.columns else pd.DataFrame()
if not filtered.empty:
    filtered.to_csv(out_filtered, index=False)
    print("Filtered (02) saved:", os.path.abspath(out_filtered))
else:
    print("Keine gefilterten Artikel zum Speichern in /02.")


Full run deaktiviert — teste mit Sample.
Gespeichert: agent_new/data/evaluated_news.csv


- Du kannst nach der Ausführung `agent_new/data/evaluated_news.csv` öffnen und prüfen.
- Möchtest du nur die ausgesonderten Artikel speichern, filtere `relevant == False`.
