In [None]:
import pandas as pd
import openai
import json
import time
import os

# Load the two pickled objects (used as DataFrames further down) and the
# catalogue spreadsheet from ../data.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_to_match, results_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/df_to_match.pkl", "../data/results_df.pkl")
)
catalogue = pd.read_excel("../data/catalogue_clean_mit_aspects.xlsx")

In [None]:
# Chat model used for the core-aspect checks.
llm_model = "gpt-4o-mini"

# Read the API key from the OPENAI_API_KEY environment variable so it never
# has to be written into the notebook. The value is None when the variable
# is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def check_core_aspects_with_llm(section_text, core_aspects, model=llm_model, sleep_between_calls=1.5):
    """Ask the chat model to score a contract section against its core aspects.

    Builds a German prompt that lists every core aspect as a bullet point and
    requests a JSON-only answer with a score per aspect plus an average
    fulfilment percentage, then sends it as a single user message.

    Args:
        section_text: Contract text that is inserted into the prompt.
        core_aspects: Iterable of strings, one requirement per entry.
        model: Name of the chat model to call.
        sleep_between_calls: Seconds to pause after every call, whether it
            succeeded or failed.

    Returns:
        The raw text of the first choice of the model reply, or None when the
        request raised an exception (the error is printed, not re-raised).
    """
    aspects_list = "\n- " + "\n- ".join(core_aspects)
    prompt = f"""Du bist ein Vertragsexperte. Prüfe den folgenden Vertragstext auf die Einhaltung der folgenden Kernanforderungen (Core Aspects).

Gib als Ergebnis für jeden einzelnen Punkt einen Erfüllungsgrad von 0 bis 1 an (0 = nicht erfüllt, 1 = voll erfüllt, 0.5 = teilweise erfüllt). Gib zusätzlich eine durchschnittliche Erfüllungsquote in Prozent für alle Core Aspects an.

Vertragstext:
{section_text}

Core Aspects:{aspects_list}

Antwortformat (nur JSON):
{{
  "core_aspect_scores": {{
    "Aspekt 1": 1,
    "Aspekt 2": 0.5
  }},
  "average_fulfillment_percent": 76.5
}}"""

    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: `openai.ChatCompletion` was removed; the
            # module-level client exposes `chat.completions` instead and
            # still honours `openai.api_key`.
            response = openai.chat.completions.create(**request)
        else:
            # Legacy client (openai<1.0).
            response = openai.ChatCompletion.create(**request)
        # Attribute access works on both client generations, whereas
        # `message['content']` is not supported by the 1.x response models.
        return response.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print("API-Fehler:", e)
        return None
    finally:
        # Simple rate limiting; runs after successful and failed calls alike.
        time.sleep(sleep_between_calls)

In [None]:
# Model name taken from the first row of results_df
# (assumes results_df is sorted best-first -- TODO confirm).
best_model = results_df["model"].iloc[0]

# Keep only the rows whose match for that model is flagged valid, expose the
# matched catalogue id as "catalog_id", and attach the catalogue's core
# aspects. The left join keeps every valid match, so ids that are missing
# from the catalogue end up with NaN in "core_aspects".
df_eval = (
    df_to_match
    .loc[df_to_match[f"match_valid_{best_model}"]]
    .assign(catalog_id=lambda matches: matches[f"matched_catalog_id_{best_model}"])
    .merge(catalogue[["catalog_id", "core_aspects"]], on="catalog_id", how="left")
)

# Scoring function
def _parse_llm_json(raw_response):
    """Return the JSON object contained in an LLM reply, or None if there is none."""
    if not isinstance(raw_response, str):
        return None

    candidates = [raw_response]
    # Replies are sometimes wrapped in a Markdown code fence or surrounded by
    # prose; as a second attempt, parse only the outermost {...} span.
    start, end = raw_response.find("{"), raw_response.rfind("}")
    if 0 <= start < end:
        candidates.append(raw_response[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Callers use .get() on the result, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed
    return None


def evaluate_llm(row):
    """Score one row's contract section against its core aspects via the LLM.

    Args:
        row: Mapping-like row providing "clean_section_content" (the section
            text) and "core_aspects" (newline-separated requirements).

    Returns:
        The dict parsed from the LLM reply. When the row has no core aspects
        (e.g. NaN after a left join), the API call failed, or the reply does
        not contain a JSON object, a fallback dict with an empty
        "core_aspect_scores" and "average_fulfillment_percent" set to None.
    """
    fallback = {"core_aspect_scores": {}, "average_fulfillment_percent": None}

    core_aspects = row["core_aspects"]
    # Rows without a catalogue entry carry NaN (a float) here; calling
    # .split() on it would abort the whole DataFrame.apply run.
    if not isinstance(core_aspects, str):
        print("Keine Core Aspects vorhanden - Abschnitt wird nicht bewertet.")
        return fallback

    aspects = [line.strip() for line in core_aspects.split("\n") if line.strip()]
    if not aspects:
        # Nothing to score, so skip the API call.
        print("Keine Core Aspects vorhanden - Abschnitt wird nicht bewertet.")
        return fallback

    raw_response = check_core_aspects_with_llm(row["clean_section_content"], aspects)
    parsed = _parse_llm_json(raw_response)
    if parsed is None:
        print("Parsing-Fehler. Antwort war:", raw_response)
        return fallback
    return parsed

# Run the LLM evaluation once per row and keep the returned result dict.
df_eval["llm_eval_result"] = df_eval.apply(evaluate_llm, axis=1)

# Unpack the two fields of every result dict into their own columns.
df_eval["core_aspect_scores"] = df_eval["llm_eval_result"].map(
    lambda result: result.get("core_aspect_scores", {})
)
df_eval["average_fulfillment_percent"] = df_eval["llm_eval_result"].map(
    lambda result: result.get("average_fulfillment_percent")
)


# Persist the evaluated frame.
df_eval.to_pickle("../data/llm_eval_result.pkl")

# Preview the first few results.
display(
    df_eval.head()[["clean_section_content", "core_aspect_scores", "average_fulfillment_percent"]]
)