# Evaluation of the Mulit Agent System 

This Notebook is responsible for the evaluation of the Multi Agent System

In [None]:
import pandas as pd 
import numpy as np 
from foodrec.config.structure.dataset_enum import ModelEnum 
from foodrec.evaluation.create_dataset import create_dataset
from foodrec.evaluation.is_ketogen import is_ketogenic, calc_keto_ratio
from foodrec.config.structure.paths import CONVERSATION, DATASET_PATHS
import json
from foodrec.evaluation.metrics.metrics import macro_over_queries,filter_search, micro_over_queries, accuracy, f1_score, mean_average_precision_over_queries, mean_pr_auc_over_queries, bias_conformity_rate_at_k
from foodrec.data.all_recipe import AllRecipeLoader
from typing import Dict, List, Any, Tuple
from collections import Counter
from foodrec.agents.agent_names import AgentEnum
from foodrec.tools.ingredient_normalizer import IngredientNormalisation
from analysis_helper.load_dataset import check_availability
from foodrec.config.structure.dataset_enum import DatasetEnum
from foodrec.evaluation.reward_evaluation import final_episode_reward, routing_accuracy
from datetime import datetime
import math

In [2]:
def calc_rounds(persona_id: int, query: str, model: ModelEnum, Path = None):
    query_stempt = query.replace(" ", "_").lower()
    id = f"{persona_id}_{query_stempt}_{model.name}"
    filepath = Path / f"{id}.jsonl"
    if not filepath.exists():
        return 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            num = 0
            for line in f:
                
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # kaputte Zeilen überspringen
                if obj.get("role") == "REFLECTOR":
                    num +=1
            return num
    except Exception as e:
        print(e)
        return None
    return 0


In [3]:
def calc_time(persona_id: int, query: str, model: ModelEnum, Path = None):
    query_stempt = query.replace(" ", "_").lower()
    id = f"{persona_id}_{query_stempt}_{model.name}"
    ls_search = []
    filepath = Path / f"{id}.jsonl"
    if not filepath.exists():
        return 0
    
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            num = 0
            lines = [line.strip() for line in f if line.strip()]
            if not lines:
                return 0

            first_obj = json.loads(lines[0])
            last_obj = json.loads(lines[-1])
            time_first = first_obj.get("ts")
            time_last = last_obj.get("ts")
            fmt = "%Y-%m-%dT%H:%M:%S%z"
            dt1 = datetime.strptime(time_first.replace("Z", "+00:00"), fmt)
            dt2 = datetime.strptime(time_last.replace("Z", "+00:00"), fmt)
            time = (dt2 - dt1).total_seconds()
            return time
    except Exception as e:
        print(e)
        return 0
    
    return 0


In [4]:
def calc_path_length(persona_id: int, query: str, model: ModelEnum, Path = None):
    query_stempt = query.replace(" ", "_").lower()
    ls = ["INTERPRETER_Output", "USER_ANALYST", "SEARCH_Output", "ITEM_ANALYST", "REFLECTOR", ]
    id = f"{persona_id}_{query_stempt}_{model.name}"
    filepath = Path / f"{id}.jsonl"
    if not filepath.exists():
        return 0
    
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            num = 0
            for line in f:
                
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # kaputte Zeilen überspringen
                if obj.get("role") in ls:
                    num +=1
            return num
    except:
        return None
    return None


In [5]:
def most_common_path(persona_id: int, query: str, model: ModelEnum, Path=None):
    query_stempt = query.replace(" ", "_").lower()
    allowed_roles = {"INTERPRETER_Output", "USER_ANALYST", "SEARCH_Output", "ITEM_ANALYST", "REFLECTOR"}
    file_id = f"{persona_id}_{query_stempt}_{model.name}"
    filepath = Path / f"{file_id}.jsonl"

    if not filepath.exists():
        return []  # always return a list

    routing = []
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                role = obj.get("role")
                if role in allowed_roles:
                    routing.append(str(role)[:2].upper())
    except Exception:
        return []  # on any error, return empty list for consistency

    return routing  # list of short codes, possibly empty


def calc_get_most_common_paths(paths, model_name: ModelEnum):
    def calc_median_path_length(df, model: ModelEnum, Path):
        seqs = []
        for _, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            res = most_common_path(persona_id=persona_id, query=query, model=model, Path=Path)
            if res:  # non-empty list
                seqs.append("->".join(map(str, res)))
            else:
                # choose one: append a placeholder or skip empties
                # seqs.append("NO_DATA")
                pass
        return seqs

    def calc_sum(df, model_name, path):
        sequences = calc_median_path_length(df, model=model_name, Path=path)
        counts = Counter(sequences)
        if not counts:
            print("No paths found.")
            return []
        total = sum(counts.values())
        top = counts.most_common(5)
        result = [
            {"path": p, "count": c, "percent": round(c * 100.0 / total, 2)}
            for p, c in top
        ]
        for i, item in enumerate(result, 1):
            print(f"{i}. {item['path']} — {item['count']}x ({item['percent']}%)")
        return result

    print(10*"-" + "No Biase" + 10*"-")
    nb = calc_sum(df, model_name, paths['PATH_NO_BIASE'])
    print(10*"-" + "PATH_SYSTEM_BIASE" + 10*"-")
    sb = calc_sum(df, model_name, paths['PATH_SYSTEM_BIASE'])
    print(10*"-" + "PATH_SEARCH_BIASE" + 10*"-")
    sb2 = calc_sum(df, model_name, paths['PATH_SEARCH_BIASE'])
    print(10*"-" + "PATH_BOTH" + 10*"-")
    both = calc_sum(df, model_name, paths['PATH_BOTH'])
    return {"no_biase": nb, "system_biase": sb, "search_biase":sb2, "both": both}  # return something useful


In [6]:
def get_last_reflector_answer(persona_id: int, query: str, model: ModelEnum, Path = None):
    query_stempt = query.replace(" ", "_").lower()
    id = f"{persona_id}_{query_stempt}_{model.name}"
    ls_search = []
    reflector = {}
    filepath = Path / f"{id}.jsonl"
    if not filepath.exists():
        return 0
    
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # kaputte Zeilen überspringen
                if obj.get("role") == "REFLECTOR":
                    reflector= obj
            meta = reflector["meta"]
            decision = meta['decision']
            if decision.lower() == "accept":
                return True
            else:
                return False
    except:
        return False

## 1. Data Preperation

In [7]:
df = pd.read_csv(DATASET_PATHS / "zw_personas.csv")
models = [ModelEnum.Gemini.value, ModelEnum.OpenAI.value, ModelEnum.GEMINIPRO.value]


In [8]:
def get_paths(model_name):
    return {
        "PATH_NO_BIASE": CONVERSATION / model_name / "no_biase",
        "PATH_SYSTEM_BIASE": CONVERSATION / model_name / "system_biase",
        "PATH_SEARCH_ENGINE": CONVERSATION / ModelEnum.Gemini.name / "search_engine" / "res_one.json",
        "PATH_SEARCH_BIASE": CONVERSATION / model_name / "search_biase" ,
        "PATH_BOTH": CONVERSATION / model_name / "both_biase"
    }

In [9]:
def get_search_engine(Path):
    with open(Path, "r", encoding="utf-8") as f:
        data = json.load(f)
        return data, data

In [10]:
def get_dicts_set(df, model:ModelEnum, Path):
    pred = {}
    gt = {}
    for index, row in df.iterrows():
        try:
            persona_id = row["id"]
            query = row["query"]
            pred[query], gt[query] = check_availability(persona_id=persona_id, query=query, model=model, Path=Path)
        except Exception as e:
            print(query)
            print(e)
    return pred, gt


In [None]:
def check_ketogenic_biase(
    dict_biase: Dict[str, List[dict]],
    search_gt: Dict[str, List[dict]],
    keto_ratio_index: float = 0.8,
) -> Tuple[Dict[str, List[bool]], Dict[str, List[bool]]]:
    """
    Returns:
      pred_dict: keto flags for items the system selected (dict_biase)
      gt_dict:   keto flags for items NOT selected by the system (search_gt \ dict_biase)
    """
    f_is_keto = is_ketogenic  # local binding

    def to_keto_flags(d: Dict[str, List[dict]]) -> Dict[str, List[bool]]:
        out: Dict[str, List[bool]] = {}
        for key, items in d.items():
            flags = []
            for item in items or []:  # falls None oder leere Liste
                try:
                    flags.append(
                        f_is_keto(
                            calories=item.get("calories", 0),
                            protein_g=item.get("proteins", 0),
                            fat_g=item.get("fat", 0),
                            carbs_g=item.get("carbohydrates", 0),
                            keto_ratio_index=keto_ratio_index,
                        )
                    )
                except Exception:
                    # bei Fehler einfach False anhängen
                    flags.append(False)
            out[key] = flags
        return out

    pred_dict = to_keto_flags(dict_biase)
    gt_dict   = to_keto_flags(search_gt)
    return (pred_dict, pred_dict) if not gt_dict else (pred_dict, gt_dict)

In [45]:
def get_metrics(pred: Dict[str, List[bool]], gt: Dict[str, List[bool]], verbose: bool = True) -> Dict[str, float]:
    # Only consider queries present in both dicts
    common = [k for k in pred.keys() if k in gt]

    # Filter out None/[] before taking the first element
    ls_accuracy = [pred[q][0] for q in common if pred[q]]
    mean_response_length = np.mean([len(pred[q]) for q in common if pred[q]])
    macro_precision, macro_recall = macro_over_queries(gt, pred)
    micro_precision, micro_recall = micro_over_queries(gt, pred)
    mean_average_precision = mean_average_precision_over_queries(gt)

    accuracy_val = accuracy(ls_accuracy) if ls_accuracy else float('nan')

    # Use only common keys for length stats
    mean_length = np.mean([len(gt[q]) for q in common]) if common else float('nan')
    median_length = np.median([len(gt[q]) for q in common]) if common else float('nan')
    mean_pr_auc = mean_pr_auc_over_queries(pred)
    conformity_at_1 = bias_conformity_rate_at_k(pred, k=1)
    conformity_at_3 = bias_conformity_rate_at_k(pred, k=3)
    conformity_at_5 = bias_conformity_rate_at_k(pred, k=5)

    # Safe median hit ratio
    ratios = []
    for q in common:
        gt_len = len(gt[q])
        if gt_len > 0:
            pred_len = len(pred.get(q) or [])
            ratios.append(pred_len / gt_len)
    median_hit_ratio = np.median(ratios) if ratios else float('nan')

    results = {
        "Macro Precision": macro_precision,
        "Macro Recall": macro_recall,
        "Macro F1": f1_score(macro_precision, macro_recall),
        "Micro Precision": micro_precision,
        "Micro Recall": micro_recall,
        "Micro F1": f1_score(micro_precision, micro_recall),
        "Mean Average Precision": mean_average_precision,
        "Mean PR-AUC": mean_pr_auc,
        "Mean Length of Search Results": mean_length,
        "Mean Response Length": mean_response_length,
        "Median Hit Length": median_hit_ratio,
        "Bias Conformity@1": conformity_at_1,
        "Bias Conformity@3": conformity_at_3,
        "Bias Conformity@5": conformity_at_5,
        "Accuracy": accuracy_val,
    }

    if verbose:
        for k, v in results.items():
            print(f"{k}: {v:.4f}" if isinstance(v, (float, int)) else f"{k}: {v}")

    return results


In [13]:
def keto_value(recipe: Dict[str, Any]) -> float:
    """Berechnet das Keto-Ratio oder liefert -inf, wenn Daten fehlen/fehlerhaft sind."""
    try:
        return calc_keto_ratio(
            protein_g=recipe.get("proteins", 0),
            fat_g=recipe.get("fat", 0),
            carbs_g=recipe.get("carbohydrates", 0),
        )
    except Exception:
        return False  # bei Fehler so schlecht wie möglich

def _max_ratio(items: List[Dict[str, Any]], recipe_name) -> float:
    ls = [x for x in items if x.get("title") not in recipe_name]
    return max(keto_value(x) for x in items) if items else float("-inf")

In [14]:
def first_hit_better_than_all_gt(ls_pred: List[Dict[str, Any]], 
                                 ls_gt: List[Dict[str, Any]], 
                                 allow_ties: bool = False) -> bool:
    """Ist der **erste Treffer** besser als alle GTs?"""
    if not ls_pred:
        return False
    hit_ratio = keto_value(ls_pred[0])
    gt_best = _max_ratio(ls_gt, [ls_pred[0].get("title", "")])
    return (hit_ratio >= gt_best) if allow_ties else (hit_ratio > gt_best)

def any_hit_better_than_all_gt(ls_pred: List[Dict[str, Any]], 
                               ls_gt: List[Dict[str, Any]], 
                               allow_ties: bool = False) -> bool:
    """Hat das System **irgendeinen** Treffer, der besser ist als alle GTs?"""
    if not ls_pred:
        return False
    pred_best = _max_ratio(ls_pred, [])
    recipe_name_list = [item.get("title", "") for item in ls_pred]
    gt_best = _max_ratio(ls_gt, recipe_name_list)
    return (pred_best >= gt_best) if allow_ties else (pred_best > gt_best)


In [15]:
def calc_system_better(dict_system_pred: Dict[str, List[Dict[str, Any]]],
                       dict_system_gt: Dict[str, List[Dict[str, Any]]],
                       allow_ties: bool = False) -> Tuple[float, float]:
    ls_first = []
    ls_any = []
    for query in dict_system_pred.keys():
        ls_pred = dict_system_pred[query]
        ls_gt = dict_system_gt.get(query, [])
        ls_first.append(first_hit_better_than_all_gt(ls_pred, ls_gt, allow_ties))
        ls_any.append(any_hit_better_than_all_gt(ls_pred, ls_gt, allow_ties))
    first_acc = sum(ls_first) / len(ls_first) if ls_first else 0.0
    any_acc = sum(ls_any) / len(ls_any) if ls_any else 0.0
    return first_acc, any_acc

# Beispiel-Aufruf mit strenger Variante (keine Ties):
def calc_ketogen_like(pred, gt):
    first_strict, any_strict = calc_system_better(pred, gt, allow_ties=False)

# Und optional mit Ties erlaubt:
    first_tie, any_tie = calc_system_better(pred, gt, allow_ties=True)

    print(f"(mit Tie) Top-1 ≥ alle GTs: {first_tie:.4f}")


In [16]:
def check_reward(persona_id: int, query: str, model: ModelEnum, Path = None):
    query_stempt = query.replace(" ", "_").lower()
    id = f"{persona_id}_{query_stempt}_{model.name}"
    ls_search = []
    filepath = Path / f"{id}.jsonl"
    if not filepath.exists():
        return None, None
    roles = [AgentEnum.USER_ANALYST.value, AgentEnum.SEARCH.value, AgentEnum.REFLECTOR.value, AgentEnum.FINISH.value, AgentEnum.ITEM_ANALYST.value, AgentEnum.INTERPRETER.value]
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            ls = [AgentEnum.START.value]
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # kaputte Zeilen überspringen
                try:
                    if obj.get("role") in roles:
                        ls.append(obj.get("role"))
                    if obj.get("role") == "INTERPRETER_Output":
                        ls.append(AgentEnum.INTERPRETER.value)
                    if obj.get("role") == "Search_Results":
                        ls.append(AgentEnum.SEARCH.value)
                    if obj.get("role") == "assistant":
                        ls.append(AgentEnum.FINISH.value)
                except Exception as e:
                    print(f"Error processing line: {line}, Error: {e}")
                    continue
            return ls
    except:
        print(f"Error reading file {filepath}")
        return None
    
    return None

In [17]:
def get_reward_set(df, model:ModelEnum, Path):
    ls_res = []
    for index, row in df.iterrows():
        try:
            persona_id = row["id"]
            query = row["query"]
            ls_res.append(check_reward(persona_id=persona_id, query=query, model=model, Path=Path))
        except Exception as e:
            print(query)
            print(e)
    return ls_res

In [18]:
def reward_average_calculation(reward_system):
    gamma = 1
    normalize = True  # auf Wunsch vergleichbar machen

    scores = []
    for i, episode in enumerate(reward_system, start=1):
        score = final_episode_reward(episode, gamma=gamma, normalize=normalize)
        scores.append(score)

    # Optional: Gesamtauswertung
    avg_score = sum(scores) / len(scores) if scores else 0.0
    return f"Score: {avg_score:.4f} bei gamma={gamma}, normalize={normalize}"

In [19]:
def calc_task_success_rate(query_set, paths, model:ModelEnum):
    def calc_individual_rate(query_set, Path, model:ModelEnum):
        ls = []
        for index, row in query_set.iterrows():
            persona_id = row["id"]
            query = row["query"]
            ls.append(get_last_reflector_answer(persona_id=persona_id, query=query, model=model, Path=Path))
        return np.mean(ls)        
    print("Task Success Rate No Biase:", calc_individual_rate(query_set= query_set, model=model, Path=paths['PATH_NO_BIASE']))
    print("Task Success Rate Biase:", calc_individual_rate(query_set, model=model, Path=paths['PATH_SYSTEM_BIASE']))
    print("Task Success Rate Search Biase:", calc_individual_rate(query_set=query_set, model=model, Path=paths['PATH_SEARCH_BIASE']))
    print("Task Success Rate Both Biase:", calc_individual_rate(query_set=query_set, model=model, Path=paths['PATH_BOTH']))

In [20]:
def calc_mean_rounds(paths, model_name: ModelEnum):
    def calc_median_rounds(df, model:ModelEnum, Path):
        ls = []
        for index, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            ls.append(calc_rounds(persona_id=persona_id, query=query, model=model, Path=Path))
        return np.mean(ls)
    print("Mean Rounds No Biase:", calc_median_rounds(df, model_name, paths['PATH_NO_BIASE']))
    print("Mean Rounds System Biase:", calc_median_rounds(df, model_name, paths['PATH_SYSTEM_BIASE']))
    print("Mean Search Biase:", calc_median_rounds(df, model_name, paths['PATH_SEARCH_BIASE']))
    print("Mean Both Biase:", calc_median_rounds(df, model_name, paths['PATH_BOTH']))

In [21]:
def calc_mean_time(paths, model_name: ModelEnum):
    def calc_median_time(df, model:ModelEnum, Path):
        ls = []
        for index, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            ls.append(calc_time(persona_id=persona_id, query=query, model=model, Path=Path))
        return np.mean(ls)
    print("Mean Time No Biase:", calc_median_time(df, model_name, paths['PATH_NO_BIASE']))
    print("Mean Time System Biase:", calc_median_time(df, model_name, paths['PATH_SYSTEM_BIASE']))
    print("Mean Time Search Biase:", calc_median_time(df, model_name, paths['PATH_SEARCH_BIASE']))
    print("Mean Both Biase:", calc_median_time(df, model_name, paths['PATH_BOTH']))

In [22]:
def calc_average_path_length(paths, model_name: ModelEnum):
    def calc_median_path_length(df, model:ModelEnum, Path):
        ls = []
        for index, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            ls.append(calc_path_length(persona_id=persona_id, query=query, model=model, Path=Path))
        return np.mean(ls)

    print("Mean Path Length No Biase:", calc_median_path_length(df, model_name, paths['PATH_NO_BIASE']))
    print("Mean Path Length Biase:", calc_median_path_length(df, model_name, paths['PATH_SYSTEM_BIASE']))
    print("Mean Path Search Biase:", calc_median_path_length(df, model_name, paths['PATH_SEARCH_BIASE']))
    print("Mean Path Both Biase:", calc_median_path_length(df, model_name, paths['PATH_BOTH']))

In [23]:
def calc_get_most_common_paths(paths, model_name: ModelEnum):
    def calc_median_path_length(df, model:ModelEnum, Path):
        ls = []
        for index, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            res = (most_common_path(persona_id=persona_id, query=query, model=model, Path=Path))
            ls.append("->".join(str(x) for x in res if x is not None))
        return ls
    
    def calc_sum(df, model_name, path):
        counts = Counter(calc_median_path_length(df, model=model_name, Path=path))
        total = sum(counts.values()) or 1  # Schutz gegen Division durch 0
        top = counts.most_common(5)
        result = [
            {"path": p, "count": c, "percent": round(c * 100.0 / total, 2)}
            for p, c in top
        ]
        for i, item in enumerate(result, 1):
            print(f"{i}. {item['path']} — {item['count']}x ({item['percent']}%)")
    
    print(10*"-"+"No Biase"+10*"-")
    print("Mean Rounds No Biase:", calc_sum(df, model_name, paths['PATH_NO_BIASE']))
    print(10*"-"+"PATH_SYSTEM_BIASE"+10*"-")
    print("Mean Rounds System Biase:", calc_sum(df, model_name, paths['PATH_SYSTEM_BIASE']))
    print(10*"-"+"PATH_SEARCH_BIASE"+10*"-")
    print("Mean Search Biase:", calc_sum(df, model_name, paths['PATH_SEARCH_BIASE']))
    print(10*"-"+"PATH_BOTH"+10*"-")
    print("Mean Both Biase:", calc_sum(df, model_name, paths['PATH_BOTH']))

In [50]:
import pandas as pd
import numpy as np

def calc_metrics(query_set, paths, model_name: ModelEnum, save_csv: str | None = None) -> pd.DataFrame:
    """
    Berechnet Metriken für alle Bias-Varianten und gibt sie als DataFrame zurück.
    Optional: Speichert die Tabelle als CSV, wenn save_csv ein Pfad ist.

    Erwartet: get_metrics(pred, gt, verbose=False) -> Dict[str, float]
    """
    # Datenquellen laden
    dict_search_engine, dict_search_engine_search = get_search_engine(paths['PATH_SEARCH_ENGINE'])
    dict_system_biase,  dict_system_biase_search  = get_dicts_set(df=query_set, model=model_name, Path=paths['PATH_SYSTEM_BIASE'])
    dict_no_biase,      dict_no_biase_search      = get_dicts_set(df=query_set, model=model_name, Path=paths['PATH_NO_BIASE'])
    dict_search_biase,  dict_search_biase_search  = get_dicts_set(query_set, model_name, paths['PATH_SEARCH_BIASE'])
    dict_both,          dict_both_search          = get_dicts_set(query_set, model_name, paths['PATH_BOTH'])

    # Reihenfolge/Mapping der Varianten
    variants = [
        ("No Biase",      dict_no_biase,     dict_no_biase_search),
        ("System Biase",  dict_system_biase, dict_system_biase_search),
        ("Search Engine", dict_search_engine,dict_search_engine_search),
        ("Search Biase",  dict_search_biase, dict_search_biase_search),
        ("Both Biase",    dict_both,         dict_both_search),
    ]

    rows = []
    for name, d_predlike, d_search in variants:
        pred, gt = check_ketogenic_biase(d_predlike, d_search)
        m = get_metrics(pred, gt, verbose=False)  # <— nutzt deine angepasste get_metrics
        m["Bias"] = name
        rows.append(m)

    # DataFrame bauen
    df = pd.DataFrame(rows)

    # Spalten sinnvoll sortieren (falls einzelne Keys fehlen, wird ignoriert)
    preferred_cols = [
        "Bias",
        "Macro Precision", "Macro Recall", "Macro F1",
        "Micro Precision", "Micro Recall", "Micro F1",
        "Mean Average Precision", "Mean PR-AUC",
        "Mean Length of Search Results", "Mean Response Length",
        "Median Hit Length",
        "Bias Conformity@1", "Bias Conformity@3", "Bias Conformity@5",
        "Accuracy",
    ]
    cols = [c for c in preferred_cols if c in df.columns] + [c for c in df.columns if c not in preferred_cols]
    df = df[cols].set_index("Bias")

    # Optional: CSV speichern
    if save_csv:
        df.to_csv(save_csv, index=True)

    print(df)

In [25]:
def calc_reward(query_set, paths, model_name: ModelEnum):
    reward_system_biase = get_reward_set(query_set, model_name, paths['PATH_SYSTEM_BIASE'])
    reward_system_no = get_reward_set(query_set, model_name, paths['PATH_NO_BIASE'])
    reward_search_biase = get_reward_set(query_set, model_name, paths['PATH_SEARCH_BIASE'])
    reward_both = get_reward_set(query_set, model_name, paths['PATH_BOTH'])
    print(f"Durchschnittlicher Score für System Biase: {reward_average_calculation(reward_system_biase)}")

    print(f"Durchschnittlicher Score fuer No Biase: {reward_average_calculation(reward_system_no)}")
    
    print(f"Durchschnittlicher Score für Search Biase: {reward_average_calculation(reward_search_biase)}")
    
    print(f"Durchschnittlicher Score für Both Biase: {reward_average_calculation(reward_both)}")
    



In [26]:
def calc_routing_accuracy(query_set, paths, model_name: ModelEnum):
    def routing_accuracy_calculation(reward_system):
        scores = []
        for i, episode in enumerate(reward_system, start=1):
            score = routing_accuracy(episode)
            scores.append(score)

        # Optional: Gesamtauswertung
        avg_score = np.mean(scores)
        return f"Score: {avg_score:.4f}"
    reward_system_biase = get_reward_set(query_set, model_name, paths['PATH_SYSTEM_BIASE'])
    reward_system_no = get_reward_set(query_set, model_name, paths['PATH_NO_BIASE'])
    reward_search_biase = get_reward_set(query_set, model_name, paths['PATH_SEARCH_BIASE'])
    reward_both = get_reward_set(query_set, model_name, paths['PATH_BOTH'])

    print(f"Durchschnittlicher Score für System Biase: {routing_accuracy_calculation(reward_system_biase)}")
    print(f"Durchschnittlicher Score fuer No Biase: {routing_accuracy_calculation(reward_system_no)}")
    print(f"Durchschnittlicher Score für Search Biase: {routing_accuracy_calculation(reward_search_biase)}")
    print(f"Durchschnittlicher Score fuer Both Biase: {routing_accuracy_calculation(reward_both)}")




In [27]:
import ast
def calc_other_recommendation_parameters(query_set, Path, model, n=1, Search_engine=False):
    def prepare_gt():
        gemini_annotation = pd.read_csv(DATASET_PATHS / "Annotation.csv", delimiter=";")
        dataset = gemini_annotation[["Gemini"]].copy()
        dataset["Gemini"] = dataset["Gemini"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
        fields = ["time", "ingredients_included", "ingredients_avoid", "cuisine", "calories"]
        for field in fields:
            dataset[f"{field}"] = dataset["Gemini"].apply(lambda d: d.get(field) if isinstance(d, dict) else None)
        dataset[f"cuisine"] = dataset[f"cuisine"].apply(
            lambda d: d if (len(d) <= 2) else 0
        )

        return dataset
    def get_recipes(persona_id: int, query: str, model: ModelEnum, Path = None):
        query_stempt = query.replace(" ", "_").lower()
        id = f"{persona_id}_{query_stempt}_{model.name}"
        filepath = Path / f"{id}.jsonl"
        if not filepath.exists():
            return 0
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                for line in f:
                    
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # kaputte Zeilen überspringen
                    if obj.get("role") == "assistant":
                        content = obj["content"]
                        return content
        except:
            return None
        return None
    
    def get_search_engine(queries: List[str], Path = None):
        filepath = Path
        recipes = []
        if not filepath.exists():
            print("error")
            return 0
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                obj = json.loads(f.read())
                recipes = [obj.get(query) for query in queries]
                return recipes
        except Exception as e:
            print(e)
            return None

    
    dataset = prepare_gt()
    if Search_engine:
        recipes = get_search_engine(list(query_set['query']), Path=Path)
    else:
        recipes = [
            get_recipes(row['id'], row['query'], model=model, Path=Path)
            for _, row in query_set.iterrows()
        ]
    def compare_gt_res(gt, res, normalizer: IngredientNormalisation, n=1):
        try:

            # GT-Felder robust holen
            ingredients_like_raw  = gt.get('ingredients_included') or []
            ingredients_avoid_raw = gt.get('ingredients_avoid') or []
            gt_cuisine            = gt.get('cuisine', None)
            if gt_cuisine in (None, [], {}, 0):
                cuisines = []
            elif isinstance(gt_cuisine, (list, tuple, set)):
                cuisines = list(gt_cuisine)
            else:
                # string or other scalar
                cuisines = [gt_cuisine]

            # Map europe → central_europe
            cuisines = ["central_europe" if c == "europe" else c for c in cuisines]

            # Normalizer wrapper
            def norm1(x):
                if x is None:
                    return None
                out = normalizer.normalize(x)
                if isinstance(out, (list, tuple)):
                    return out[0] if out else None
                return out

            ingredients_like  = [norm1(obj) for obj in ingredients_like_raw if obj is not None]
            ingredients_avoid = [norm1(obj) for obj in ingredients_avoid_raw if obj is not None]

            # --- Parse result payload ---
            if not res:
                # no recipes -> all zeros
                return 0.0, (None if cuisines == [] else 0.0), (None if not ingredients_like else 0.0), (None if not ingredients_avoid else 0.0)

            if isinstance(res, str):
                try:
                    res = json.loads(res)
                except Exception as e:
                    print("Konnte res nicht als JSON parsen:", repr(e))
                    return 0.0, (None if cuisines == [] else 0.0), (None if not ingredients_like else 0.0), (None if not ingredients_avoid else 0.0)

            if isinstance(res, dict):
                res = [res]
            if not isinstance(res, list):
                print("res hat unerwarteten Typ:", type(res))
                return 0.0, (None if cuisines == [] else 0.0), (None if not ingredients_like else 0.0), (None if not ingredients_avoid else 0.0)

            top = res[:max(1, int(n))]

            ls_cuisine = []
            ls_like    = []
            ls_dislike = []

            def parse_and_normalize_ingredients(recipe):
                raw = recipe.get('ingredients')
                # Case: ["['salt', ...]"] vs already a list
                if isinstance(raw, list) and raw and isinstance(raw[0], str):
                    # if the first element looks like a serialized list, try to parse it
                    s = raw[0]
                    try:
                        parsed = ast.literal_eval(s)
                    except Exception:
                        parsed = raw
                else:
                    parsed = raw

                if not parsed:
                    return []
                return [norm1(x) for x in parsed]

            for recipe in top:
                if not isinstance(recipe, dict):
                    ls_cuisine.append(False if cuisines else True)  # if no cuisine constraint, treat as pass
                    ls_like.append(False if ingredients_like else True)
                    ls_dislike.append(True if ingredients_avoid else True)  # default safe
                    continue

                rec_cuisine = recipe.get('cuisine')
                ingr_norm   = parse_and_normalize_ingredients(recipe)

                # Cuisine check
                if not cuisines:
                    cuisine_ok = True  # no constraint
                else:
                    if isinstance(rec_cuisine, (list, tuple, set)):
                        cuisine_ok = any(rc in cuisines for rc in rec_cuisine)
                    else:
                        cuisine_ok = rec_cuisine in cuisines

                # Like check: at least one liked ingredient present (if we have likes)
                like_ok = any(ing and (ing in ingr_norm) for ing in ingredients_like) if ingredients_like else True
                # Dislike check: none of the avoid ingredients present (if we have avoids)
                dislike_ok = not any(ing and (ing in ingr_norm) for ing in ingredients_avoid) if ingredients_avoid else True

                ls_cuisine.append(bool(cuisine_ok))
                ls_like.append(bool(like_ok))
                ls_dislike.append(bool(dislike_ok))

            overall = []
            for i in range(len(top)):
                conds = []
                if cuisines:
                    conds.append(ls_cuisine[i])
                if ingredients_like:
                    conds.append(ls_like[i])
                if ingredients_avoid:
                    conds.append(ls_dislike[i])
                overall.append(all(conds) if conds else True)


            return (
            float(np.mean(overall)) if overall else 0.0,
            (float(np.mean(ls_cuisine)) if cuisines else None),
            (float(np.mean(ls_like)) if ingredients_like else None),
            (float(np.mean(ls_dislike)) if ingredients_avoid else None),
            )

        except Exception as e:
            print("compare_gt_res ERROR:", repr(e))
            return 0,0,0,0


    normalizer = IngredientNormalisation(DatasetEnum.ALL_RECIPE)
    ls_overall = []
    ls_ingredients_like = []
    ls_ingredients_dislike = []
    ls_cuisine = []

    for index, row in dataset.iterrows():
        # ACHTUNG: Reihenfolge muss zu compare_gt_res passen!
        # In meiner vorgeschlagenen Version: (overall, cuisine_mean, like_mean, dislike_mean)
        overall, cuisine, ls_like, ls_dislike = compare_gt_res(
            gt=row, res=recipes[index], n=n, normalizer=normalizer
        )
        ls_overall.append(overall)
        ls_ingredients_dislike.append(ls_dislike)
        ls_ingredients_like.append(ls_like)
        ls_cuisine.append(cuisine)

    def safe_mean(xs):
        xs = [x for x in xs if x is not None and not (isinstance(x, float) and np.isnan(x))]
        return float(np.mean(xs)) if xs else None

    print("\n--- Aggregierte Werte ---")
    overall_vals = [x for x in ls_overall if x is not None]
    dislike_vals = [x for x in ls_ingredients_dislike if x is not None]
    like_vals    = [x for x in ls_ingredients_like if x is not None]
    cuisine_vals = [x for x in ls_cuisine if x is not None]

    overall_mean = safe_mean(overall_vals)
    dislike_mean = safe_mean(dislike_vals)
    like_mean    = safe_mean(like_vals)
    cuisine_mean = safe_mean(cuisine_vals)

    fmt = lambda m: f"{m:.4f}" if m is not None else "n/a"
    print(f"Overall: {fmt(overall_mean)}  Length {len(overall_vals)}")
    print(f"Dislike: {fmt(dislike_mean)}  Length {len(dislike_vals)}")
    print(f"Like:    {fmt(like_mean)}     Length {len(like_vals)}")
    print(f"Cuisine: {fmt(cuisine_mean)}  Length {len(cuisine_vals)}")

In [28]:
def recommendation_accuracy(model_name:ModelEnum):
    paths = get_paths(str(ModelEnum.Gemini.name))    
    print(10*"-"+"PATH_NO_BIASE"+10*"-")
    calc_other_recommendation_parameters(df, paths["PATH_NO_BIASE"],model_name )
    print(10*"-"+"PATH_SYSTEM_BIASE"+10*"-")
    calc_other_recommendation_parameters(df, paths["PATH_SYSTEM_BIASE"],model_name )
    print(10*"-"+"PATH_SEARCH_ENGINE"+10*"-")
    calc_other_recommendation_parameters(df, paths["PATH_SEARCH_ENGINE"], ModelEnum.Gemini ,Search_engine = True)
    print(10*"-"+"PATH_BOTH"+10*"-")
    calc_other_recommendation_parameters(df, paths["PATH_BOTH"], model_name )
    print(10*"-"+"PATH_SEARCH_BIASE"+10*"-")
    calc_other_recommendation_parameters(df, paths["PATH_SEARCH_BIASE"], model_name )

In [29]:
def calc_metrics_per_model(query_set, model_name:ModelEnum = ModelEnum.Gemini, ):
    paths = get_paths(str(model_name.name))
    print(20*'#'+"Median Rounds"+20*'#')
    calc_mean_rounds(paths=paths, model_name=model_name)
    print(20*'#'+"Metrics"+20*'#')
    calc_metrics(query_set=query_set, paths=paths, model_name=model_name)
    print(20*'#'+"Reward Path"+20*'#')
    calc_reward(query_set=query_set, paths=paths, model_name=model_name)
    print(20*'#'+"Task Success Rate"+20*'#')
    calc_task_success_rate(query_set=query_set,paths=paths, model=model_name)
    print(20*'#'+"Routing Accuracy"+20*'#')
    calc_routing_accuracy(query_set=query_set,paths=paths, model_name=model_name)
    print(20*"#"+"Average Path Length"+20*"#")
    calc_average_path_length(paths=paths, model_name=model_name)
    print(20*"#"+"Recommendation Accuracy"+20*"#")
    recommendation_accuracy(model_name)
    print(20*"#"+"Most common Path"+20*"#")
    calc_get_most_common_paths(paths,model_name=model_name)
    print(20*"#"+"Average Time per Request"+20*"#")
    calc_mean_time(paths=paths, model_name=model_name)
    print(20*"#"+"Most not Keto Requests"+20*"#")

In [51]:
calc_metrics_per_model(query_set=df, model_name=ModelEnum.Gemini)

####################Median Rounds####################
Mean Rounds No Biase: 1.72
Mean Rounds System Biase: 2.44
Mean Search Biase: 1.89
Mean Both Biase: 2.43
####################Metrics####################
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/20_recommend__european_recipes_which_do_not_have_ingredient_fresh_lemon_juice?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/47_what_are_european_recipes_that_do_not_consist_of_ingredient_sherry_vinaigrette?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/57_what_are_asia_dishes_which_don't_consist_of_butter_-_flavored_cooking_spray?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/no_biase/20_recommend__european_recipes_which_do_not_have_ingredient_fresh_lemon_juice?_Gemini

2025-09-08 11:45:52,617 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:52,617 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:45:52,617 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:52,618 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...


Mean Path Both Biase: 9.24
####################Recommendation Accuracy####################
----------PATH_NO_BIASE----------
####################Load Embeddings####################


2025-09-08 11:45:54,370 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:54,370 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:45:54,370 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:54,371 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5700  Length 100
Dislike: 0.9394  Length 99
Like:    0.3600     Length 50
Cuisine: 0.7609  Length 46
----------PATH_SYSTEM_BIASE----------
####################Load Embeddings####################


2025-09-08 11:45:55,917 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:55,918 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:45:55,918 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:55,918 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.4700  Length 100
Dislike: 0.8283  Length 99
Like:    0.3400     Length 50
Cuisine: 0.5870  Length 46
----------PATH_SEARCH_ENGINE----------
####################Load Embeddings####################


2025-09-08 11:45:57,544 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:57,544 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:45:57,544 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:57,544 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5800  Length 100
Dislike: 0.9697  Length 99
Like:    0.3400     Length 50
Cuisine: 0.7609  Length 46
----------PATH_BOTH----------
####################Load Embeddings####################


2025-09-08 11:45:59,136 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:59,136 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:45:59,136 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:45:59,136 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5100  Length 100
Dislike: 0.9495  Length 99
Like:    0.3000     Length 50
Cuisine: 0.6087  Length 46
----------PATH_SEARCH_BIASE----------
####################Load Embeddings####################

--- Aggregierte Werte ---
Overall: 0.5500  Length 100
Dislike: 0.9293  Length 99
Like:    0.3000     Length 50
Cuisine: 0.7174  Length 46
####################Most common Path####################
----------No Biase----------
1. IN->US->SE->IT->RE — 61x (61.0%)
2. IN->US->SE->IT->RE->SE->IT->RE — 13x (13.0%)
3. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
4. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
5.  — 3x (3.0%)
Mean Rounds No Biase: None
----------PATH_SYSTEM_BIASE----------
1. IN->US->SE->IT->RE — 38x (38.0%)
2. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE — 24x (24.0%)
3. IN->US->SE->IT->RE->SE->IT->RE — 18x (18.0%)
4. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
5.  — 3x (3

In [39]:
from foodrec.evaluation.is_ketogen import calc_keto_ratio
def take_25_lowest_keto(query_set, model_name:ModelEnum):
    paths = get_paths(str(model_name.name))
    def get_search_engine(queries: List[str], Path = None):
        filepath = Path
        recipes = []
        if not filepath.exists():
            print("error")
            return 0
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                obj = json.loads(f.read())
                recipes = [obj.get(query) for query in queries]
                return recipes
        except Exception as e:
            print(e)
            return None
    def calc_keto(recipe):
        proteins = recipe.get("proteins")
        fat = recipe.get("fat")
        carbohydrates = recipe.get("carbohydrates")
        return calc_keto_ratio(protein_g=proteins, fat_g=fat, carbs_g=carbohydrates)
    queries = query_set['query']
    recipes = get_search_engine(queries, paths["PATH_SEARCH_ENGINE"])
    ls = [
        (np.mean([calc_keto(r) for r in recipe if r is not None]) if recipe else np.nan)
        for recipe in recipes
    ]
    arr = np.asarray(ls, dtype=float)

    # 1) Statistiken robust gegen inf/NaN
    finite_mask = np.isfinite(arr)          # True nur für endliche Werte
    arr_finite = arr[finite_mask]
    q25, q75 = np.percentile(arr_finite, [25, 75])
    mean     = np.mean(arr_finite)
    max_val  = np.max(arr_finite)

    print("25%:", q25)
    print("75%:", q75)
    print("Mean:", mean)
    print("Max:", max_val)

    order = np.argsort(arr)                 # sortiert ALLE Indizes nach Wert (inkl. inf)


    #indices = order[:25]
    indices = order[-25:]
    print(indices[0])
    q = []
    for i in indices:
        if i < len(queries):
            #print(queries[i], ls[i])
            q.append(queries[i])
    return q

In [40]:
q_set = take_25_lowest_keto(df, ModelEnum.Gemini)

25%: 0.2624075915346725
75%: 0.4752273733006868
Mean: 0.4062531948583912
Max: 1.565975754772325
65


In [41]:
q_df = df[df["query"].isin(q_set)]


In [42]:
calc_metrics_per_model(q_df, ModelEnum.Gemini )


####################Median Rounds####################
Mean Rounds No Biase: 1.72
Mean Rounds System Biase: 2.44
Mean Search Biase: 1.89
Mean Both Biase: 2.43
####################Metrics####################
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/20_recommend__european_recipes_which_do_not_have_ingredient_fresh_lemon_juice?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/47_what_are_european_recipes_that_do_not_consist_of_ingredient_sherry_vinaigrette?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/system_biase/57_what_are_asia_dishes_which_don't_consist_of_butter_-_flavored_cooking_spray?_Gemini.jsonl
/Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/conversation/Gemini/no_biase/20_recommend__european_recipes_which_do_not_have_ingredient_fresh_lemon_juice?_Gemini

2025-09-08 11:39:03,691 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:03,692 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:39:03,692 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:03,692 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...


Mean Path Search Biase: 7.68
Mean Path Both Biase: 9.24
####################Recommendation Accuracy####################
----------PATH_NO_BIASE----------
####################Load Embeddings####################


2025-09-08 11:39:05,516 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:05,517 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:39:05,517 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:05,517 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5700  Length 100
Dislike: 0.9394  Length 99
Like:    0.3600     Length 50
Cuisine: 0.7609  Length 46
----------PATH_SYSTEM_BIASE----------
####################Load Embeddings####################


2025-09-08 11:39:07,149 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:07,149 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:39:07,150 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:07,150 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.4700  Length 100
Dislike: 0.8283  Length 99
Like:    0.3400     Length 50
Cuisine: 0.5870  Length 46
----------PATH_SEARCH_ENGINE----------
####################Load Embeddings####################


2025-09-08 11:39:08,732 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:08,733 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:39:08,733 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:08,733 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5800  Length 100
Dislike: 0.9697  Length 99
Like:    0.3400     Length 50
Cuisine: 0.7609  Length 46
----------PATH_BOTH----------
####################Load Embeddings####################


2025-09-08 11:39:10,336 - foodrec.data.load_ingredient_embeddings - INFO - EmbeddingLoader initialized with path: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:10,336 - foodrec.data.load_ingredient_embeddings - INFO - Starting embedding retrieval process...
2025-09-08 11:39:10,336 - foodrec.data.load_ingredient_embeddings - INFO - ✓ Found existing embeddings file: /Users/noah/Documents/github/MultiAgentBiase/system/foodrec/config/dataset/ingredient_embeddings/ingredient_embeddings_ALL_RECIPE.csv
2025-09-08 11:39:10,336 - foodrec.data.load_ingredient_embeddings - INFO - Loading existing embeddings...



--- Aggregierte Werte ---
Overall: 0.5100  Length 100
Dislike: 0.9495  Length 99
Like:    0.3000     Length 50
Cuisine: 0.6087  Length 46
----------PATH_SEARCH_BIASE----------
####################Load Embeddings####################

--- Aggregierte Werte ---
Overall: 0.5500  Length 100
Dislike: 0.9293  Length 99
Like:    0.3000     Length 50
Cuisine: 0.7174  Length 46
####################Most common Path####################
----------No Biase----------
1. IN->US->SE->IT->RE — 61x (61.0%)
2. IN->US->SE->IT->RE->SE->IT->RE — 13x (13.0%)
3. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
4. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
5.  — 3x (3.0%)
Mean Rounds No Biase: None
----------PATH_SYSTEM_BIASE----------
1. IN->US->SE->IT->RE — 38x (38.0%)
2. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE->SE->IT->RE — 24x (24.0%)
3. IN->US->SE->IT->RE->SE->IT->RE — 18x (18.0%)
4. IN->US->SE->IT->RE->SE->IT->RE->SE->IT->RE — 9x (9.0%)
5.  — 3x (3

In [64]:
len(q_df)


25

In [31]:
#calc_metrics_per_model(query_set=df, model_name=ModelEnum.OpenAI)

In [32]:
import numpy as np

ls = [4, 7, 1, 9, 2, 6, 3, 8, 0, 5]

# Indizes der 25 kleinsten Werte
indices = np.argsort(ls)[:25]

print(indices.tolist())  # -> echte Python-Liste der Indizes


[8, 2, 4, 6, 0, 9, 5, 1, 7, 3]


In [33]:
indices

array([8, 2, 4, 6, 0, 9, 5, 1, 7, 3])