# Evaluation of the Multi-Agent System

This notebook evaluates the Multi-Agent System.

In [1]:
import pandas as pd 
import numpy as np 
from foodrec.config.structure.dataset_enum import ModelEnum 
from foodrec.evaluation.create_dataset import create_dataset
from foodrec.evaluation.is_ketogen import is_ketogenic, calc_keto_ratio
from foodrec.config.structure.paths import CONVERSATION, DATASET_PATHS
import json
from foodrec.evaluation.metrics.metrics import macro_over_queries,filter_search, micro_over_queries, accuracy, f1_score, mean_average_precision_over_queries
from foodrec.data.all_recipe import AllRecipeLoader
from typing import Dict, List, Any, Tuple
from foodrec.agents.agent_names import AgentEnum

In [2]:
def check_availability(persona_id: int, query: str, model: ModelEnum, Path = None):
    """Load the final answer and the search results of one logged conversation.

    The log file is ``<persona_id>_<query>_<model.name>.jsonl`` inside ``Path``,
    where spaces in the query are replaced by underscores and the text is
    lower-cased. Lines are scanned in order: the longest ``meta`` list among
    the ``"Search_Results"`` entries seen so far is remembered, and the first
    ``"assistant"`` entry ends the scan.

    Args:
        persona_id: Persona identifier used in the file name.
        query: Original query text used in the file name.
        model: Enum member whose ``name`` is used in the file name.
        Path: Directory holding the conversation logs (``pathlib.Path``-like).
            Required in practice: the default ``None`` cannot be joined with
            a file name.

    Returns:
        A 2-tuple ``(content, search_results)``: the assistant message content
        and the search results after ``filter_search``. ``(None, None)`` is
        returned when the file is missing, cannot be read, or contains no
        assistant entry, so callers can always unpack the result.
    """
    query_slug = query.replace(" ", "_").lower()
    filepath = Path / f"{persona_id}_{query_slug}_{model.name}.jsonl"
    if not filepath.exists():
        print(filepath)
        return None, None

    search_results = []
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    print(line)
                    continue  # skip corrupt lines
                try:
                    role = obj.get("role")
                    if role == "assistant":
                        return obj.get("content"), filter_search(search_results)
                    if role == "Search_Results":
                        meta = obj.get("meta", [])
                        # Keep the longest result list logged so far.
                        if meta is not None and len(meta) > len(search_results):
                            search_results = meta
                except Exception as e:
                    print(f"Error processing line: {line}, Error: {e}")
                    continue
    except (OSError, UnicodeError):
        # Only I/O and decoding problems are expected here; anything else
        # (e.g. KeyboardInterrupt) should not be swallowed.
        print(f"Error reading file {filepath}")

    # No assistant entry found (or the file was unreadable).
    return None, None


def calc_rounds(persona_id: int, query: str, model: ModelEnum, Path = None):
    """Count the ``"REFLECTOR"`` entries in one logged conversation.

    The log file is ``<persona_id>_<query>_<model.name>.jsonl`` inside ``Path``,
    where spaces in the query are replaced by underscores and the text is
    lower-cased.

    Args:
        persona_id: Persona identifier used in the file name.
        query: Original query text used in the file name.
        model: Enum member whose ``name`` is used in the file name.
        Path: Directory holding the conversation logs (``pathlib.Path``-like).
            Required in practice: the default ``None`` cannot be joined with
            a file name.

    Returns:
        The number of lines whose ``role`` is ``"REFLECTOR"``, or ``None`` when
        the file does not exist or cannot be read.
    """
    query_slug = query.replace(" ", "_").lower()
    filepath = Path / f"{persona_id}_{query_slug}_{model.name}.jsonl"
    if not filepath.exists():
        return None

    rounds = 0
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip corrupt lines
                # Valid JSON that is not an object (list, number, ...) has no
                # role; skip it instead of discarding the whole count.
                if isinstance(obj, dict) and obj.get("role") == "REFLECTOR":
                    rounds += 1
    except (OSError, UnicodeError):
        return None

    return rounds


## How Many Ketogenic Recipes Are in AllRecipe?

In [3]:
# Load the AllRecipe dataset through the project loader.
AL = AllRecipeLoader()
dataset = AL.load_dataset()

In [4]:
# Preview the first rows of the loaded dataset.
dataset.head()

Unnamed: 0,recipe_href,recipe_name,description,rating_rate,ingredients_normalized,tutorial,cooking_time,protein,carbohydrates,fat,kcal
0,http://allrecipes.com/recipe/1-2-3-cheddar-bro...,1-2-3 Cheddar Broccoli Casserole,'Serve this casserole over hot baked potatoes ...,5.0,"['fig spread', 'broccoli floret', 'cheddar che...",Preheat oven to 350 degrees F. Combine Double ...,25.0,3.16087,6.37818,6.94262,97
3,http://allrecipes.com/recipe/1-2-3-chicken-cac...,1-2-3 Chicken Cacciatore,"'For this tasty and easy cacciatore, chicken i...",3.0,"['olive oil', 'chicken leg', 'pasta sauce']",Heat oil in 12-inch skillet over medium-high h...,60.0,10.2273,4.48565,5.41866,109
5,http://allrecipes.com/recipe/1-2-3-jambalaya/d...,1-2-3 Jambalaya,'Enjoy some Creole comfort the quick and easy ...,4.0,"['olive oil', 'worcestershire sauce', 'fresh p...","In a bowl, combine sausage, shrimp or chicken ...",30.0,8.43391,15.6766,8.67215,179
8,http://allrecipes.com/recipe/1-dish-chicken-pa...,1-Dish Chicken Parmesan,"'Ready to serve in less than an hour, this one...",4.0,"['sugar', 'flour', 'salt', 'olive oil', 'spagh...",Mix batter ingredients together in a pre-spray...,45.0,11.5418,16.193,7.10598,177
9,http://allrecipes.com/recipe/1-dish-pepperoni-...,1-Dish Pepperoni Cheese Pizza Bake,'Pizza was never easier than this--spread the ...,4.0,"['sugar', 'flour', 'salt', 'olive oil', 'shred...",Mix batter ingredients in a pre-sprayed 9-1/2-...,50.0,10.769,22.6643,10.5579,232


In [5]:
# Flag every recipe with is_ketogenic (keto_ratio_index=0.8) and store its keto ratio.
# NOTE(review): the next cell recomputes 'ketogenic' with keto_ratio_index=4 and
# overwrites the column written here.
dataset['ketogenic'] = dataset.apply(lambda x: is_ketogenic(protein_g=x['protein'], calories=x['kcal'], fat_g=x['fat'], carbs_g=x['carbohydrates'], keto_ratio_index=0.8), axis=1)
dataset['ketogenic_ratio'] = dataset.apply(lambda x: calc_keto_ratio(protein_g=x['protein'], fat_g=x['fat'], carbs_g=x['carbohydrates']), axis=1)

In [6]:
# Recompute the 'ketogenic' flag with keto_ratio_index=4; this replaces the
# values produced with keto_ratio_index=0.8 in the previous cell.
dataset['ketogenic'] = dataset.apply(lambda x: is_ketogenic(protein_g=x['protein'], calories=x['kcal'], fat_g=x['fat'], carbs_g=x['carbohydrates'], keto_ratio_index=4), axis=1)


In [7]:
# Count ketogenic vs. non-ketogenic recipes.
dataset['ketogenic'].value_counts()

ketogenic
False    40222
True       101
Name: count, dtype: int64

In [8]:
# Mean keto ratio; infinite ratios are replaced by NaN so that mean() skips them.
mean_ratio = dataset['ketogenic_ratio'].replace([float('inf'), -float('inf')], np.nan).mean()
print(mean_ratio)

0.45036018009561973


In [9]:
from foodrec.data.keto import KetoLoader
# Load the keto recipe dataset through the project loader.
KL = KetoLoader()
keto_dataset = KL.load_dataset()

In [10]:
# Display the keto dataset.
keto_dataset

Unnamed: 0,id,recipe,category,prep_time_in_minutes,prep_time_note,cook_time_in_minutes,cook_time_note,difficulty,serving,measurement_1,...,directions_step_7,directions_step_8,directions_step_9,directions_step_10,image,image_creative_commons,calories,fat_in_grams,carbohydrates_in_grams,protein_in_grams
0,1,Chia And Blackberry Pudding,Breakfast Recipes,45.0,,0.0,,Easy,2.0,0.25,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,437.0,38.0,8.0,8.0
1,2,Cinnamon Chiller,Breakfast Recipes,10.0,,0.0,,Easy,1.0,1.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,145.0,4.0,1.6,0.6
2,3,Cheesy Low-Carb Omelet,Breakfast Recipes,5.0,,5.0,,Easy,5.0,2.00,...,Cover and cook on LOW for 1 minute,Serve and enjoy!,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,451.0,36.0,3.0,33.0
3,4,Angel Eggs,Breakfast Recipes,30.0,,0.0,,Easy,2.0,4.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,184.0,15.0,1.0,12.0
4,5,Roasted Onions And Green Beans,Breakfast Recipes,10.0,,15.0,,Easy,6.0,1.00,...,Parboil the green beans for 3 to 5 minutes in ...,Drain it and serve the beans with baked onion ...,Serve warm and enjoy!,,https://s3.us-west-004.backblazeb2.com/encurat...,True,214.0,19.4,3.7,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,485,Hearty Papaya Drink,Drinks And Smoothies,5.0,,0.0,,Easy,2.0,1.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,192.0,7.0,16.0,3.0
484,486,A Minty Drink,Drinks And Smoothies,5.0,,0.0,,Easy,1.0,1.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,391.0,10.0,17.0,5.0
485,487,The Amazing Acai,Drinks And Smoothies,5.0,,0.0,,Easy,2.0,1.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,225.0,8.0,13.0,4.0
486,488,Fine Yo “Mama” Matcha,Drinks And Smoothies,5.0,,0.0,,Easy,2.0,2.00,...,,,,,https://s3.us-west-004.backblazeb2.com/encurat...,True,216.0,1.0,18.0,3.0


In [11]:
# Keto ratio per recipe of the keto dataset and its mean (infinite ratios are
# replaced by NaN so that mean() skips them).
# Note: this rebinds `mean_ratio`, which held the AllRecipe mean before.
keto_dataset['ketogenic_ratio'] = keto_dataset.apply(lambda x: calc_keto_ratio(protein_g=x['protein_in_grams'], fat_g=x['fat_in_grams'], carbs_g=x['carbohydrates_in_grams']), axis=1)
mean_ratio = keto_dataset['ketogenic_ratio'].replace([float('inf'), -float('inf')], np.nan).mean()
print(mean_ratio)

1.6999684590281274


In [12]:
# Show the keto ratio column of the AllRecipe dataset.
dataset['ketogenic_ratio']

0        0.727810
3        0.368292
5        0.359683
8        0.256212
9        0.315790
           ...   
58417    0.890338
58418    0.430437
58419    0.387804
58421    0.378641
58422    0.234354
Name: ketogenic_ratio, Length: 40323, dtype: float64

In [13]:
# Number of entries in the 'recipe_href' column.
len(list(dataset['recipe_href']))

40323

In [14]:
# Count ketogenic vs. non-ketogenic recipes (same query as the earlier cell).
dataset['ketogenic'].value_counts()

ketogenic
False    40222
True       101
Name: count, dtype: int64

## Data Preparation

In [15]:
# Persona table; later cells read the columns "id" and "qOriginText" from it.
df = pd.read_csv(DATASET_PATHS / "zw_personas.csv")


In [16]:
# Directories of the Gemini conversation logs, one sub-folder per experiment setup.
PATH_NO_BIASE = CONVERSATION / "Gemini" / "no_biase"
PATH_SYSTEM_BIASE = CONVERSATION / "Gemini" / "system_biase"
PATH_SEARCH_ENGINE = CONVERSATION / "Gemini" / "search_engine"
PATH_SEARCH_BIASE = CONVERSATION / "Gemini" / "search_biase"
PATH_BOTH = CONVERSATION / "Gemini" / "both_biase"

In [17]:
def get_search_engine(Path):
    """Read ``result.json`` from ``Path`` and return its content twice.

    The same parsed object is returned for both positions of the tuple, so it
    can be unpacked like the ``(pred, gt)`` pair of the other setups.
    """
    result_file = Path / "result.json"
    with open(result_file, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload, payload

In [18]:
def get_dicts_set(df, model:ModelEnum, Path):
    """Collect the ``check_availability`` results for every row of ``df``.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"qOriginText"``.
        model: Enum member forwarded to ``check_availability``.
        Path: Directory with the conversation logs.

    Returns:
        ``(pred, gt)``: two dicts keyed by query text. ``pred`` holds the first
        element returned by ``check_availability`` and ``gt`` the second.
        Rows whose processing raised are reported on stdout and skipped.
    """
    pred = {}
    gt = {}
    for _, row in df.iterrows():
        # Reset per row so the error handler never hits an unbound name or
        # reports the query of a previous row.
        query = None
        try:
            persona_id = row["id"]
            query = row["qOriginText"]
            pred[query], gt[query] = check_availability(persona_id=persona_id, query=query, model=model, Path=Path)
        except Exception as e:
            print(query)
            print(e)
    return pred, gt
# Load answers and search results for every setup. The search-engine baseline is
# read from a single result.json and the same data is used for both dicts.
dict_search_engine, dict_search_engine_search = get_search_engine(PATH_SEARCH_ENGINE)
dict_system_biase, dict_system_biase_search = get_dicts_set(df, ModelEnum.Gemini, PATH_SYSTEM_BIASE)
dict_no_biase, dict_no_biase_search = get_dicts_set(df, ModelEnum.Gemini, PATH_NO_BIASE)
dict_search_biase, dict_search_biase_search = get_dicts_set(df, ModelEnum.Gemini, PATH_SEARCH_BIASE)
dict_both, dict_both_search = get_dicts_set(df, ModelEnum.Gemini, PATH_BOTH)

In [19]:
def calc_median_rounds(df, model:ModelEnum, Path):
    """Return the mean number of reflector rounds over all rows of ``df``.

    Despite its name the function reports the arithmetic mean, not the
    median; the name is kept for existing callers.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"qOriginText"``.
        model: Enum member forwarded to ``calc_rounds``.
        Path: Directory with the conversation logs.

    Returns:
        Mean of the per-conversation round counts. Conversations for which
        ``calc_rounds`` returned ``None`` (missing or unreadable log) are
        left out; ``nan`` is returned when no count is available.
    """
    rounds = []
    for _, row in df.iterrows():
        rounds.append(
            calc_rounds(persona_id=row["id"], query=row["qOriginText"], model=model, Path=Path)
        )
    print(rounds)
    # np.mean cannot handle None entries, so drop them first.
    counted = [r for r in rounds if r is not None]
    return np.mean(counted) if counted else float("nan")

# Report the mean number of reflector rounds per setup (calc_median_rounds
# returns a mean and also prints the raw per-query counts).
print("Mean Rounds No Biase:", calc_median_rounds(df, ModelEnum.Gemini, PATH_NO_BIASE))
print("Mean Rounds System Biase:", calc_median_rounds(df, ModelEnum.Gemini, PATH_SYSTEM_BIASE))
print("Mean Search Biase:", calc_median_rounds(df, ModelEnum.Gemini, PATH_SEARCH_BIASE))
print("Mean Both Biase:", calc_median_rounds(df, ModelEnum.Gemini, PATH_BOTH))

[1, 1, 4, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 5, 2, 2, 1, 1, 1, 4, 2, 1, 1, 3, 4, 1, 1, 1, 2, 1, 1, 3, 5, 2, 1, 1, 5, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 1, 1, 5, 5, 3, 1, 2, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 2, 1, 1, 2, 5, 5, 5, 1, 1, 1, 1, 1, 4, 2, 5, 2, 1, 1]
Mean Rounds No Biase: 1.85
[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 1, 1, 1, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 3, 4, 2, 1, 2, 3, 5, 1, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 4, 1, 4, 1, 1, 2, 5, 3, 1, 1, 4, 4, 5, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 5, 1, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2]
Mean Rounds System Biase: 1.86
[1, 1, 2, 1, 2, 1, 1, 1, 5, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 4, 1, 1, 1, 2, 4, 1, 1, 1, 3, 1, 3, 2, 1, 2, 1, 3, 1, 1, 5, 1, 1, 1, 3, 5, 1, 1, 1, 5, 5, 5, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 4, 1, 1, 2, 4, 1, 1, 1, 5, 1, 1, 5]
Mean Search Biase: 1.78
[1, 1, 2, 1, 1,

In [20]:
def check_ketogenic_biase(
    dict_biase: Dict[str, List[dict]],
    search_gt: Dict[str, List[dict]],
    keto_ratio_index: float = 0.8,
) -> Tuple[Dict[str, List[bool]], Dict[str, List[bool]]]:
    """Convert recipe lists into ketogenic flags, query by query.

    Every recipe dict is evaluated with ``is_ketogenic`` using its
    ``"calories"``, ``"proteins"``, ``"fat"`` and ``"carbohydrates"`` entries
    (missing entries count as 0).

    Args:
        dict_biase: Recipes selected by the system, keyed by query.
        search_gt: Recipes returned by the search, keyed by query.
        keto_ratio_index: Threshold forwarded to ``is_ketogenic``.

    Returns:
        ``(pred_dict, gt_dict)``: keto flags for ``dict_biase`` and for
        ``search_gt``. When ``search_gt`` yields an empty dict, ``pred_dict``
        is returned in both positions.
    """

    def to_keto_flags(d: Dict[str, List[dict]]) -> Dict[str, List[bool]]:
        out: Dict[str, List[bool]] = {}
        for key, items in d.items():
            flags = []
            for item in items or []:  # tolerate None or an empty list
                try:
                    flags.append(
                        is_ketogenic(
                            calories=item.get("calories", 0),
                            protein_g=item.get("proteins", 0),
                            fat_g=item.get("fat", 0),
                            carbs_g=item.get("carbohydrates", 0),
                            keto_ratio_index=keto_ratio_index,
                        )
                    )
                except Exception:
                    # A recipe that cannot be evaluated counts as not ketogenic.
                    flags.append(False)
            out[key] = flags
        return out

    pred_dict = to_keto_flags(dict_biase)
    gt_dict = to_keto_flags(search_gt)
    return (pred_dict, pred_dict) if not gt_dict else (pred_dict, gt_dict)

In [22]:
# Turn the recipe lists of every setup into keto flags (default keto_ratio_index=0.8).
system_biase_pred, system_biase_gt = check_ketogenic_biase(dict_system_biase, dict_system_biase_search)
no_biase_pred, no_biase_gt = check_ketogenic_biase(dict_no_biase, dict_no_biase_search)
search_engine_pred, search_engine_gt = check_ketogenic_biase(dict_search_engine, dict_search_engine_search)
search_biase_pred, search_biase_gt = check_ketogenic_biase(dict_search_biase, dict_search_biase_search)
both_pred, both_gt = check_ketogenic_biase(dict_both, dict_both_search)

## Recall / F1 / Macro Precision / MAP / Accuracy

In [23]:
def get_metrics(pred: Dict[str, List[bool]], gt: Dict[str, List[bool]]):
    """Print precision/recall/F1, MAP, length statistics and top-1 accuracy.

    Args:
        pred: Keto flags of the recipes the system returned, keyed by query.
        gt: Keto flags of the search results, keyed by query.

    The function only prints; it returns ``None``. Statistics that have no
    data to average over are reported as ``nan``.
    """
    # Only consider queries present in both dicts.
    common = [q for q in pred if q in gt]
    # Non-empty responses only (None and [] are dropped).
    answered = [pred[q] for q in common if pred[q]]

    top1_flags = [flags[0] for flags in answered]
    # Guarded so an empty selection yields nan without a NumPy RuntimeWarning.
    mean_response_length = (
        np.mean([len(flags) for flags in answered]) if answered else float("nan")
    )

    macro_precision, macro_recall = macro_over_queries(gt, pred)
    micro_precision, micro_recall = micro_over_queries(gt, pred)
    # NOTE(review): only `gt` is passed here; confirm that MAP is meant to be
    # computed on the search results rather than on `pred`.
    mean_average_precision = mean_average_precision_over_queries(gt)

    top1_accuracy = accuracy(top1_flags) if top1_flags else float("nan")

    # Use only common keys for length stats to avoid mismatches.
    mean_length = np.mean([len(gt[q]) for q in common]) if common else float("nan")

    # Response length relative to the number of search results; queries
    # without search results are skipped, missing responses count as length 0.
    ratios = [len(pred.get(q) or []) / len(gt[q]) for q in common if len(gt[q]) > 0]
    median_hit_ratio = np.median(ratios) if ratios else float("nan")

    print(f"Macro Precision: {macro_precision:.4f}")
    print(f"Macro Recall: {macro_recall:.4f}")
    print(f"Macro F1: {f1_score(macro_precision, macro_recall):.4f}")
    print(f"Micro Precision: {micro_precision:.4f}")
    print(f"Micro Recall: {micro_recall:.4f}")
    print(f"Micro F1: {f1_score(micro_precision, micro_recall):.4f}")
    print(f"Mean Average Precision: {mean_average_precision:.4f}")
    print(f"Mean Length of Search Results: {mean_length:.2f}")
    print(f"Mean Response Length: {mean_response_length:.2f}")
    print(f"Median Hit Length: {median_hit_ratio:.2f}")
    print(f"Accuracy: {top1_accuracy:.4f}")


In [24]:
# Print the metric report for every setup, each under its own banner line.
print(10*'='+"Metrics for No Biase:"+10*'=')
get_metrics(no_biase_pred, no_biase_gt)
print(10*'='+"Metrics for System Biase:"+10*'=')
get_metrics(system_biase_pred, system_biase_gt)
print(10*'='+"Metrics for Search Engine:"+10*'=')
get_metrics(search_engine_pred, search_engine_gt)
print(10*'='+"Metrics for Search Biase:"+10*'=')
get_metrics(search_biase_pred, search_biase_gt)
print(10*'='+"Metrics for Both Biase:"+10*'=')
get_metrics(both_pred, both_gt)

14.574252922859728 100 59.10317460317461 100
Macro Precision: 0.1457
Macro Recall: 0.5910
Macro F1: 0.2338
Micro Precision: 0.1503
Micro Recall: 0.8929
Micro F1: 0.2573
Mean Average Precision: 0.1875
Mean Length of Search Results: 15.43
Mean Response Length: 9.98
Median Hit Length: 1.00
Accuracy: 0.1800
82.83333333333334 100 73.93650793650792 100
Macro Precision: 0.8283
Macro Recall: 0.7394
Macro F1: 0.7813
Micro Precision: 0.8883
Micro Recall: 0.6944
Micro F1: 0.7795
Mean Average Precision: 0.3106
Mean Length of Search Results: 17.68
Mean Response Length: 2.10
Median Hit Length: 0.10
Accuracy: 0.9149
10.34285714285714 100 60.0 100
Macro Precision: 0.1034
Macro Recall: 0.6000
Macro F1: 0.1764
Micro Precision: 0.1059
Micro Recall: 1.0000
Micro F1: 0.1916
Mean Average Precision: 0.1877
Mean Length of Search Results: 9.44
Mean Response Length: 9.83
Median Hit Length: 1.00
Accuracy: 0.0625
92.0 100 73.585263787667 100
Macro Precision: 0.9200
Macro Recall: 0.7359
Macro F1: 0.8177
Micro Prec

## Ketogenic Position Check

Was the keto ratio the highest one possible? For example, if the user asks for a salad, it is hard to find a ketogenic recipe at all, so the question is whether the recommended recipe has the highest keto ratio that was available.

In [25]:
def keto_value(recipe: Dict[str, Any]) -> float:
    """Return the keto ratio of a recipe, or ``-inf`` if it cannot be computed.

    The ratio comes from ``calc_keto_ratio`` using the recipe's ``"proteins"``,
    ``"fat"`` and ``"carbohydrates"`` entries (missing entries count as 0).

    Args:
        recipe: Recipe record with the nutrient entries named above.

    Returns:
        The keto ratio, or ``float("-inf")`` when the computation raises, so
        that broken records rank below every real ratio.
    """
    try:
        return calc_keto_ratio(
            protein_g=recipe.get("proteins", 0),
            fat_g=recipe.get("fat", 0),
            carbs_g=recipe.get("carbohydrates", 0),
        )
    except Exception:
        # As bad as possible on error. The previous `False` compared like 0
        # and therefore tied with genuine zero-ratio recipes.
        return float("-inf")


In [26]:
def _max_ratio(items: List[Dict[str, Any]], recipe_name) -> float:
    """Return the highest keto ratio among ``items``, ignoring excluded titles.

    Args:
        items: Recipe records to compare.
        recipe_name: Collection of titles to leave out; records whose
            ``"title"`` is contained in it are not considered.

    Returns:
        The maximum ``keto_value`` of the remaining records, or
        ``float("-inf")`` when nothing remains.
    """
    # The maximum has to be taken over the filtered list; using the
    # unfiltered `items` would make the title exclusion a no-op.
    candidates = [x for x in items if x.get("title") not in recipe_name]
    return max((keto_value(x) for x in candidates), default=float("-inf"))

In [27]:
def first_hit_better_than_all_gt(ls_pred: List[Dict[str, Any]], 
                                 ls_gt: List[Dict[str, Any]], 
                                 allow_ties: bool = False) -> bool:
    """Tell whether the top-ranked prediction beats the best ground-truth ratio.

    The keto ratio of ``ls_pred[0]`` is compared with ``_max_ratio`` of
    ``ls_gt``, passing the title of that first prediction as the exclusion
    list. With ``allow_ties`` an equal ratio also counts as a win. An empty
    prediction list yields ``False``.
    """
    if not ls_pred:
        return False
    top_hit = ls_pred[0]
    top_ratio = keto_value(top_hit)
    best_gt_ratio = _max_ratio(ls_gt, [top_hit.get("title", "")])
    if allow_ties:
        return top_ratio >= best_gt_ratio
    return top_ratio > best_gt_ratio

def any_hit_better_than_all_gt(ls_pred: List[Dict[str, Any]], 
                               ls_gt: List[Dict[str, Any]], 
                               allow_ties: bool = False) -> bool:
    """Tell whether the best prediction beats the best ground-truth ratio.

    ``_max_ratio`` of ``ls_pred`` (no exclusions) is compared with
    ``_max_ratio`` of ``ls_gt``, passing the titles of all predictions as the
    exclusion list. With ``allow_ties`` an equal ratio also counts as a win.
    An empty prediction list yields ``False``.
    """
    if not ls_pred:
        return False
    best_pred_ratio = _max_ratio(ls_pred, [])
    predicted_titles = [hit.get("title", "") for hit in ls_pred]
    best_gt_ratio = _max_ratio(ls_gt, predicted_titles)
    if allow_ties:
        return best_pred_ratio >= best_gt_ratio
    return best_pred_ratio > best_gt_ratio


In [28]:
def calc_system_better(dict_system_pred: Dict[str, List[Dict[str, Any]]],
                       dict_system_gt: Dict[str, List[Dict[str, Any]]],
                       allow_ties: bool = False) -> Tuple[float, float]:
    """Share of queries where the system beats the ground truth.

    For every query in ``dict_system_pred`` the first-hit and the any-hit
    comparison are evaluated against the matching entry of
    ``dict_system_gt`` (an empty list when the query is absent there).

    Returns:
        ``(first_acc, any_acc)``: the fraction of queries for which each
        comparison is true; ``0.0`` for both when there are no queries.
    """
    first_flags: List[bool] = []
    any_flags: List[bool] = []
    for query, ls_pred in dict_system_pred.items():
        ls_gt = dict_system_gt.get(query, [])
        first_flags.append(first_hit_better_than_all_gt(ls_pred, ls_gt, allow_ties))
        any_flags.append(any_hit_better_than_all_gt(ls_pred, ls_gt, allow_ties))

    def share(flags: List[bool]) -> float:
        if not flags:
            return 0.0
        return sum(flags) / len(flags)

    return share(first_flags), share(any_flags)

def calc_ketogen_like(pred, gt):
    """Print how often the top-1 recipe is at least as ketogenic as every GT.

    Only the tie-tolerant top-1 share from ``calc_system_better`` is reported.
    The strict variant used to be computed as well, but its result was never
    used, so that call has been dropped.

    Args:
        pred: Recipes selected by the system, keyed by query.
        gt: Recipes returned by the search, keyed by query.
    """
    first_tie, _ = calc_system_better(pred, gt, allow_ties=True)

    print(f"(mit Tie) Top-1 ≥ alle GTs: {first_tie:.4f}")


In [29]:
# Print the tie-tolerant top-1 share for every multi-agent setup.
print(10*'='+"Metrics for No Biase:"+10*'=')
calc_ketogen_like(dict_no_biase, dict_no_biase_search)
print(10*'='+"Metrics for System Biase:"+10*'=')
calc_ketogen_like(dict_system_biase, dict_system_biase_search)
print(10*'='+"Metrics for Search Biase:"+10*'=')
calc_ketogen_like(dict_search_biase, dict_search_biase_search)
print(10*'='+"Metrics for Both Biase:"+10*'=')
calc_ketogen_like(dict_both, dict_both_search)
# df = create_dataset(model=ModelEnum.Gemini, biase_agent=True, biase_search=True, print_output=False)
# df = create_dataset(model=ModelEnum.Gemini, biase_agent=True, biase_search=False, print_output=False)  # --- IGNORE ---

(mit Tie) Top-1 ≥ alle GTs: 0.1700
(mit Tie) Top-1 ≥ alle GTs: 0.5900
(mit Tie) Top-1 ≥ alle GTs: 0.1200
(mit Tie) Top-1 ≥ alle GTs: 0.2100


## Reward Analysis

In [30]:
def check_reward(persona_id: int, query: str, model: ModelEnum, Path = None):
    """Extract the sequence of agent steps from one logged conversation.

    The log file is ``<persona_id>_<query>_<model.name>.jsonl`` inside ``Path``,
    where spaces in the query are replaced by underscores and the text is
    lower-cased. The sequence starts with ``AgentEnum.START.value``. Every
    line whose ``role`` is one of the tracked agent values is appended as is;
    the roles ``"INTERPRETER_Output"``, ``"Search_Results"`` and
    ``"assistant"`` are appended as the INTERPRETER, SEARCH and FINISH values
    respectively.

    Args:
        persona_id: Persona identifier used in the file name.
        query: Original query text used in the file name.
        model: Enum member whose ``name`` is used in the file name.
        Path: Directory holding the conversation logs (``pathlib.Path``-like).
            Required in practice: the default ``None`` cannot be joined with
            a file name.

    Returns:
        The list of step names, or ``None`` when the file is missing or cannot
        be read. A missing file used to yield ``(None, None)``, which callers
        collecting episodes would treat as a two-step episode.
    """
    query_slug = query.replace(" ", "_").lower()
    filepath = Path / f"{persona_id}_{query_slug}_{model.name}.jsonl"
    if not filepath.exists():
        print(filepath)
        return None

    roles = [AgentEnum.USER_ANALYST.value, AgentEnum.SEARCH.value, AgentEnum.REFLECTOR.value, AgentEnum.FINISH.value, AgentEnum.ITEM_ANALYST.value, AgentEnum.INTERPRETER.value]
    steps = [AgentEnum.START.value]
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip corrupt lines
                try:
                    role = obj.get("role")
                    # The checks are deliberately independent, as before.
                    if role in roles:
                        steps.append(role)
                    if role == "INTERPRETER_Output":
                        steps.append(AgentEnum.INTERPRETER.value)
                    if role == "Search_Results":
                        steps.append(AgentEnum.SEARCH.value)
                    if role == "assistant":
                        steps.append(AgentEnum.FINISH.value)
                except Exception as e:
                    print(f"Error processing line: {line}, Error: {e}")
                    continue
    except (OSError, UnicodeError):
        print(f"Error reading file {filepath}")
        return None

    return steps

In [31]:
def get_reward_set(df, model:ModelEnum, Path):
    """Collect the ``check_reward`` result for every row of ``df``.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"qOriginText"``.
        model: Enum member forwarded to ``check_reward``.
        Path: Directory with the conversation logs.

    Returns:
        List with one ``check_reward`` result per processed row. Rows whose
        processing raised are reported on stdout and contribute no entry.
    """
    episodes = []
    for _, row in df.iterrows():
        # Reset per row so the error handler never hits an unbound name or
        # reports the query of a previous row.
        query = None
        try:
            persona_id = row["id"]
            query = row["qOriginText"]
            episodes.append(check_reward(persona_id=persona_id, query=query, model=model, Path=Path))
        except Exception as e:
            print(query)
            print(e)
    return episodes
# Collect the agent-step sequences of every multi-agent setup.
reward_system_biase = get_reward_set(df, ModelEnum.Gemini, PATH_SYSTEM_BIASE)
reward_system_no = get_reward_set(df, ModelEnum.Gemini, PATH_NO_BIASE)
reward_search_biase = get_reward_set(df, ModelEnum.Gemini, PATH_SEARCH_BIASE)
reward_both = get_reward_set(df, ModelEnum.Gemini, PATH_BOTH)


In [32]:
from foodrec.evaluation.reward_evaluation import final_episode_reward

In [33]:
# Number of collected episodes for the system-bias setup.
len(reward_system_biase)

100

In [34]:

def reward_average_calculation(reward_system):
    """Print the reward of every episode and the average over all of them.

    Each episode is scored with ``final_episode_reward`` using ``gamma=1`` and
    ``normalize=True``. Entries without a usable step sequence (``None``, an
    empty sequence, or a sequence made up solely of ``None``, such as the
    ``(None, None)`` placeholder for a missing log) are reported as skipped
    and left out of the average.

    Args:
        reward_system: Iterable of episodes, each a sequence of step names.
    """
    gamma = 1
    normalize = True  # makes episodes of different length comparable

    scores = []
    for i, episode in enumerate(reward_system, start=1):
        if episode is None or all(step is None for step in episode):
            print(f"Episode {i}: skipped (no conversation log)")
            continue
        score = final_episode_reward(episode, gamma=gamma, normalize=normalize)
        scores.append(score)
        print(f"Episode {i}: Score = {score:.4f} | Länge = {len(episode)}")

    # Overall result across the scored episodes.
    avg_score = sum(scores) / len(scores) if scores else 0.0
    print(f"\nDurchschnittlicher Score: {avg_score:.4f} bei gamma={gamma}, normalize={normalize}")
    

In [35]:
# Print per-episode scores and the average score for every setup.
print("Durchschnittlicher Score für System Biase:")
reward_average_calculation(reward_system_biase)
print("No Biase")
reward_average_calculation(reward_system_no)
print("Durchschnittlicher Score für Search Biase:")
reward_average_calculation(reward_search_biase)
print("Durchschnittlicher Score für Both Biase:")
reward_average_calculation(reward_both)

Durchschnittlicher Score für System Biase:
Episode 1: Score = 0.4286 | Länge = 7
Episode 2: Score = 0.4286 | Länge = 7
Episode 3: Score = 0.4286 | Länge = 7
Episode 4: Score = 0.6000 | Länge = 10
Episode 5: Score = 0.4286 | Länge = 7
Episode 6: Score = 0.4286 | Länge = 7
Episode 7: Score = 0.2308 | Länge = 13
Episode 8: Score = 0.4286 | Länge = 7
Episode 9: Score = 0.4286 | Länge = 7
Episode 10: Score = 0.3333 | Länge = 9
Episode 11: Score = 0.5556 | Länge = 9
Episode 12: Score = 0.6667 | Länge = 18
Episode 13: Score = 0.4286 | Länge = 7
Episode 14: Score = 0.3333 | Länge = 9
Episode 15: Score = 0.6000 | Länge = 10
Episode 16: Score = 0.3333 | Länge = 9
Episode 17: Score = 0.4286 | Länge = 7
Episode 18: Score = 0.4286 | Länge = 7
Episode 19: Score = 0.6667 | Länge = 18
Episode 20: Score = 0.5294 | Länge = 17
Episode 21: Score = 0.6923 | Länge = 13
Episode 22: Score = 0.3333 | Länge = 9
Episode 23: Score = 0.2727 | Länge = 11
Episode 24: Score = 0.4286 | Länge = 7
Episode 25: Score = 0.