# Evaluation of the Multi-Agent System

This file is responsible for evaluating the Multi-Agent System.

In [None]:
import pandas as pd 
import numpy as np 
from foodrec.config.structure.dataset_enum import ModelEnum 
from foodrec.evaluation.create_dataset import create_dataset
from foodrec.evaluation.is_ketogen import is_ketogenic, calc_keto_ratio
from foodrec.config.structure.paths import CONVERSATION, DATASET_PATHS
import json
from foodrec.evaluation.metrics.metrics import macro_over_queries, micro_over_queries, accuracy, f1_score, mean_average_precision_over_queries
from foodrec.data.all_recipe import AllRecipeLoader
from typing import Dict, List, Any, Tuple

In [2]:
def check_availability(persona_id: int, query: str, model: "ModelEnum", Path = None):
    """Load the stored conversation result for one persona/query/model run.

    The conversation is read from
    ``<Path>/<persona_id>_<query>_<model.name>.jsonl``, where the query is
    lower-cased and its spaces are replaced by underscores. The file is
    scanned line by line until the first record whose ``role`` is
    ``"assistant"``.

    Args:
        persona_id: Identifier of the persona; first part of the file name.
        query: Original query text; normalized into the file name.
        model: Model enum member; only its ``name`` attribute is used.
        Path: Directory holding the ``.jsonl`` files (``pathlib.Path`` or
            string).

    Returns:
        ``[content, search_results]`` where ``content`` is the ``content``
        field of the first assistant record and ``search_results`` is the
        longest ``meta`` list among the ``"Search_Results"`` records seen
        before it. ``None`` if no directory is given, the file is missing or
        unreadable, or it contains no assistant record.
    """
    # Local import: the ``Path`` parameter shadows ``pathlib.Path`` in here.
    import pathlib

    if Path is None:
        # Without a directory there is nothing to look up.
        return None

    query_slug = query.replace(" ", "_").lower()
    conversation_id = f"{persona_id}_{query_slug}_{model.name}"
    filepath = pathlib.Path(Path) / f"{conversation_id}.jsonl"
    if not filepath.exists():
        return None

    ls_search = []
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip corrupt lines
                if not isinstance(obj, dict):
                    continue  # valid JSON, but not a conversation record
                role = obj.get("role")
                if role == "assistant":
                    return [obj.get("content"), ls_search]
                if role == "Search_Results":
                    zw_res = obj.get("meta", [])
                    # Keep the largest result list; ignore a missing or
                    # non-list ``meta`` instead of discarding the whole file.
                    if isinstance(zw_res, list) and len(zw_res) > len(ls_search):
                        ls_search = zw_res
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable file counts as "not available".
        return None

    return None


## How Many Recipes in the All Recipe Dataset Are Ketogenic?

In [21]:
# Instantiate the recipe loader and load the full recipe dataset.
# Judging by the cells below, `dataset` is used as a pandas DataFrame.
AL = AllRecipeLoader()
dataset = AL.load_dataset()

In [22]:
# Preview the first rows to inspect the available columns.
dataset.head()

Unnamed: 0,recipe_href,recipe_name,description,rating_rate,ingredients_normalized,tutorial,cooking_time,protein,carbohydrates,fat,kcal
0,http://allrecipes.com/recipe/1-2-3-cheddar-bro...,1-2-3 Cheddar Broccoli Casserole,'Serve this casserole over hot baked potatoes ...,5.0,"['fig spread', 'broccoli floret', 'cheddar che...",Preheat oven to 350 degrees F. Combine Double ...,25.0,3.16087,6.37818,6.94262,97
3,http://allrecipes.com/recipe/1-2-3-chicken-cac...,1-2-3 Chicken Cacciatore,"'For this tasty and easy cacciatore, chicken i...",3.0,"['olive oil', 'chicken leg', 'pasta sauce']",Heat oil in 12-inch skillet over medium-high h...,60.0,10.2273,4.48565,5.41866,109
5,http://allrecipes.com/recipe/1-2-3-jambalaya/d...,1-2-3 Jambalaya,'Enjoy some Creole comfort the quick and easy ...,4.0,"['olive oil', 'worcestershire sauce', 'fresh p...","In a bowl, combine sausage, shrimp or chicken ...",30.0,8.43391,15.6766,8.67215,179
8,http://allrecipes.com/recipe/1-dish-chicken-pa...,1-Dish Chicken Parmesan,"'Ready to serve in less than an hour, this one...",4.0,"['sugar', 'flour', 'salt', 'olive oil', 'spagh...",Mix batter ingredients together in a pre-spray...,45.0,11.5418,16.193,7.10598,177
9,http://allrecipes.com/recipe/1-dish-pepperoni-...,1-Dish Pepperoni Cheese Pizza Bake,'Pizza was never easier than this--spread the ...,4.0,"['sugar', 'flour', 'salt', 'olive oil', 'shred...",Mix batter ingredients in a pre-sprayed 9-1/2-...,50.0,10.769,22.6643,10.5579,232


In [23]:
# Add a boolean `ketogenic` column: each recipe row is classified by
# `is_ketogenic` from its macro nutrients, using a keto ratio index of 0.8.
dataset['ketogenic'] = dataset.apply(
    lambda recipe_row: is_ketogenic(
        protein_g=recipe_row['protein'],
        calories=recipe_row['kcal'],
        fat_g=recipe_row['fat'],
        carbs_g=recipe_row['carbohydrates'],
        keto_ratio_index=0.8,
    ),
    axis=1,
)

In [24]:
# Total number of recipes. `len` works directly on the column, so there is
# no need to copy it into a list first.
len(dataset['recipe_href'])

40323

In [25]:
# Count how many recipes are flagged ketogenic (True) vs. not (False).
dataset['ketogenic'].value_counts()

ketogenic
False    35798
True      4525
Name: count, dtype: int64

## Data Preparation

In [6]:
# Load the persona/query table and keep only its first four rows.
# NOTE(review): the hard-coded limit of 4 looks like a debugging subset;
# confirm before running the full evaluation.
df = pd.read_csv(DATASET_PATHS / "zw_personas.csv")
df = df[:4]

In [7]:
# Directories with the stored Gemini conversations for the two run variants
# ("no_biase" and "system_biase").
PATH_NO_BIASE = CONVERSATION / "Gemini" / "no_biase"
PATH_SYSTEM_BIASE = CONVERSATION / "Gemini" / "system_biase"

In [8]:
def get_dicts_set(df, model: "ModelEnum", Path):
    """Collect the stored results for every persona/query row in ``df``.

    Args:
        df: Table with one row per persona/query; the columns ``id`` and
            ``qOriginText`` are read.
        model: Model enum member, forwarded to ``check_availability``.
        Path: Directory with the stored conversations, forwarded to
            ``check_availability``.

    Returns:
        ``(pred, gt)``: two dicts keyed by the query text. ``pred`` holds the
        first and ``gt`` the second element of the pair returned by
        ``check_availability``. Rows for which ``check_availability`` returns
        ``None`` are skipped. Because the key is the query text, a later row
        with the same query overwrites an earlier one.
    """
    pred = {}
    gt = {}
    for _, row in df.iterrows():
        query = row["qOriginText"]
        result = check_availability(
            persona_id=row["id"], query=query, model=model, Path=Path
        )
        if result is None:
            # No usable conversation for this row: unpacking None would raise
            # a TypeError, so leave the query out of the evaluation instead.
            continue
        pred[query], gt[query] = result
    return pred, gt

# Per query: the assistant's answer content and the search results recorded
# for the "system_biase" Gemini runs.
dict_system_biase, dict_system_biase_search = get_dicts_set(df, ModelEnum.Gemini, PATH_SYSTEM_BIASE)

In [9]:
def check_ketogenic_biase(
    dict_biase: Dict[str, List[dict]],
    search_gt: Dict[str, List[dict]],
    keto_ratio_index: float = 0.8,
) -> Tuple[Dict[str, List[bool]], Dict[str, List[bool]]]:
    """Turn per-query recipe lists into per-query ketogenic flags.

    Every recipe dict is classified with ``is_ketogenic`` from its
    ``calories``, ``proteins``, ``fat`` and ``carbohydrates`` entries; these
    keys must be present in each recipe. ``None`` entries in a list are
    dropped, and an empty or missing list yields an empty flag list.

    Args:
        dict_biase: Recipes selected by the system, keyed by query.
        search_gt: Recipes returned by the search, keyed by query.
        keto_ratio_index: Threshold forwarded to ``is_ketogenic``.

    Returns:
        pred_dict: keto flags for the recipes in ``dict_biase``.
        gt_dict:   keto flags for all recipes in ``search_gt``; recipes that
                   also appear in ``dict_biase`` are not filtered out. If
                   ``search_gt`` is empty, ``pred_dict`` is returned in both
                   positions.
    """

    def to_keto_flags(d: Dict[str, List[dict]]) -> Dict[str, List[bool]]:
        # One flag per non-None recipe; a falsy item list becomes [].
        return {
            key: [
                is_ketogenic(
                    calories=item["calories"],
                    protein_g=item["proteins"],
                    fat_g=item["fat"],
                    carbs_g=item["carbohydrates"],
                    keto_ratio_index=keto_ratio_index,
                )
                for item in (items or [])
                if item is not None
            ]
            for key, items in d.items()
        }

    pred_dict = to_keto_flags(dict_biase)
    gt_dict = to_keto_flags(search_gt)
    if not gt_dict:
        # No search results at all: fall back to the predictions as gt.
        return pred_dict, pred_dict
    return pred_dict, gt_dict

In [10]:
# Keto flags per query for the system's answer (pred) and for the search
# results (gt), using the default keto ratio index.
system_biase_pred, system_biase_gt = check_ketogenic_biase(dict_system_biase, dict_system_biase_search)

## Recall / F1 / Macro Precision / MAP / Accuracy

In [11]:
# Keto flag of the first recommended recipe per query. A query without any
# recommended recipe counts as a miss (False) instead of raising IndexError.
ls_accuracy = [flags[0] if flags else False for flags in system_biase_pred.values()]

In [12]:
# Precision/recall over the per-query keto flags, in the macro and micro
# variants provided by the metrics module.
macro_precision_system_biase, macro_recall_system_biase = macro_over_queries(system_biase_gt, system_biase_pred)
micro_precision_system_biase, micro_recall_system_biase = micro_over_queries(system_biase_gt, system_biase_pred)
# NOTE(review): MAP receives only the gt flags, while the other metrics get
# both gt and pred. Confirm this matches the function's intended input.
mean_average_precision = mean_average_precision_over_queries(system_biase_gt)
accuracy_system_biase = accuracy(ls_accuracy)

In [13]:
# Report all metrics for the system-bias run with four decimals each.
print("=" * 10, "Metrics for System Biase:", "=" * 10, sep="")
print(f"Macro Precision: {macro_precision_system_biase:.4f}")
print(f"Macro Recall: {macro_recall_system_biase:.4f}")
# F1 is derived from the matching precision/recall pair.
print(f"Macro F1: {f1_score(macro_precision_system_biase, macro_recall_system_biase):.4f}")
print(f"Micro Precision: {micro_precision_system_biase:.4f}")
print(f"Micro Recall: {micro_recall_system_biase:.4f}")
print(f"Micro F1: {f1_score(micro_precision_system_biase, micro_recall_system_biase):.4f}")
print(f"Mean Average Precision: {mean_average_precision:.4f}")
print(f"Accuracy: {accuracy_system_biase:.4f}")


Macro Precision: 0.5000
Macro Recall: 0.4375
Macro F1: 0.4667
Micro Precision: 0.4444
Micro Recall: 0.4000
Micro F1: 0.4211
Mean Average Precision: 0.3013
Accuracy: 0.7500


## Ketogenic Position Check

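Was the keto ratio of the recommended recipe the highest possible? For example, if the user asks for a salad, it is hard to find a ketogenic recipe at all, so the question is whether the recommended recipe has the highest keto ratio among the available candidates.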

In [15]:
def keto_value(recipe: Dict[str, Any]) -> float:
    """Return the keto ratio of ``recipe`` as computed by ``calc_keto_ratio``.

    The recipe must provide the keys ``proteins``, ``fat`` and
    ``carbohydrates``.
    """
    macros = {
        "protein_g": recipe["proteins"],
        "fat_g": recipe["fat"],
        "carbs_g": recipe["carbohydrates"],
    }
    return calc_keto_ratio(**macros)

In [17]:
def check_ratio(ls_pred, ls_gt):
    """Tell whether a search result beats the first recommended recipe.

    Returns True if any recipe in ``ls_gt`` whose title differs from the
    first entry of ``ls_pred`` has a strictly higher keto ratio than that
    first entry, otherwise False.
    """
    top_recipe = ls_pred[0]
    top_ratio = keto_value(top_recipe)
    # Ratios of all candidates other than the top recommendation itself.
    rival_ratios = [
        keto_value(candidate)
        for candidate in ls_gt
        if top_recipe['title'] != candidate['title']
    ]
    for rival_ratio in rival_ratios:
        if rival_ratio > top_ratio:
            return True
    return False

In [16]:
def check_ratio_pred(ls_pred, ls_gt):
    """Tell whether a search result beats every recommended recipe.

    Returns True if any recipe in ``ls_gt`` whose title is not among the
    titles in ``ls_pred`` has a strictly higher keto ratio than the best
    ratio found in ``ls_pred``, otherwise False.
    """
    chosen_titles = [recipe['title'] for recipe in ls_pred]
    best_chosen_ratio = max(map(keto_value, ls_pred))
    # Ratios of the candidates the system did not recommend.
    other_ratios = [
        keto_value(candidate)
        for candidate in ls_gt
        if candidate['title'] not in chosen_titles
    ]
    for other_ratio in other_ratios:
        if other_ratio > best_chosen_ratio:
            return True
    return False

In [18]:
def calc_best_possible(dict_system_biase, dict_system_biase_search):
    """Measure how often a search result had a higher keto ratio than the answer.

    For each query, ``check_ratio`` compares the first recommended recipe and
    ``check_ratio_pred`` compares all recommended recipes against the search
    results that were not recommended. Both return True when some such search
    result has a strictly higher keto ratio.

    Args:
        dict_system_biase: Recommended recipes per query.
        dict_system_biase_search: Search results per query; must contain
            every query of ``dict_system_biase``.

    Returns:
        ``(one_ratio_acc, hits_ratio_acc)``: the fraction of queries for which
        ``check_ratio`` and ``check_ratio_pred`` respectively returned True.
        ``(0.0, 0.0)`` when there are no queries.
    """
    if not dict_system_biase:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0

    ls_one = []
    ls_hits = []
    for query, ls_pred in dict_system_biase.items():
        ls_gt = dict_system_biase_search[query]
        ls_one.append(check_ratio(ls_pred, ls_gt))
        ls_hits.append(check_ratio_pred(ls_pred, ls_gt))

    n_queries = len(ls_one)
    one_ratio_acc = sum(ls_one) / n_queries
    hits_ratio_acc = sum(ls_hits) / n_queries
    return one_ratio_acc, hits_ratio_acc

In [19]:
# Fractions of queries where a non-recommended search result had a higher
# keto ratio than the first recommendation (`one`) or than all
# recommendations (`hits`).
one, hits = calc_best_possible(dict_system_biase, dict_system_biase_search)

In [20]:
# Report both position-check fractions with four decimals.
print("=" * 10, "Metrics for Best Chosen:", "=" * 10, sep="")
print(f"First Hit Ratio: {one:.4f}")
print(f"Hits Ratio: {hits:.4f}")

First Hit Ratio: 0.7500
Hits Ratio: 0.7500
