# Ablation Study

In [None]:
import pandas as pd 
import numpy as np 
import json
from foodrec.config.structure.dataset_enum import ModelEnum 
from foodrec.evaluation.is_ketogen import calc_keto_ratio
from foodrec.config.structure.paths import CONVERSATION, DATASET_PATHS
from foodrec.evaluation.is_ketogen import calc_keto_ratio
from foodrec.evaluation.reward_evaluation import routing_accuracy
from analysis_helper.load_dataset import get_dicts_set, get_search_engine
from analysis_helper.get_lowes_highest import take_25_lowest_keto
from analysis_helper.get_metrics import calc_metrics
from analysis_helper.mean_rounds import calc_rounds
from analysis_helper.query_analysis import calc_other_recommendation_parameters
from analysis_helper.calc_routing_reward import get_reward_set, reward_average_calculation
from analysis_helper.calc_path_length import calc_path_length
from analysis_helper.time import calc_mean_time
from analysis_helper.reflector_analysis import cals_reflector_accuracy
from analysis_helper.ketogen_available import ketogen_available
from analysis_helper.load_dataset import get_file_path

In [3]:
# Persona/query table that the analysis cells below iterate over
# (rows are read through their "id" and "query" columns).
query_set = pd.read_csv(DATASET_PATHS / "zw_personas.csv")
# NOTE(review): `model` is not referenced again in the visible cells; later
# calls pass ModelEnum members directly.
model = ModelEnum.Gemini

In [4]:
def get_paths(model_name):
    """Build the lookup of conversation-log locations for one model.

    Args:
        model_name: Name of the model directory below ``CONVERSATION``
            (callers in this notebook pass ``ModelEnum.<member>.name``).

    Returns:
        dict with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
        ``PATH_SEARCH_ENGINE``, ``PATH_SEARCH_BIASE`` and ``PATH_BOTH`` (in that
        order). All entries except ``PATH_SEARCH_ENGINE`` live below
        ``CONVERSATION / model_name``; the search-engine entry is a JSON file
        that does not depend on the model.
    """
    model_root = CONVERSATION / model_name
    search_engine_file = CONVERSATION / "search_engine" / "res_one.json"
    return dict(
        PATH_NO_BIASE=model_root / "no_biase",
        PATH_SYSTEM_BIASE=model_root / "system_biase",
        PATH_SEARCH_ENGINE=search_engine_file,
        PATH_SEARCH_BIASE=model_root / "search_biase",
        PATH_BOTH=model_root / "both_biase",
    )

In [5]:
# Log locations of the Gemini runs; the cells of the next section read this
# global `paths` (it is re-assigned for other models further down).
paths = get_paths(ModelEnum.Gemini.name)

## Reasons for Hypothesis Results

### task success lowest 25 highest 25

In [6]:
# Subset selected with high=True — presumably the 25 queries with the highest
# keto score, per the section heading; confirm in analysis_helper.get_lowes_highest.
q_set_highest = take_25_lowest_keto(query_set,paths, high=True)
# Metrics for that subset on the Gemini runs (ref_include=False).
pr_auc_raw_high = calc_metrics(query_set=q_set_highest, paths=paths, model_name=ModelEnum.Gemini, ref_include=False)

25%: 0.2624075915346725
75%: 0.4752273733006868
Mean: 0.4078187139691796
Max: 1.565975754772325
28
2.1010442773600673 25 9.357142857142858 25
21.5 25 19.09621212121212 25
10.342857142857138 100 55.0 100
24.0 25 13.66678845451224 25
25.0 25 9.521862948077782 25
               Macro Precision  Macro Recall  Macro F1  Micro Precision  \
Bias                                                                      
No Biase              0.084042      0.374286  0.137263         0.105590   
System Biase          0.860000      0.763848  0.809078         0.982143   
Search Engine         0.103429      0.550000  0.174115         0.104569   
Search Biase          0.960000      0.546672  0.696641         1.000000   
Both Biase            1.000000      0.380875  0.551642         1.000000   

               Micro Recall  Micro F1  Mean Average Precision  Mean PR-AUC  \
Bias                                                                         
No Biase           0.414634  0.168317                0.20

In [7]:
# Counterpart of the previous cell with high=False — presumably the 25 queries
# with the lowest keto score; confirm in analysis_helper.get_lowes_highest.
q_set_low = take_25_lowest_keto(query_set,paths, high=False)
# Metrics for that subset on the Gemini runs (ref_include=False).
pr_auc_raw_low = calc_metrics(query_set=q_set_low, paths=paths, model_name=ModelEnum.Gemini, ref_include=False)

25%: 0.2624075915346725
75%: 0.4752273733006868
Mean: 0.4078187139691796
Max: 1.565975754772325
34
1.4023809523809525 25 6.5 25
13.666666666666668 25 14.833333333333334 25
10.342857142857138 100 55.0 100
21.617424242424242 25 12.972010929179714 25
23.5 25 7.266743480014867 25
               Macro Precision  Macro Recall  Macro F1  Micro Precision  \
Bias                                                                      
No Biase              0.056095      0.260000  0.092281         0.081301   
System Biase          0.546667      0.593333  0.569045         0.675000   
Search Engine         0.103429      0.550000  0.174115         0.104569   
Search Biase          0.864697      0.518880  0.648571         0.930070   
Both Biase            0.940000      0.290670  0.444034         0.988095   

               Micro Recall  Micro F1  Mean Average Precision  Mean PR-AUC  \
Bias                                                                         
No Biase           0.500000  0.139860    

### No Reflector

In [8]:
def calc_no_improvment(df, paths, model_name: ModelEnum, threshold: float = 0.8):
    """Print, per bias condition, how the reflector scores compare to a threshold.

    For every row of ``df`` the helper ``cals_reflector_accuracy`` is called once
    per condition. Two numbers are printed for each condition: the share of
    scores strictly above ``threshold`` ("Accuracy") and the plain mean of the
    scores ("Mean Rate").

    Args:
        df: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.
        threshold: Score a run has to exceed to count as a hit. Defaults to the
            previously hard-coded 0.8, so existing calls print the same values.

    Returns:
        None. The results are printed.
    """
    # NOTE(review): the scores are presumably keto ratios (the original loop
    # variable was called `keto`) — confirm in analysis_helper.reflector_analysis.
    def collect_scores(frame, model: ModelEnum, Path):
        return [
            cals_reflector_accuracy(persona_id=row["id"], query=row["query"], model=model, Path=Path)
            for _, row in frame.iterrows()
        ]

    # (printed prefix, key in `paths`) — the prefixes reproduce the original
    # output byte for byte, including their uneven spacing.
    conditions = (
        ("Mean Rounds No Biase: ", "PATH_NO_BIASE"),
        ("Mean Rounds System Biase:  ", "PATH_SYSTEM_BIASE"),
        ("Mean Search Biase:  ", "PATH_SEARCH_BIASE"),
        ("Mean Both Biase:  ", "PATH_BOTH"),
    )

    # Collect everything first so that any output of the helper appears before
    # the summary lines, as it did before.
    collected = [(prefix, collect_scores(df, model_name, paths[key])) for prefix, key in conditions]

    for prefix, scores in collected:
        accuracy = np.mean([score > threshold for score in scores])
        print(f"{prefix}Accuracy:{accuracy}  Mean Rate {np.mean(scores)}")

In [9]:
# Uses the global `paths`, which at this point refers to the Gemini runs.
calc_no_improvment(query_set, paths, ModelEnum.Gemini)

Mean Rounds No Biase: Accuracy:0.11  Mean Rate 0.41720981764709003
Mean Rounds System Biase:  Accuracy:0.63  Mean Rate 0.9009933569286928
Mean Search Biase:  Accuracy:0.94  Mean Rate 1.2604315432172488
Mean Both Biase:  Accuracy:0.98  Mean Rate 1.7588926057664371


### No Ketogen Recipe

In [10]:
def was_ketogen_available(df, paths, model_name: ModelEnum):
    """Print the mean of ``ketogen_available`` per bias condition.

    ``ketogen_available`` is called once per row of ``df`` and condition; results
    that are ``None`` are dropped before averaging.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. The results are printed.
    """
    def collect_flags(frame, model: ModelEnum, Path):
        answers = (
            ketogen_available(persona_id=row["id"], query=row["query"], model=model, Path=Path)
            for _, row in frame.iterrows()
        )
        return [answer for answer in answers if answer is not None]

    # Evaluation order: no bias, system bias, search bias, both.
    evaluation_order = ("PATH_NO_BIASE", "PATH_SYSTEM_BIASE", "PATH_SEARCH_BIASE", "PATH_BOTH")
    flags_none, flags_system, flags_search, flags_both = (
        collect_flags(df, model_name, paths[key]) for key in evaluation_order
    )

    # The report order differs from the evaluation order: system bias comes last.
    for template, flags in (
        ("Mean Rounds No Biase: Accuracy: {}", flags_none),
        ("Mean Rounds search Biase: Accuracy: {}", flags_search),
        ("Mean Rounds both Biase: Accuracy: {}", flags_both),
        ("Mean Rounds System Biase: {}", flags_system),
    ):
        print(template.format(np.mean(flags)))

In [11]:
# Uses the global `paths` (Gemini runs at this point).
# NOTE(review): the "list index out of range" / "unsupported operand" lines in
# the output are not printed by this notebook's code — presumably they come from
# the ketogen_available helper; confirm and check whether those runs are dropped.
was_ketogen_available(query_set, paths, ModelEnum.Gemini)

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
unsupported operand type(s) for +: 'NoneType' and 'NoneType'
list index out of range
list index out of range
list index out of range
unsupported operand type(s) for +: 'NoneType' and 'NoneType'
list index out of range
unsupported operand type(s) for +: 'NoneType' and 'NoneType'
list index out of range
list index out of range
Mean Rounds No Biase: Accuracy: 0.6179775280898876
Mean Rounds search Biase: Accuracy: 1.0
Mean Rounds both Biase: Accuracy: 1.0
Mean Rounds System Biase: 0.23076923076923078


## Reference Model

### Recommendations Accuracy

In [11]:
def recommendation_accuracy(df, model_name:ModelEnum):
    """Run ``calc_other_recommendation_parameters`` for each of four bias conditions.

    The paths are derived from ``model_name`` inside the function, so the global
    ``paths`` variable is not used. Before each evaluation a header line with the
    path key is printed.

    Args:
        df: Query/persona DataFrame handed through to the helper.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. All output is printed.
    """
    paths = get_paths(str(model_name.name))
    rule = 10 * "-"
    for key in ("PATH_NO_BIASE", "PATH_SYSTEM_BIASE", "PATH_BOTH", "PATH_SEARCH_BIASE"):
        print(rule + key + rule)
        calc_other_recommendation_parameters(df, paths[key], model_name)

In [12]:
# Reference model (OpenAI); the function resolves its own paths from the model name.
recommendation_accuracy(query_set, ModelEnum.OpenAI)

----------PATH_NO_BIASE----------
EmbeddingLoader initialised
Like 0.5306122448979592
 Avoid 0.9912280701754386
Cuisine 0.90625
Overall0.8093634383577992
----------PATH_SYSTEM_BIASE----------
EmbeddingLoader initialised
Like 0.4857142857142857
 Avoid 1.0
Cuisine 0.9
Overall0.7952380952380952
----------PATH_BOTH----------
EmbeddingLoader initialised
Like 0.45652173913043476
 Avoid 0.990990990990991
Cuisine 0.873015873015873
Overall0.7735095343790995
----------PATH_SEARCH_BIASE----------
EmbeddingLoader initialised
Like 0.45454545454545453
 Avoid 0.9906542056074766
Cuisine 0.8524590163934426
Overall0.7658862255154579


### Descriptive Data

### Recommendation Accuracy Ketogen

In [13]:
# From here on the global `paths` points at the OpenAI runs; the following
# cells of this section read it.
paths = get_paths(ModelEnum.OpenAI.name)
# Metrics over the full query set for the OpenAI runs (ref_include=False).
pr_auc_raw = calc_metrics(query_set=query_set, paths=paths, model_name=ModelEnum.OpenAI, ref_include=False)

10.904761904761903 100 22.978571428571428 100
75.0 100 61.3734126984127 100
10.342857142857138 100 55.0 100
91.0 100 41.22900241126103 100
94.0 100 34.003591546092125 100
               Macro Precision  Macro Recall  Macro F1  Micro Precision  \
Bias                                                                      
No Biase              0.109048      0.229786  0.147905         0.103371   
System Biase          0.750000      0.613734  0.675059         0.981366   
Search Engine         0.103429      0.550000  0.174115         0.104569   
Search Biase          0.910000      0.412290  0.567476         0.994667   
Both Biase            0.940000      0.340036  0.499414         0.994652   

               Micro Recall  Micro F1  Mean Average Precision  Mean PR-AUC  \
Bias                                                                         
No Biase           0.340741  0.158621                0.211093     0.178813   
System Biase       0.578755  0.728111                0.297362     0.7

### Mean Rounds

In [14]:
def calc_mean_rounds(df, paths, model_name: ModelEnum):
    """Print the mean of ``calc_rounds`` over all rows of ``df`` per bias condition.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. One line per condition is printed.
    """
    def average_rounds(frame, model: ModelEnum, Path):
        # Arithmetic mean of the per-conversation values (not a median).
        per_conversation = [
            calc_rounds(persona_id=row["id"], query=row["query"], model=model, Path=Path)
            for _, row in frame.iterrows()
        ]
        return np.mean(per_conversation)

    report = (
        ("Mean Rounds No Biase:", "PATH_NO_BIASE"),
        ("Mean Rounds System Biase:", "PATH_SYSTEM_BIASE"),
        ("Mean Search Biase:", "PATH_SEARCH_BIASE"),
        ("Mean Both Biase:", "PATH_BOTH"),
    )
    for label, key in report:
        print(label, average_rounds(df, model_name, paths[key]))

In [15]:
# Relies on the global `paths` set to the OpenAI runs in the cell above.
calc_mean_rounds(query_set, paths=paths, model_name=ModelEnum.OpenAI)

Mean Rounds No Biase: 1.57
Mean Rounds System Biase: 3.02
Mean Search Biase: 2.03
Mean Both Biase: 2.15


### Reward

In [16]:
def calc_reward(query_set, paths, model_name: ModelEnum):
    """Print the result of ``reward_average_calculation`` per bias condition.

    All four reward sets are loaded with ``get_reward_set`` first (system bias,
    no bias, search bias, both); afterwards one line per condition is printed in
    the same order.

    Args:
        query_set: Query/persona DataFrame handed through to ``get_reward_set``.
        paths: Mapping with the keys ``PATH_SYSTEM_BIASE``, ``PATH_NO_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. The results are printed.
    """
    conditions = (
        ("Durchschnittlicher Score für System Biase: ", "PATH_SYSTEM_BIASE"),
        ("Durchschnittlicher Score fuer No Biase: ", "PATH_NO_BIASE"),
        ("Durchschnittlicher Score für Search Biase: ", "PATH_SEARCH_BIASE"),
        ("Durchschnittlicher Score für Both Biase: ", "PATH_BOTH"),
    )
    reward_sets = [get_reward_set(query_set, model_name, paths[key]) for _, key in conditions]

    for (label, _), reward_set in zip(conditions, reward_sets):
        print(f"{label}{reward_average_calculation(reward_set)}")
    


In [17]:
# Relies on the global `paths` (OpenAI runs in this section).
calc_reward(query_set=query_set, paths=paths, model_name=ModelEnum.OpenAI)

Durchschnittlicher Score für System Biase: Score: 0.6136 bei gamma=1, normalize=True
Durchschnittlicher Score fuer No Biase: Score: 0.4841 bei gamma=1, normalize=True
Durchschnittlicher Score für Search Biase: Score: 0.5259 bei gamma=1, normalize=True
Durchschnittlicher Score für Both Biase: Score: 0.5391 bei gamma=1, normalize=True


### Path Length

In [18]:
def calc_average_path_length(df, paths, model_name: ModelEnum):
    """Print the mean of ``calc_path_length`` over all rows of ``df`` per bias condition.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. One line per condition is printed.
    """
    def average_path_length(frame, model: ModelEnum, Path):
        # Arithmetic mean of the per-conversation path lengths (not a median).
        lengths = [
            calc_path_length(persona_id=row["id"], query=row["query"], model=model, path=Path)
            for _, row in frame.iterrows()
        ]
        return np.mean(lengths)

    report = (
        ("Mean Path Length No Biase:", "PATH_NO_BIASE"),
        ("Mean Path Length Biase:", "PATH_SYSTEM_BIASE"),
        ("Mean Path Search Biase:", "PATH_SEARCH_BIASE"),
        ("Mean Path Both Biase:", "PATH_BOTH"),
    )
    for label, key in report:
        print(label, average_path_length(df, model_name, paths[key]))

In [19]:
# Relies on the global `paths` (OpenAI runs in this section).
calc_average_path_length(df=query_set,paths=paths, model_name=ModelEnum.OpenAI)


Mean Path Length No Biase: 6.75
Mean Path Length Biase: 11.19
Mean Path Search Biase: 8.14
Mean Path Both Biase: 8.56


### Routing Accuracy

In [20]:
def calc_routing_accuracy(query_set, paths, model_name: ModelEnum):
    """Print the average ``routing_accuracy`` per bias condition.

    All four reward sets are loaded with ``get_reward_set`` first (system bias,
    no bias, search bias, both). Each set is iterated episode by episode, every
    episode is scored with ``routing_accuracy`` and the scores are averaged.

    Args:
        query_set: Query/persona DataFrame handed through to ``get_reward_set``.
        paths: Mapping with the keys ``PATH_SYSTEM_BIASE``, ``PATH_NO_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. The results are printed with four decimals.
    """
    def routing_accuracy_calculation(reward_system):
        # Overall evaluation: mean of the per-episode routing accuracy.
        per_episode = [routing_accuracy(episode) for episode in reward_system]
        return f"Score: {np.mean(per_episode):.4f}"

    conditions = (
        ("Durchschnittlicher Score für System Biase: ", "PATH_SYSTEM_BIASE"),
        ("Durchschnittlicher Score fuer No Biase: ", "PATH_NO_BIASE"),
        ("Durchschnittlicher Score für Search Biase: ", "PATH_SEARCH_BIASE"),
        ("Durchschnittlicher Score fuer Both Biase: ", "PATH_BOTH"),
    )
    episode_sets = [get_reward_set(query_set, model_name, paths[key]) for _, key in conditions]

    for (label, _), episodes in zip(conditions, episode_sets):
        print(f"{label}{routing_accuracy_calculation(episodes)}")


In [21]:
# Relies on the global `paths` (OpenAI runs in this section).
calc_routing_accuracy(query_set=query_set,paths=paths, model_name=ModelEnum.OpenAI)

Durchschnittlicher Score für System Biase: Score: 0.8067
Durchschnittlicher Score fuer No Biase: Score: 0.7418
Durchschnittlicher Score für Search Biase: Score: 0.7633
Durchschnittlicher Score fuer Both Biase: Score: 0.7690


### Mean Time

In [22]:
# Relies on the global `paths` (OpenAI runs in this section).
# NOTE(review): the unit of the printed times is not visible here — confirm in analysis_helper.time.
calc_mean_time(query_set, paths=paths, model_name=ModelEnum.OpenAI)

Mean Time No Biase: 308.31
Mean Time System Biase: 493.52
Mean Time Search Biase: 357.86
Mean Both Biase: 406.13


### Last Reflector Answer

In [23]:
def get_last_reflector_answer(persona_id: int, query: str, model: ModelEnum, Path = None):
    """Tell whether the last REFLECTOR record of a conversation log accepted the result.

    The log is read line by line as JSON. Of all records whose ``"role"`` is
    ``"REFLECTOR"`` only the last one counts; its ``meta["decision"]`` is
    compared case-insensitively with ``"accept"``.

    Args:
        persona_id: Persona identifier, used to locate the log file.
        query: Query text, used to locate the log file.
        model: Model whose log is read.
        Path: Directory handed to ``get_file_path``.

    Returns:
        bool: ``True`` if the last reflector decision is "accept". ``False`` in
        every other case: no log file, unreadable file, no reflector record, or
        a record without a usable ``meta.decision``.
    """
    filepath = get_file_path(Path=Path, query=query, persona_id=persona_id, model=model)
    if filepath is None:
        # Previously 0 was returned here; False is equal to it and keeps the
        # return type consistent for the averaging done by callers.
        return False

    last_reflector = None
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip broken lines
                # Valid JSON that is not an object (e.g. a list) is skipped as
                # well instead of aborting the whole scan.
                if isinstance(obj, dict) and obj.get("role") == "REFLECTOR":
                    last_reflector = obj
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable log counts as "not accepted".
        return False

    if last_reflector is None:
        return False

    meta = last_reflector.get("meta")
    decision = meta.get("decision") if isinstance(meta, dict) else None
    return isinstance(decision, str) and decision.lower() == "accept"

### Task Success Rate

In [24]:
def calc_task_success_rate(query_set, paths, model:ModelEnum):
    """Print the task success rate per bias condition and return the raw flags.

    For every row of ``query_set`` and condition ``get_last_reflector_answer`` is
    called; the success rate is the mean of these values.

    Args:
        query_set: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model: Model whose conversation logs are evaluated.

    Returns:
        tuple of four lists with the per-query results, in the order
        (no bias, system bias, search bias, both).
    """
    def calc_individual_rate(frame, Path, model: ModelEnum):
        answers = [
            get_last_reflector_answer(persona_id=row["id"], query=row["query"], model=model, Path=Path)
            for _, row in frame.iterrows()
        ]
        return np.mean(answers), answers

    report = (
        ("Task Success Rate No Biase:", "PATH_NO_BIASE"),
        ("Task Success Rate Biase:", "PATH_SYSTEM_BIASE"),
        ("Task Success Rate Search Biase:", "PATH_SEARCH_BIASE"),
        ("Task Success Rate Both Biase:", "PATH_BOTH"),
    )
    # Evaluate every condition before anything is printed.
    outcomes = [calc_individual_rate(query_set, paths[key], model) for _, key in report]

    for (label, _), (rate, _) in zip(report, outcomes):
        print(label, rate)

    return tuple(answers for _, answers in outcomes)

In [25]:
# Per-query success flags for the OpenAI runs (uses the global `paths`).
no_biase_raw, system_biase_raw, search_biase_raw, search_both_biase = calc_task_success_rate(query_set=query_set, paths=paths,model=ModelEnum.OpenAI)
# Raw flags keyed by condition.
# NOTE(review): `eval_h2` is not read again in the visible cells and is
# overwritten in the Gemini Pro section.
eval_h2 = {
    "SystemBiase": system_biase_raw,
    "NoBiase":no_biase_raw,
    "SearchBiase":search_biase_raw,
    "BothBiase":search_both_biase
}

Task Success Rate No Biase: 0.93
Task Success Rate Biase: 0.6
Task Success Rate Search Biase: 0.84
Task Success Rate Both Biase: 0.83


### Calc Ketogen Ratio

In [26]:
def calc_keto(dict_biase: dict, df, keto_ratio_index: float = 0.8, mode: str = "top1") -> dict:
    """
    Score the recommended recipes of every query in ``df`` by their keto ratio.

    Args:
        dict_biase: ``{query -> [recipe_dict, ...]}`` or ``{query -> recipe_dict}``.
            A recipe dict is read through the keys ``"proteins"``, ``"fat"`` and
            ``"carbohydrates"``; missing keys count as 0.
        df: DataFrame with a ``"query"`` column; the result has one entry per row.
        keto_ratio_index: Ratio a recipe has to exceed to count as ketogenic.
            Only relevant for ``mode="any"``.
        mode: ``"top1"`` yields the keto ratio of the first recipe (a continuous
            value, not a 0/1 flag). ``"any"`` yields 1.0 if at least one recipe
            exceeds ``keto_ratio_index`` and 0.0 otherwise.

    Returns:
        ``{query -> score}``. Queries that are missing from ``dict_biase`` or have
        no usable recipe list get the score 0.

    Raises:
        ValueError: If ``mode`` is neither ``"top1"`` nor ``"any"``.
    """
    if mode not in ("top1", "any"):
        raise ValueError("mode must be 'top1' or 'any'")

    def _keto_ratio(rec: dict) -> float:
        ratio = calc_keto_ratio(
            protein_g=rec.get("proteins", 0),
            fat_g=rec.get("fat", 0),
            carbs_g=rec.get("carbohydrates", 0),
        )
        # An infinite ratio is mapped to 0 instead of dominating the statistics.
        if ratio == np.inf:
            return 0
        return ratio

    res = {}
    for _, row in df.iterrows():
        q = row["query"]

        # Does the query exist at all?
        if q not in dict_biase:
            res[q] = 0
            continue

        items = dict_biase[q]

        # Normalise to a list of recipes.
        if items is None:
            res[q] = 0
            continue
        if isinstance(items, dict):
            items = [items]
        if not isinstance(items, (list, tuple)) or len(items) == 0:
            res[q] = 0
            continue

        if mode == "top1":
            res[q] = float(_keto_ratio(items[0]))
        else:
            # Compare against the threshold. Relying on the truthiness of the
            # ratio would flag every recipe with a non-zero ratio as ketogenic.
            res[q] = float(any(_keto_ratio(r) > keto_ratio_index for r in items))

    return res


In [27]:
# Load the recommendation dictionaries of the OpenAI runs; the global `paths`
# has to point at the OpenAI directories here. All get_dicts_set calls use the
# same keyword form.
dict_search_engine, dict_search_engine_search = get_search_engine(paths['PATH_SEARCH_ENGINE'])
dict_system_biase, dict_system_biase_search, ref_system_biase = get_dicts_set(
    df=query_set, model=ModelEnum.OpenAI, Path=paths['PATH_SYSTEM_BIASE']
)
dict_no_biase, dict_no_biase_search, ref_no_biase = get_dicts_set(
    df=query_set, model=ModelEnum.OpenAI, Path=paths['PATH_NO_BIASE']
)
dict_search_biase, dict_search_biase_search, ref_search_biase = get_dicts_set(
    df=query_set, model=ModelEnum.OpenAI, Path=paths['PATH_SEARCH_BIASE']
)
dict_both, dict_both_search, ref_both_biase = get_dicts_set(
    df=query_set, model=ModelEnum.OpenAI, Path=paths['PATH_BOTH']
)

In [28]:
# Keto score per query and condition; calc_keto runs with its default mode "top1".
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the session; a rename would also have to cover the cells that read it.
eval = {
    "SystemBiase": calc_keto(df=query_set, dict_biase=dict_system_biase),
    "NoBiase":calc_keto(df=query_set, dict_biase=dict_no_biase),
    "SearchEngine": calc_keto(df=query_set, dict_biase=dict_search_engine),
    "SearchBiase":calc_keto(df=query_set, dict_biase=dict_search_biase),
    "BothBiase":calc_keto(df=query_set, dict_biase=dict_both)
}

In [29]:
# One row per query, one column per condition. pandas is already imported as
# `pd` in the import cell at the top of the notebook, so no re-import is needed.
df_wide = (
    pd.DataFrame(eval)
    .reset_index()
    .rename(columns={"index": "query"})
)

In [30]:
# Median keto score per condition. The statistic computed is np.median, so the
# printed label says "Median" (it used to say "Mean").
for col in df_wide.columns:
    if col == "query":
        continue
    x = df_wide[col].dropna().to_numpy()
    print(f"{col} Median: {np.median(x)}")


SystemBiase Mean: 1.010412667903395
NoBiase Mean: 0.2356610811198101
SearchEngine Mean: 0.2503780723728942
SearchBiase Mean: 0.9819557717116554
BothBiase Mean: 1.1648577159671825


### Costs

In [31]:
def calc_open_ai_costs(df, paths, model_name: ModelEnum):
    """Print the total and the mean per-conversation costs for each bias condition.

    ``calc_openai_costs`` is called for every row of ``df`` and condition.
    Conversations for which it fails are reported and left out of the statistics.

    Args:
        df: DataFrame with the columns ``"id"`` and ``"query"``.
        paths: Mapping with the keys ``PATH_NO_BIASE``, ``PATH_SYSTEM_BIASE``,
            ``PATH_SEARCH_BIASE`` and ``PATH_BOTH``.
        model_name: Model whose conversation logs are evaluated.

    Returns:
        None. The results are printed.
    """
    # NOTE(review): calc_openai_costs is not imported in the visible import cell;
    # confirm where it is defined so that the notebook survives Restart & Run All.
    def calc_costs(df, model: ModelEnum, Path):
        ls_costs = []
        ls_input_token = []
        ls_output_token = []

        for _, row in df.iterrows():
            persona_id = row["id"]
            query = row["query"]
            try:
                input_token, output_token, costs = calc_openai_costs(persona_id=persona_id, query=query, model=model, Path=Path)
            except NameError:
                # A missing helper is a setup error, not a bad conversation.
                # Swallowing it would silently yield empty statistics.
                raise
            except Exception as exc:
                # Best effort: report the failing query together with the reason
                # and carry on with the next conversation.
                print(f"{query} -> skipped ({type(exc).__name__}: {exc})")
                continue
            ls_costs.append(costs)
            ls_input_token.append(input_token)
            ls_output_token.append(output_token)
        return ls_input_token, ls_output_token, ls_costs

    def mean_or_nan(values):
        # np.mean of an empty list also gives nan, but emits a RuntimeWarning.
        return np.mean(values) if values else float("nan")

    # Only the cost lists are reported; the token counts are not used here.
    _, _, ls_costs_no = calc_costs(df, model_name, paths['PATH_NO_BIASE'])
    _, _, ls_costs_system = calc_costs(df, model_name, paths['PATH_SYSTEM_BIASE'])
    _, _, ls_costs_search = calc_costs(df, model_name, paths['PATH_SEARCH_BIASE'])
    _, _, ls_costs_both = calc_costs(df, model_name, paths['PATH_BOTH'])

    total_costs = sum(ls_costs_no) + sum(ls_costs_system) + sum(ls_costs_search) + sum(ls_costs_both)
    # Number of conversations with a cost entry in the no-bias condition.
    print(len(ls_costs_no))
    print(f'Total Costs: ${total_costs}')
    print(f"Costs No Biase: ${mean_or_nan(ls_costs_no)} ")
    print(f"Costs System Biase: ${mean_or_nan(ls_costs_system)}")
    print(f"Costs Search Biase: ${mean_or_nan(ls_costs_search)}")
    print(f"Costs Both Biase: ${mean_or_nan(ls_costs_both)}")

In [32]:
# Relies on the global `paths` (OpenAI runs in this section).
calc_open_ai_costs(df=query_set,paths=paths, model_name=ModelEnum.OpenAI)

100
Total Costs: $20.091077750000004
Costs No Biase: $0.03790495000000001 
Costs System Biase: $0.06465689499999999
Costs Search Biase: $0.045768772500000006
Costs Both Biase: $0.052580160000000015


## Comparison with Gemini Pro

In [33]:
# Same evaluation as for the reference model, now for Gemini Pro; the function
# resolves its own paths from the model name.
recommendation_accuracy(query_set, ModelEnum.GEMINIPRO)

----------PATH_NO_BIASE----------
EmbeddingLoader initialised
Like 0.5510204081632653
 Avoid 0.9823008849557522
Cuisine 0.8461538461538461
Overall0.7931583797576213
----------PATH_SYSTEM_BIASE----------
EmbeddingLoader initialised
Like 0.5121951219512195
 Avoid 0.9900990099009901
Cuisine 0.8392857142857143
Overall0.780526615379308
----------PATH_BOTH----------
EmbeddingLoader initialised
Like 0.5128205128205128
 Avoid 0.9897959183673469
Cuisine 0.9655172413793104
Overall0.8227112241890567
----------PATH_SEARCH_BIASE----------
EmbeddingLoader initialised
Like 0.5641025641025641
 Avoid 0.98989898989899
Cuisine 0.8771929824561403
Overall0.8103981788192315


### Ketogen Accuracy Recommendation

In [34]:
# From here on the global `paths` points at the Gemini Pro runs; the following
# cells read it. `pr_auc_raw` from the OpenAI section is overwritten.
paths = get_paths(ModelEnum.GEMINIPRO.name)
pr_auc_raw = calc_metrics(query_set=query_set, paths=paths, model_name=ModelEnum.GEMINIPRO, ref_include=False)

10.866666666666667 100 22.87777777777778 100
74.75 100 53.38748196248197 100
10.342857142857138 100 55.0 100
85.0 100 31.817554242023142 100
85.0 100 21.447853239792103 100
               Macro Precision  Macro Recall  Macro F1  Micro Precision  \
Bias                                                                      
No Biase              0.108667      0.228778  0.147346         0.124688   
System Biase          0.747500      0.533875  0.622880         0.853881   
Search Engine         0.103429      0.550000  0.174115         0.104569   
Search Biase          0.850000      0.318176  0.463028         1.000000   
Both Biase            0.850000      0.214479  0.342528         1.000000   

               Micro Recall  Micro F1  Mean Average Precision  Mean PR-AUC  \
Bias                                                                         
No Biase           0.263158  0.169205                0.196583     0.177417   
System Biase       0.504043  0.633898                0.222287     0

### Mean Rounds

In [35]:
# Relies on the global `paths` (Gemini Pro runs in this section).
calc_mean_rounds(query_set, paths=paths, model_name=ModelEnum.GEMINIPRO)

Mean Rounds No Biase: 2.18
Mean Rounds System Biase: 4.25
Mean Search Biase: 2.62
Mean Both Biase: 3.89


### Reward Calculation

In [36]:
# Relies on the global `paths` (Gemini Pro runs in this section).
calc_reward(query_set=query_set, paths=paths, model_name=ModelEnum.GEMINIPRO)

Durchschnittlicher Score für System Biase: Score: 0.7317 bei gamma=1, normalize=True
Durchschnittlicher Score fuer No Biase: Score: 0.5504 bei gamma=1, normalize=True
Durchschnittlicher Score für Search Biase: Score: 0.5975 bei gamma=1, normalize=True
Durchschnittlicher Score für Both Biase: Score: 0.7049 bei gamma=1, normalize=True


### Average Path Length

In [37]:
# Relies on the global `paths` (Gemini Pro runs in this section).
calc_average_path_length(df=query_set,paths=paths, model_name=ModelEnum.GEMINIPRO)


Mean Path Length No Biase: 8.56
Mean Path Length Biase: 14.75
Mean Path Search Biase: 9.87
Mean Path Both Biase: 13.69


### Routing Accuracy

In [38]:
# Relies on the global `paths` (Gemini Pro runs in this section).
calc_routing_accuracy(query_set=query_set,paths=paths, model_name=ModelEnum.GEMINIPRO)

Durchschnittlicher Score für System Biase: Score: 0.8662
Durchschnittlicher Score fuer No Biase: Score: 0.7746
Durchschnittlicher Score für Search Biase: Score: 0.7988
Durchschnittlicher Score fuer Both Biase: Score: 0.8524


### Mean Time

In [39]:
# Relies on the global `paths` (Gemini Pro runs in this section).
calc_mean_time(query_set, paths=paths, model_name=ModelEnum.GEMINIPRO)

Mean Time No Biase: 333.78
Mean Time System Biase: 905.95
Mean Time Search Biase: 395.26
Mean Both Biase: 721.99


### Task Success Rate

In [40]:
# Per-query success flags for the Gemini Pro runs (uses the global `paths`).
# These names and `eval_h2` replace the values computed for OpenAI above.
no_biase_raw, system_biase_raw, search_biase_raw, search_both_biase = calc_task_success_rate(query_set=query_set, paths=paths,model=ModelEnum.GEMINIPRO)
eval_h2 = {
    "SystemBiase": system_biase_raw,
    "NoBiase":no_biase_raw,
    "SearchBiase":search_biase_raw,
    "BothBiase":search_both_biase
}

Task Success Rate No Biase: 0.85
Task Success Rate Biase: 0.33
Task Success Rate Search Biase: 0.78
Task Success Rate Both Biase: 0.43


In [41]:
# Load the recommendation dictionaries of the Gemini Pro runs; the global
# `paths` has to point at the Gemini Pro directories here. All get_dicts_set
# calls use the same keyword form.
dict_search_engine, dict_search_engine_search = get_search_engine(paths['PATH_SEARCH_ENGINE'])
dict_system_biase, dict_system_biase_search, ref_system_biase = get_dicts_set(
    df=query_set, model=ModelEnum.GEMINIPRO, Path=paths['PATH_SYSTEM_BIASE']
)
dict_no_biase, dict_no_biase_search, ref_no_biase = get_dicts_set(
    df=query_set, model=ModelEnum.GEMINIPRO, Path=paths['PATH_NO_BIASE']
)
dict_search_biase, dict_search_biase_search, ref_search_biase = get_dicts_set(
    df=query_set, model=ModelEnum.GEMINIPRO, Path=paths['PATH_SEARCH_BIASE']
)
dict_both, dict_both_search, ref_both_biase = get_dicts_set(
    df=query_set, model=ModelEnum.GEMINIPRO, Path=paths['PATH_BOTH']
)

In [42]:
# Keto score per query and condition for Gemini Pro; calc_keto runs with its
# default mode "top1".
# NOTE(review): the name `eval` shadows Python's built-in eval(); a rename would
# also have to cover the cell that reads it.
eval = {
    "SystemBiase": calc_keto(df=query_set, dict_biase=dict_system_biase),
    "NoBiase":calc_keto(df=query_set, dict_biase=dict_no_biase),
    "SearchEngine": calc_keto(df=query_set, dict_biase=dict_search_engine),
    "SearchBiase":calc_keto(df=query_set, dict_biase=dict_search_biase),
    "BothBiase":calc_keto(df=query_set, dict_biase=dict_both)
}

In [43]:
# One row per query, one column per condition.
df_wide = (
    pd.DataFrame(eval)
    .reset_index()
    .rename(columns={"index": "query"})
)

In [44]:
# Median keto score per condition. The statistic computed is np.median, so the
# printed label says "Median" (it used to say "Mean").
for col in df_wide.columns:
    if col == "query":
        continue
    x = df_wide[col].dropna().to_numpy()
    print(f"{col} Median: {np.median(x)}")

SystemBiase Mean: 1.0010899274947511
NoBiase Mean: 0.27783885940861475
SearchEngine Mean: 0.2503780723728942
SearchBiase Mean: 0.972796922504326
BothBiase Mean: 1.0539998428126967
