## Evaluate Dataset from Paper "Personalized Food Recommendation as Constrained Question Answering over a Large scale Food Knowledge Graph"

In [None]:
import pandas as pd 
import numpy as np 
from foodrec.config.structure.paths import DATASET_PATHS

In [18]:
df = pd.read_json(DATASET_PATHS / "test_qas_090820.json", orient="records", lines=True)
df_sample = pd.read_csv(DATASET_PATHS / "Annotation.csv", delimiter=";")
df_hand_annotation = pd.read_csv(DATASET_PATHS / "Hand_Annotation.csv", delimiter=";")

In [19]:
df.head()

Unnamed: 0,entities,topicKey,rel_path,qOriginText,qType,multi_tag_type,origin_answers,answers,log_dishes,qText,guideline,persona,domainType,qId,explicit_nutrition
0,"[[papaya, tag]]",[http://idea.rpi.edu/heals/kb/tag/papaya],[tagged_dishes],Can you suggest papaya recipes that do not con...,constraint,none,"[Thai Curried Prawn Soup, Pineapple, Papaya & ...",[Tropical Quinoa (Ww)],"[Irish Roasted Salmon, Slammin Salmon, Salmon ...",Can you suggest papaya recipes that do not con...,"{'saturated fat': {'percentage': 'calories', '...","{'ingredient_likes': ['scallions'], 'ingredien...",in-domain,constraint-qas-test-00000,
1,"[[labor-day, tag]]",[http://idea.rpi.edu/heals/kb/tag/labor-day],[tagged_dishes],What are low protein labor-day recipes which d...,constraint,none,"[Bacon-Wrapped Tater Tots, Southwestern Roaste...","[Mandarin Chicken Pasta Salad - Pampered Chef,...",[Fassolia Gigantes Plaki (Giant Beans Baked in...,What are low protein labor-day recipes which d...,"{'carbohydrates': {'unit': 'g', 'meal': {'type...","{'ingredient_likes': ['red bell peppers'], 'in...",in-domain,constraint-qas-test-00001,"[{'nutrition': 'protein', 'level': 'low', 'ran..."
2,"[[shakes, tag]]",[http://idea.rpi.edu/heals/kb/tag/shakes],[tagged_dishes],Can you suggest shakes recipes that do not con...,constraint,none,"[Baileys Frappe, Chocolate Coconut Milkshake, ...","[Grape Juice Shake, My Great Grape and Banana ...",[Fassolia Gigantes Plaki (Giant Beans Baked in...,Can you suggest shakes recipes that do not con...,"{'sugar': {'unit': 'g', 'meal': {'type': 'rang...","{'ingredient_likes': ['grape juice'], 'ingredi...",in-domain,constraint-qas-test-00002,
3,"[[rosh-hashana, tag]]",[http://idea.rpi.edu/heals/kb/tag/rosh-hashana],[tagged_dishes],Suggest low carbohydrates rosh-hashana dishes ...,constraint,none,"[Winter Fruit Salad, Canadian \'old-Time\' Bra...",[Red Snapper Baked with Orange],"[Irish Roasted Salmon, Slammin Salmon, Salmon ...",Suggest low carbohydrates rosh-hashana dishes ...,"{'sugar': {'unit': 'g', 'meal': {'type': 'rang...","{'ingredient_likes': ['red snapper'], 'ingredi...",in-domain,constraint-qas-test-00003,"[{'nutrition': 'carbohydrates', 'level': 'low'..."
4,"[[cherries, tag]]",[http://idea.rpi.edu/heals/kb/tag/cherries],[tagged_dishes],What cherries dishes do not contain ingredient...,constraint,none,"[Italian Cherry Sauce, Cherry Berry Smoothies,...",[Chocolate-Dipped Cherries With Pistachios],"[Irish Roasted Salmon, Slammin Salmon, Salmon ...",What cherries dishes do not contain ingredient...,"{'sugar': {'unit': 'g', 'meal': {'type': 'rang...","{'ingredient_likes': ['heavy cream', 'pistachi...",in-domain,constraint-qas-test-00004,


## Exploratory Analysis

"" We are particulary interested in the personas and Queries as one example shown below. The problem is we must adapt those to our possible personas including the variables cuisine and time""

First lets analyse different types of queries

In [20]:
print("One Example of a persona and query:")
row = df.iloc[40]
print(f"Query: {row['qOriginText']}")
print(f"Persona: {row['persona']}")

One Example of a persona and query:
Query: What squid dishes can I make that do not contain toasted sesame seeds?
Persona: {'ingredient_likes': ['carrot'], 'ingredient_dislikes': ['lobsters'], 'constrained_entities': {'1': ['carrot', 'calories with desired range 100.0 calories to 800.0 calories'], '2': ['lobsters', 'toasted sesame seeds']}}


## Select Ingredients

## Bring Personas and Sample together

In [21]:
mapping = (
    df.drop_duplicates('qOriginText', keep='first')
      .set_index('qOriginText')['persona']
)
df_sample['persona'] = df_sample['query'].map(mapping)


In [22]:
df_sample

Unnamed: 0,query,LLAMA,Gemini,OpenAI,persona
0,Can you suggest flat-shapes recipes that do no...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'ingredient_likes': ['ground cumin'], 'ingred..."
1,Can you suggest middle_east or european recipe...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...",
2,Can you suggest non-alcoholic recipes that do ...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...",{'ingredient_likes': ['non - dairy coffee crea...
3,Can you suggest octopus recipes that do not co...,"{'time': [], 'ingredients_included': ['octopus...","{'time': [], 'ingredients_included': ['octopus...","{'time': [], 'ingredients_included': ['octopus...","{'ingredient_likes': ['light beer'], 'ingredie..."
4,Can you suggest peruvian recipes that do not c...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'ingredient_likes': ['smoked tofu'], 'ingredi..."
...,...,...,...,...,...
95,What reynolds-wrap dishes don't have cornstarc...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'ingredient_likes': ['green bell pepper'], 'i..."
96,What somalian dishes can I make without chicke...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'ingredient_likes': ['lemons'], 'ingredient_d..."
97,What st-patricks-day dishes do not contain ing...,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...",{'ingredient_likes': ['semi - sweet chocolate ...
98,What stocks dishes don't have dried navy beans?,"{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'time': [], 'ingredients_included': [], 'ingr...","{'ingredient_likes': ['cauliflower'], 'ingredi..."


In [None]:
import ast
import json
import math

def drop_key(val, key="constrained_entities"):
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return val
    
    if isinstance(val, str):
        s = val.strip()
        try:
            if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
                d = json.loads(s)
            else:
                d = ast.literal_eval(s)
        except Exception:
            return val
    elif isinstance(val, dict):
        d = val.copy()
    else:
        return val

    d.pop(key, None)  
    return d

df_sample["persona_processed"] = df_sample["persona"].apply(
    lambda v: json.dumps(drop_key(v), ensure_ascii=False) if isinstance(drop_key(v), (dict, list)) else drop_key(v)
)

In [24]:
df_sample["persona_processed"].iloc[0]

'{"ingredient_likes": ["ground cumin"], "ingredient_dislikes": ["rye flour", "cornmeal mix"]}'

In [26]:
df_filtered = df_sample[['persona_processed', 'query']].copy()

In [27]:
df_filtered['persona_processed'].iloc[0]

'{"ingredient_likes": ["ground cumin"], "ingredient_dislikes": ["rye flour", "cornmeal mix"]}'

In [28]:
def replace_nutritions(text):
    text = text.replace("carbohydrates", "")
    text = text.replace("protein", "")
    text = text.replace("fat", "")
    text = text.replace("low", "")
    text = text.replace("high", "")
    text = text.replace("medium", "")
    text = text.replace("  ", " ")
    return text

In [29]:
df_filtered['query'] = df_filtered['query'].apply(replace_nutritions)
df_filtered["id"] = range(1, len(df_filtered) + 1)


In [16]:
df_filtered.to_csv("zw_personas.csv")

## Query Analysis

In [None]:
import ast
import math
import json
import pandas as pd

def to_dict(val):
    if val is None or (isinstance(val, float) and math.isnan(val)):
        return None
    if isinstance(val, dict):
        return val
    if isinstance(val, str):
        s = val.strip()
        try:
            return json.loads(s)
        except Exception:
            pass
        try:
            return ast.literal_eval(s)
        except Exception:
            return None
    return None

df_sample['Gemini_dict'] = df_sample['Gemini'].apply(to_dict)

fields = ["time", "ingredients_included", "ingredients_avoid", "cuisine", "calories"]
for field in fields:
    df_sample[field] = df_sample['Gemini_dict'].apply(lambda d: d.get(field) if isinstance(d, dict) else None)


In [14]:
df_sample['Gemini'].iloc[0]

"{'time': [], 'ingredients_included': [], 'ingredients_avoid': ['mustard powder', 'anise'], 'cuisine': ['latin', 'europe', 'asia', 'middle_east'], 'calories': []}"

In [15]:
df_sample["cuisine"] = df_sample["cuisine"].apply(
    lambda v: 0 if isinstance(v, (list, tuple, set)) and len(v) >= 2 else v
)


In [None]:
import pandas as pd
import numpy as np

def summarize_column(df, colname):
    s = df[colname]

    print(f"\n=== {colname} ===")
    print("Non-null:", s.notna().sum())

    if s.apply(lambda x: isinstance(x, (list, tuple, set))).any():
        combos = (
            s.dropna()
             .apply(lambda v: tuple(v) if isinstance(v, (list, tuple, set)) else v)
        )
        print("Unique combinations (row-level):", combos.nunique())

        exploded = df.explode(colname)[colname].dropna()
        exploded = exploded.apply(lambda v: tuple(v) if isinstance(v, (list, tuple, set)) else v)
        print("Unique items:", exploded.nunique())
        print("Top 10 items:")
        print(exploded.value_counts().head(10))

        if exploded.dtype.kind in "iufc":  
            print("\nNumeric item stats:")
            print(exploded.astype(float).describe())

        s_scalar = s.apply(lambda v: v[0] if isinstance(v, (list, tuple)) and len(v)==1 else np.nan)
        if s_scalar.notna().any():
            print("\nSingle-value list -> scalar stats:")
            print(pd.to_numeric(s_scalar, errors='coerce').describe())

    else:
        if pd.api.types.is_numeric_dtype(s):
            print(s.describe())
        else:
            print("Unique:", s.nunique(dropna=True))
            print(s.value_counts(dropna=True).head(10))

for field in ["time", "ingredients_included", "ingredients_avoid", "cuisine", "calories"]:
    summarize_column(df_sample, field)



=== time ===
Non-null: 100
Unique combinations (row-level): 3
Unique items: 2
Top 10 items:
time
fast    4
slow    1
Name: count, dtype: int64

=== ingredients_included ===
Non-null: 100
Unique combinations (row-level): 44
Unique items: 48
Top 10 items:
ingredients_included
chocolate    2
oatmeal      2
whitefish    2
wild-game    2
trout        2
sole         2
flounder     2
grapes       2
cherries     2
plums        1
Name: count, dtype: int64

Single-value list -> scalar stats:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: ingredients_included, dtype: float64

=== ingredients_avoid ===
Non-null: 100
Unique combinations (row-level): 100
Unique items: 114
Top 10 items:
ingredients_avoid
white bread                   2
mozzarella cheese             2
mustard powder                1
jalapeno                      1
cracked wheat                 1
garlic                        1
unsweetened dark chocolate    1
mint         

## Summary

We got 100 real queries + personas from the Paper above, which we now can use to test our model. We added randomly cuisine and diet preferences and deleted stuff like low carb or high carbohydrates, because we want that it comes from the system itself