In [1]:
from fuzzywuzzy import process, fuzz
import re
import matplotlib.pyplot as plt
import numpy as np
from rapidfuzz import fuzz
import os
import json
import string
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

In [2]:
# Initialize SPARQL endpoint and model
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")
model = SentenceTransformer('all-MiniLM-L6-v2')

CACHE_PATH = "agro_cache.json"

In [3]:
def load_cache():
    if not os.path.exists(CACHE_PATH):
        return {}
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
        return cache

def save_cache(cache):
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)

In [4]:
def semantic_best_match(value, candidates):
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = []
    for c in candidates:
        candidate_texts.append(c["label"])
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]

In [5]:
def query_agrovoc(value, cache):


    if value in cache:
        print(f"Cache hit: '{value}' found in cache")
        return cache[value]
    
    print(f"Cache miss: '{value}' not in cache, querying AGROVOC endpoint")
    # Run SPARQL query
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?label WHERE {{
        ?concept skos:prefLabel ?label .
        FILTER(LANG(?label) = "en" && REGEX(?label, "{value}", "i"))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    candidates = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"]
        uri = result["concept"]["value"]
        candidate = {"label": label, "uri": uri}
        candidates.append(candidate)
    if not candidates:
        match = {"label": None, "uri": None}
    else:
        best = semantic_best_match(value, candidates)
        match = best
    # Cache the result
    cache[value] = match
    save_cache(cache)
    return match

In [6]:
def enrich_with_agrovoc(df, column_name):
    cache = load_cache()
    labels = []
    uris = []
    for val in df[column_name]:
        match = query_agrovoc(val, cache)
        labels.append(match["label"])
        uris.append(match["uri"])
    df["AGROVOC_label"] = labels
    df["AGROVOC_uri"] = uris
    return df

In [7]:
# Load the food consumption data
df_consumption = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Load the SuEatableLife dataset
df_wf = pd.read_excel("data/sueatablelife_dataset.xlsx", sheet_name="SEL WF for users")
df_cf = pd.read_excel("data/sueatablelife_dataset.xlsx", sheet_name="SEL CF for users")

In [8]:
df_consumption

Unnamed: 0,Survey's country,Survey start year,Survey name,Population Group (L2),Exposure hierarchy (L1),Exposure hierarchy (L2),Exposure hierarchy (L3),Exposure hierarchy (L4),Exposure hierarchy (L5),Exposure hierarchy (L6),...,Number of consumers,Mean,Standard Deviation,5th percentile,10th percentile,Median,95th percentile,97.5th percentile,99th percentile,Comment
0,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Beer and beer-like beverage,Beer,Beer,Beer,Beer,...,9,8.40,79.01,0.0,0.0,0.0,0.00,7.88,165.00,"'Number of observations lower than 298, the 99..."
1,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Liqueurs,Liqueurs,Liqueurs,Liqueurs,...,2,0.00,0.01,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
2,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Unsweetened spirits,Spirits from fruit,Spirits made from fruits other than stone fruits,Spirits made from fruits other than stone fruits,...,1,0.05,0.75,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
3,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,Sparkling wine,Sparkling wine,Sparkling wine,...,1,0.08,1.28,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
4,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,"Wine, red","Wine, red","Wine, red",...,16,1.19,15.21,0.0,0.0,0.0,0.09,0.12,2.96,"'Number of observations lower than 298, the 99..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit",Soft drink with bitter principle,...,2,0.58,5.11,0.0,0.0,0.0,0.00,0.00,42.20,"'Number of observations lower than 180, the 97..."
1485,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, flavoured, no fruit",...,1,1.03,12.81,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1486,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, lemon flavour",...,1,0.90,11.21,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1487,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, with fruit juice (fruit content be...","Fruit soft drink, orange",...,2,0.91,8.14,0.0,0.0,0.0,0.00,0.00,64.95,"'Number of observations lower than 180, the 97..."


In [9]:
df_consumption = df_consumption[(df_consumption["Exposure hierarchy (L7)"] != "Natural mineral water") &
    (df_consumption["Exposure hierarchy (L7)"] != "Tap water") &
    (df_consumption["Exposure hierarchy (L7)"] != "Filtered tap water")]

top_n = df_consumption.groupby("Exposure hierarchy (L5)", as_index=False)["Mean"].sum().sort_values('Mean', ascending=False).head(15)

top_n[["Exposure hierarchy (L5)", "Mean"]]

Unnamed: 0,Exposure hierarchy (L5),Mean
147,Cow milk,307.48
120,Coffee (average strength) beverage,175.78
8,Apples,134.45
524,Tomatoes,134.18
396,Potatoes,123.47
554,"Wheat bread and rolls, white (refined flour)",122.61
565,"Wine, red",108.29
95,Chicken fresh meat,99.33
177,Dried pasta,88.81
126,"Cola beverages, caffeinic",85.1


In [10]:
# Example: merge rows where column == "Milk" or "Whole milk"
rows_to_merge = top_n[top_n["Exposure hierarchy (L5)"].isin(["Coffee (average strength) beverage", "Coffee espresso (beverage)"])]
merged_row = rows_to_merge.sum(numeric_only=True)
merged_row["Exposure hierarchy (L5)"] = "Coffee"

# Drop the original rows and append the merged row
top_n = top_n[~top_n["Exposure hierarchy (L5)"].isin(["Coffee (average strength) beverage", "Coffee espresso (beverage)"])]
top_n = pd.concat([top_n, pd.DataFrame([merged_row])], ignore_index=True)

In [11]:
top_n = top_n.sort_values("Mean", ascending=False).reset_index(drop=True)

top_n

Unnamed: 0,Exposure hierarchy (L5),Mean
0,Cow milk,307.48
1,Coffee,253.19
2,Apples,134.45
3,Tomatoes,134.18
4,Potatoes,123.47
5,"Wheat bread and rolls, white (refined flour)",122.61
6,"Wine, red",108.29
7,Chicken fresh meat,99.33
8,Dried pasta,88.81
9,"Cola beverages, caffeinic",85.1


In [17]:
top_n_enriched = enrich_with_agrovoc(top_n, "Exposure hierarchy (L5)")

top_n_enriched

Cache hit: 'Cow milk' found in cache
Cache hit: 'Coffee' found in cache
Cache hit: 'Apples' found in cache
Cache hit: 'Tomatoes' found in cache
Cache hit: 'Potatoes' found in cache
Cache hit: 'Wheat bread and rolls, white (refined flour)' found in cache
Cache hit: 'Wine, red' found in cache
Cache hit: 'Chicken fresh meat' found in cache
Cache hit: 'Dried pasta' found in cache
Cache hit: 'Cola beverages, caffeinic' found in cache
Cache hit: 'Olive oil, virgin or extra-virgin' found in cache
Cache hit: 'Beer' found in cache
Cache hit: 'Cow, ox or bull fresh meat' found in cache
Cache hit: 'Common peaches' found in cache


Unnamed: 0,Exposure hierarchy (L5),Mean,clean_label,AGROVOC_label,AGROVOC_uri
0,Cow milk,307.48,cow milk,cow milk,http://aims.fao.org/aos/agrovoc/c_16080
1,Coffee,253.19,coffee,coffee,http://aims.fao.org/aos/agrovoc/c_1731
2,Apples,134.45,apple,apples,http://aims.fao.org/aos/agrovoc/c_541
3,Tomatoes,134.18,tomatoe,tomatoes,http://aims.fao.org/aos/agrovoc/c_7805
4,Potatoes,123.47,potatoe,potatoes,http://aims.fao.org/aos/agrovoc/c_13551
5,"Wheat bread and rolls, white (refined flour)",122.61,bread and rolls white,bread,http://aims.fao.org/aos/agrovoc/c_1071
6,"Wine, red",108.29,wine red,red wines,http://aims.fao.org/aos/agrovoc/c_28614
7,Chicken fresh meat,99.33,chicken,chicken meat,http://aims.fao.org/aos/agrovoc/c_24000
8,Dried pasta,88.81,dried pasta,,
9,"Cola beverages, caffeinic",85.1,cola beverage,soft drinks,http://aims.fao.org/aos/agrovoc/c_7149


In [12]:
# List of (pattern, replacement) in priority order:
PLURAL_RULES = [
    # -ies → -y      (e.g. “berries” → “berry”)
    (r'(?i)([a-z]+)ies$', r'\1y'),
    # -ves → -f      (e.g. “wolves” → “wolf”)
    (r'(?i)([a-z]+)ves$', r'\1f'),
    # -oes → -oe     (e.g. “heroes” → “heroe”—rare, you may want 'o')
    (r'(?i)([a-z]+)oes$', r'\1oe'),
    # -ses → -s      (e.g. “dresses” → “dress”)
    (r'(?i)([a-z]+)ses$', r'\1s'),
    # -xes → -x      (e.g. “boxes” → “box”)
    (r'(?i)([a-z]+)xes$', r'\1x'),
    # -ches/-shes → -ch/-sh  (e.g. “churches” → “church”)
    (r'(?i)([a-z]+(?:ch|sh))es$', r'\1'),
    # -s → ''        (catch‑all; e.g. “cars” → “car”)
    (r'(?i)([a-z]+)s$', r'\1'),
]

def singularize(word: str) -> str:
    """Apply common English plural→singular regex rules."""
    for pattern, repl in PLURAL_RULES:
        if re.search(pattern, word):
            return re.sub(pattern, repl, word)
    return word  # no rule matched

In [13]:
# Function to normalize labels
def normalize_label(s):
    s = s.lower().strip().replace('-', ' ').replace('_', ' ')
     # Remove numbers and letters in brackets, asteriks, punctuation
    s = re.sub(r'\(.*?\)', '', s)  # Remove text in brackets
    s = re.sub(r'\d+', '', s)  # Remove numbers
    s = re.sub(r'\*', '', s)  # Remove asterisks
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)       # drop punctuation
    s = re.sub(r'\b(semi[- ]skimmed|organic|low[- ]fat)\b', '', s)
    s = re.sub(r'\s+', ' ', s)               # collapse whitespace
    s = re.sub(r"(fresh|caffeinic|common)", "", s) # remove common useless attributes
    s = re.sub(r"wheat bread", "bread", s)
    s = s.strip()  # Remove leading and trailing whitespace
    s = re.sub(r"meat", "", s) # remove meat to improve the results of fuzzy matching for different animal meat
    s = re.sub(r"or", "", s)
    # handling plurals
    s = singularize(s)

    return s

In [14]:
s = "apples"

print(normalize_label(s))

apple


In [15]:
top_n['clean_label'] = top_n["Exposure hierarchy (L5)"].apply(normalize_label)

In [16]:
top_n_enriched = enrich_with_agrovoc(top_n, "clean_label")

top_n_enriched

Cache miss: 'cow milk' not in cache, querying AGROVOC endpoint
Cache miss: 'coffee' not in cache, querying AGROVOC endpoint
Cache miss: 'apple' not in cache, querying AGROVOC endpoint
Cache miss: 'tomatoe' not in cache, querying AGROVOC endpoint
Cache miss: 'potatoe' not in cache, querying AGROVOC endpoint
Cache miss: 'bread and rolls white' not in cache, querying AGROVOC endpoint
Cache miss: 'wine red' not in cache, querying AGROVOC endpoint
Cache miss: 'chicken  ' not in cache, querying AGROVOC endpoint
Cache miss: 'dried pasta' not in cache, querying AGROVOC endpoint
Cache miss: 'cola beverage' not in cache, querying AGROVOC endpoint
Cache miss: 'olive oil virgin  extra virgin' not in cache, querying AGROVOC endpoint
Cache miss: 'beer' not in cache, querying AGROVOC endpoint
Cache miss: 'cow ox  bull  ' not in cache, querying AGROVOC endpoint
Cache miss: 'peach' not in cache, querying AGROVOC endpoint


Unnamed: 0,Exposure hierarchy (L5),Mean,clean_label,AGROVOC_label,AGROVOC_uri
0,Cow milk,307.48,cow milk,cow milk,http://aims.fao.org/aos/agrovoc/c_16080
1,Coffee,253.19,coffee,coffee,http://aims.fao.org/aos/agrovoc/c_1731
2,Apples,134.45,apple,apples,http://aims.fao.org/aos/agrovoc/c_541
3,Tomatoes,134.18,tomatoe,tomatoes,http://aims.fao.org/aos/agrovoc/c_7805
4,Potatoes,123.47,potatoe,potatoes,http://aims.fao.org/aos/agrovoc/c_13551
5,"Wheat bread and rolls, white (refined flour)",122.61,bread and rolls white,,
6,"Wine, red",108.29,wine red,,
7,Chicken fresh meat,99.33,chicken,,
8,Dried pasta,88.81,dried pasta,,
9,"Cola beverages, caffeinic",85.1,cola beverage,,


In [None]:
top_n

In [None]:
df_cf['clean_label'] = df_cf["Food commodity ITEM"].apply(normalize_label)
df_wf['clean_label'] = df_wf["Food commodity ITEM"].apply(normalize_label)

In [None]:
# Create a mapping from top_n foods to the closest cF foods
matches = {}
for food_top in top_n['clean_label']:
    result = process.extractOne(food_top, df_cf['clean_label'], score_cutoff=85)
    if result is not None:
        match, score, _ = result
        matches[food_top] = match

In [None]:
top_n['matched_food'] = top_n['clean_label'].map(matches)

In [None]:
merged_df = top_n.merge(df_cf, left_on='matched_food', right_on='clean_label', suffixes=('_cons', '_cf'))

In [None]:
merged_df

In [None]:
# Manually fix the data
# Choose the correct coffee

coffee_rows_cf = df_cf[df_cf["clean_label"].str.contains("coffe", case=False, na=False)]

coffee_rows_cf


In [None]:
single_coffee_row = df_cf[df_cf["clean_label"] == "coffee drip filtered"]

single_coffee_row