In [1]:
import pandas as pd
from fuzzywuzzy import process, fuzz
import re
import matplotlib.pyplot as plt
import numpy as np
from rapidfuzz import fuzz
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

In [2]:
# Sentence-embedding model used to rank AGROVOC candidates by semantic similarity
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Client for the AGROVOC SPARQL endpoint, shared by every query below
AGROVOC_SPARQL_ENDPOINT = "http://agrovoc.fao.org/sparql"
sparql = SPARQLWrapper(AGROVOC_SPARQL_ENDPOINT)

def query_agrovoc(value):
    """Look up English AGROVOC preferred labels that contain ``value``.

    The label is matched literally and case-insensitively: regex
    metacharacters in ``value`` (e.g. the parentheses in
    "Coffee (average strength) beverage") are escaped so they are not
    interpreted as pattern syntax, and quotes/backslashes are escaped so
    they cannot break out of the SPARQL string literal.

    Parameters
    ----------
    value : str
        Free-text label to search for.

    Returns
    -------
    list[tuple[str, str]]
        ``(label, concept_uri)`` pairs, one per binding that has both
        fields. Empty when ``value`` is blank or not a string (e.g. NaN),
        in which case no query is sent.
    """
    # A blank pattern would match every label in the vocabulary, and a
    # non-string (NaN) would be searched as the text "nan"; skip both.
    if not isinstance(value, str) or not value.strip():
        return []

    # Step 1: neutralise regex metacharacters so the text is matched literally.
    pattern = re.sub(r'([\\.\[\]{}()*+?^$|-])', r'\\\1', value)
    # Step 2: escape for a double-quoted SPARQL string literal
    # (backslashes first, so the escapes added afterwards are not doubled).
    literal = (
        pattern
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
        .replace('\t', '\\t')
    )

    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?label WHERE {{
        ?concept skos:prefLabel ?label .
        FILTER(LANG(?label) = "en" && REGEX(?label, "{literal}", "i"))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    matches = []
    for result in results["results"]["bindings"]:
        label = result.get("label", {}).get("value")
        concept = result.get("concept", {}).get("value")
        # Keep only bindings that carry both a label and a concept URI.
        if label and concept:
            matches.append((label, concept))

    return matches


def best_match(value, candidates):
    """Pick the candidate whose label is semantically closest to ``value``.

    ``candidates`` is a sequence of ``(label, uri)`` pairs. The query text
    and every candidate label are embedded with the module-level sentence
    transformer, and the pair with the highest cosine similarity is
    returned. With no candidates the result is ``(None, None)``.
    """
    if candidates:
        query_vec = model.encode(value, convert_to_tensor=True)
        labels = [text for text, _uri in candidates]
        label_vecs = model.encode(labels, convert_to_tensor=True)
        # cos_sim returns a 1 x N matrix; row 0 holds the score of each candidate
        similarity = util.cos_sim(query_vec, label_vecs)[0]
        winner = similarity.argmax().item()
        return candidates[winner]
    return None, None


def int_df_agrovoc(df, column):
    """Annotate ``df`` with the best-matching AGROVOC label and URI per row.

    Each distinct value of ``df[column]`` is resolved once (SPARQL lookup
    via ``query_agrovoc`` followed by semantic ranking via ``best_match``)
    and the result is reused for repeated values, so the number of remote
    queries equals the number of distinct values rather than the number
    of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    column : str
        Name of the column holding the labels to look up. Values must be
        hashable (they are used as cache keys).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``AGROVOC_label`` and ``AGROVOC_uri``
        columns added (``None`` where nothing matched).
    """
    resolved = {}  # value -> (label, uri); avoids repeated remote lookups
    agrovoc_labels = []
    agrovoc_uris = []

    for val in df[column]:
        if val not in resolved:
            resolved[val] = best_match(val, query_agrovoc(val))
        best_label, best_uri = resolved[val]
        agrovoc_labels.append(best_label)
        agrovoc_uris.append(best_uri)

    df["AGROVOC_label"] = agrovoc_labels
    df["AGROVOC_uri"] = agrovoc_uris
    return df

In [3]:
# Load the food consumption data
df_consumption = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Load the SuEatableLife dataset.
# Passing a list of sheet names makes read_excel open and parse the workbook
# once and return a dict {sheet name: DataFrame}, instead of re-reading the
# file for every sheet.
sel_sheets = pd.read_excel(
    "data/sueatablelife_dataset.xlsx",
    sheet_name=["SEL WF for users", "SEL CF for users"],
)
df_wf = sel_sheets["SEL WF for users"]
df_cf = sel_sheets["SEL CF for users"]

In [4]:
# Preview the raw consumption table (the rendered output is truncated by pandas)
df_consumption

Unnamed: 0,Survey's country,Survey start year,Survey name,Population Group (L2),Exposure hierarchy (L1),Exposure hierarchy (L2),Exposure hierarchy (L3),Exposure hierarchy (L4),Exposure hierarchy (L5),Exposure hierarchy (L6),...,Number of consumers,Mean,Standard Deviation,5th percentile,10th percentile,Median,95th percentile,97.5th percentile,99th percentile,Comment
0,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Beer and beer-like beverage,Beer,Beer,Beer,Beer,...,9,8.40,79.01,0.0,0.0,0.0,0.00,7.88,165.00,"'Number of observations lower than 298, the 99..."
1,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Liqueurs,Liqueurs,Liqueurs,Liqueurs,...,2,0.00,0.01,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
2,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Unsweetened spirits,Spirits from fruit,Spirits made from fruits other than stone fruits,Spirits made from fruits other than stone fruits,...,1,0.05,0.75,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
3,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,Sparkling wine,Sparkling wine,Sparkling wine,...,1,0.08,1.28,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
4,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,"Wine, red","Wine, red","Wine, red",...,16,1.19,15.21,0.0,0.0,0.0,0.09,0.12,2.96,"'Number of observations lower than 298, the 99..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit",Soft drink with bitter principle,...,2,0.58,5.11,0.0,0.0,0.0,0.00,0.00,42.20,"'Number of observations lower than 180, the 97..."
1485,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, flavoured, no fruit",...,1,1.03,12.81,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1486,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, lemon flavour",...,1,0.90,11.21,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1487,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, with fruit juice (fruit content be...","Fruit soft drink, orange",...,2,0.91,8.14,0.0,0.0,0.0,0.00,0.00,64.95,"'Number of observations lower than 180, the 97..."


In [5]:
# Leave plain drinking water out of the ranking
water_items = ["Natural mineral water", "Tap water", "Filtered tap water"]
df_consumption = df_consumption[~df_consumption["Exposure hierarchy (L7)"].isin(water_items)]

# Sum the mean intake per L5 food category and keep the 15 largest totals
top_n = (
    df_consumption
    .groupby("Exposure hierarchy (L5)", as_index=False)["Mean"]
    .sum()
    .sort_values('Mean', ascending=False)
    .head(15)
)

top_n[["Exposure hierarchy (L5)", "Mean"]]

Unnamed: 0,Exposure hierarchy (L5),Mean
147,Cow milk,307.48
120,Coffee (average strength) beverage,175.78
8,Apples,134.45
524,Tomatoes,134.18
396,Potatoes,123.47
554,"Wheat bread and rolls, white (refined flour)",122.61
565,"Wine, red",108.29
95,Chicken fresh meat,99.33
177,Dried pasta,88.81
126,"Cola beverages, caffeinic",85.1


In [6]:
# Merge the two coffee categories into a single "Coffee" row whose numeric
# columns are the sum of the originals.
coffee_labels = ["Coffee (average strength) beverage", "Coffee espresso (beverage)"]
is_coffee = top_n["Exposure hierarchy (L5)"].isin(coffee_labels)
rows_to_merge = top_n[is_coffee]

# Guard against re-running this cell: once the rows are merged the selection
# is empty, and summing it would append a spurious "Coffee" row with Mean 0.
if not rows_to_merge.empty:
    merged_row = rows_to_merge.sum(numeric_only=True)
    merged_row["Exposure hierarchy (L5)"] = "Coffee"

    # Drop the original rows and append the merged row
    top_n = pd.concat([top_n[~is_coffee], pd.DataFrame([merged_row])], ignore_index=True)

In [11]:
# Re-rank by total mean intake (largest first) and renumber the rows from 0
top_n = top_n.sort_values(by="Mean", ascending=False, ignore_index=True)

top_n

Unnamed: 0,Exposure hierarchy (L5),Mean
0,Cow milk,307.48
1,Coffee,253.19
2,Apples,134.45
3,Tomatoes,134.18
4,Potatoes,123.47
5,"Wheat bread and rolls, white (refined flour)",122.61
6,"Wine, red",108.29
7,Chicken fresh meat,99.33
8,Dried pasta,88.81
9,"Cola beverages, caffeinic",85.1


In [24]:
def normalize_label(s):
    """Reduce a food label to a lowercase, punctuation-free key for matching.

    Steps, in order: lowercase and trim; turn ``-``/``_`` into spaces; drop
    bracketed text, digits and asterisks; replace remaining punctuation with
    spaces; remove uninformative attribute words (whole words only);
    collapse runs of whitespace and trim.

    Parameters
    ----------
    s : str
        Raw label. Any non-string value (e.g. NaN from an empty spreadsheet
        cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The normalized label, single-spaced with no leading/trailing blanks.
    """
    if not isinstance(s, str):
        return ""

    s = s.lower().strip().replace('-', ' ').replace('_', ' ')
    s = re.sub(r'\(.*?\)', '', s)            # remove text in brackets
    s = re.sub(r'\d+', '', s)                # remove numbers
    s = re.sub(r'\*', '', s)                 # remove asterisks
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)       # replace punctuation with a space
    # Remove uninformative attributes. \b keeps the match to whole words, so
    # e.g. "freshwater" is left intact.
    s = re.sub(r'\b(semi[- ]skimmed|organic|low[- ]fat)\b', '', s)
    s = re.sub(r'\b(fresh|caffeinic|common)\b', '', s)
    # Collapse whitespace only after all removals, so a word deleted from the
    # middle of a label does not leave a double space behind.
    s = re.sub(r'\s+', ' ', s)

    return s.strip()

In [25]:
# Add a normalized version of each L5 category name to use as the matching key
top_n = top_n.assign(clean_label=top_n["Exposure hierarchy (L5)"].map(normalize_label))

In [26]:
# Inspect the normalized labels next to the original category names
top_n

Unnamed: 0,Exposure hierarchy (L5),Mean,clean_label
0,Cow milk,307.48,cow milk
1,Coffee,253.19,coffee
2,Apples,134.45,apples
3,Tomatoes,134.18,tomatoes
4,Potatoes,123.47,potatoes
5,"Wheat bread and rolls, white (refined flour)",122.61,wheat bread and rolls white
6,"Wine, red",108.29,wine red
7,Chicken fresh meat,99.33,chicken meat
8,Dried pasta,88.81,dried pasta
9,"Cola beverages, caffeinic",85.1,cola beverages


In [30]:
# Give both SuEatableLife tables the same normalized matching key
for sel_table in (df_cf, df_wf):
    sel_table['clean_label'] = sel_table["Food commodity ITEM"].map(normalize_label)

In [40]:
# Fuzzy-match every consumption label to its closest carbon-footprint (CF)
# label. Labels for which extractOne finds nothing at the score cutoff of 70
# are left out of the mapping.
matches = {}
for consumed_label in top_n['clean_label']:
    best = process.extractOne(consumed_label, df_cf['clean_label'], score_cutoff=70)
    if best is None:
        continue
    matched_label, _score, _key = best
    matches[consumed_label] = matched_label

In [41]:
# Attach the matched CF label to each row (NaN where no match was recorded)
top_n = top_n.assign(matched_food=top_n['clean_label'].map(matches))

In [43]:
# Join the consumption ranking to the carbon-footprint table on the matched label.
# NOTE(review): the left frame is the consumption table and the right one is the
# CF table, so `clean_label_cf` holds the consumption label and `clean_label_wf`
# the CF label -- the suffixes look mislabeled; confirm before renaming, since
# later cells may rely on these column names.
merged_df = pd.merge(
    top_n,
    df_cf,
    how='inner',
    left_on='matched_food',
    right_on='clean_label',
    suffixes=('_cf', '_wf'),
)

In [44]:
# Inspect the joined table; a consumption row appears once per CF item sharing its matched label
merged_df

Unnamed: 0,Exposure hierarchy (L5),Mean,clean_label_cf,matched_food,FOOD COMMODITY GROUP,Food commodity ITEM,Carbon Footprint kg CO2eq/kg or l of food ITEM,Uncertainty low (L) high (H),Suggested CF value,Food commodity TYPOLOGY,Carbon Footprint g CO2eq/g o cc of food TYPOLOGY,Food commodity sub-TYPOLOGY,Carbon Footprint g CO2eq/g o cc of food sub-TYPOLOGY,clean_label_wf
0,Cow milk,307.48,cow milk,cow milk,ANIMAL HUSBANDRY,COW MILK,1.305,L,OK item,MILK,1.437,-,-,cow milk
1,Coffee,253.19,coffee,coffee ground,AGRICULTURAL PROCESSED,COFFEE GROUND,5.129286,H,better typology,COFFEE GROUND & PARCHMENT,6.71,-,-,coffee ground
2,Apples,134.45,apples,apple,CROPS,APPLE,0.2539,L,OK item,FRUIT OPENFIELD,0.4,FRUIT OTHER,0.32,apple
3,Tomatoes,134.18,tomatoes,tomato,CROPS,TOMATO (G),2.425,L,OK item,VEGETABLES HEATED GREENHOUSE,2.55,-,-,tomato
4,Tomatoes,134.18,tomatoes,tomato,CROPS,TOMATO (g),0.90855,L,OK item,VEGETABLES NOT HEATED GREENHOUSE,1.18,-,-,tomato
5,Tomatoes,134.18,tomatoes,tomato,CROPS,TOMATO,0.4541,L,OK item,VEGETABLES OPENFIELD,0.33,"VEGETABLES (fruits, steems, flowers)",0.33,tomato
6,Potatoes,123.47,potatoes,potato,CROPS,POTATO,0.24,L,OK item,STARCHY TUBERS,0.27,-,-,potato
7,"Wheat bread and rolls, white (refined flour)",122.61,wheat bread and rolls white,wheat,CROPS,WHEAT,0.5715,L,OK item,GRAINS,0.57,-,-,wheat
8,"Wine, red",108.29,wine red,wine red,AGRICULTURAL PROCESSED,WINE RED,0.873,L,OK item,WINE,0.66,-,-,wine red
9,Chicken fresh meat,99.33,chicken meat,buffalo bone free meat,ANIMAL HUSBANDRY,BUFFALO BONE FREE MEAT*,78.800175,H,Item matching typology,BUFFALO BONE FREE MEAT*,78.8,-,-,buffalo bone free meat
