In [1]:
import pandas as pd
from fuzzywuzzy import process, fuzz
import re
import matplotlib.pyplot as plt
import numpy as np
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer
import numpy as np

In [2]:
# Load the food consumption data
food_consumption_data = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Load the SuEatableLife dataset.
# Both sheets live in the same workbook, so request them in a single call:
# with a list for `sheet_name`, read_excel returns a dict keyed by sheet name
# and the file is opened and parsed only once.
sel_sheets = pd.read_excel(
    "data/sueatablelife_dataset.xlsx",
    sheet_name=["SEL WF for users", "SEL CF for users"],
)
df_wf = sel_sheets["SEL WF for users"]  # water-footprint sheet
df_cf = sel_sheets["SEL CF for users"]  # carbon-footprint sheet

In [3]:
# Function to normalize labels
def normalize_label(s):
    """Return a simplified, lower-case version of a food label for matching.

    Steps, in order: fold accented letters to their ASCII base letter,
    lower-case, turn hyphens/underscores into spaces, drop bracketed text,
    digits and asterisks, replace any remaining non-alphanumeric character
    with a space, remove the modifier words "semi skimmed", "organic" and
    "low fat", then collapse and trim whitespace.

    Parameters
    ----------
    s : object
        The raw label. Anything that is not a ``str`` (e.g. ``None`` or the
        float NaN pandas uses for empty cells) is treated as a missing label.

    Returns
    -------
    str
        The normalized label; the empty string for missing/non-string input.
    """
    # Local import so the function works without touching the import cell.
    import unicodedata

    # Empty spreadsheet cells arrive as NaN (a float); calling .lower() on
    # them would raise AttributeError, so map them to an empty label instead.
    if not isinstance(s, str):
        return ''

    # Decompose accented characters and drop the combining marks so that
    # e.g. "purée" becomes "puree" rather than losing the letter below.
    s = ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(ch)
    )

    s = s.lower().strip().replace('-', ' ').replace('_', ' ')
    # Remove bracketed text, numbers, asterisks and punctuation
    s = re.sub(r'\(.*?\)', '', s)  # Remove text in brackets
    s = re.sub(r'\d+', '', s)  # Remove numbers
    s = re.sub(r'\*', '', s)  # Remove asterisks
    s = re.sub(r'[^a-z0-9 ]+', ' ', s)       # drop punctuation
    s = re.sub(r'\b(semi[- ]skimmed|organic|low[- ]fat)\b', '', s)  # drop modifier words
    s = re.sub(r'\s+', ' ', s)               # collapse whitespace
    s = s.strip()  # Remove leading and trailing whitespace

    return s

In [4]:
# Give the consumption data's food-name column the same header that the
# SuEatableLife sheets use, so all three tables share one label column name.
food_consumption_data = food_consumption_data.rename(
    columns={"Exposure hierarchy (L7)": 'Food commodity ITEM'}
)

In [5]:
# One row per food item: add up the "Mean" values of all rows sharing a name
df_fcd_grouped = (
    food_consumption_data
    .groupby("Food commodity ITEM", as_index=False)["Mean"]
    .sum()
)

df_fcd_grouped

Unnamed: 0,Food commodity ITEM,Mean
0,Alcoholic sauce,0.03
1,Alcopop and flavoured wine,1.40
2,"Algae based formulations (e.g. Spirulina, chlo...",0.02
3,Almond drink,3.68
4,Almonds sweet,3.75
...,...,...
624,"Yoghurt drinks, including sweetened and/or fla...",3.76
625,"Yoghurt, cow milk",9.43
626,"Yoghurt, cow milk, flavoured",27.85
627,"Yoghurt, cow milk, plain",24.21


In [6]:
# keep only the relevant columns
# .copy() makes each subset an independent frame: a later cell adds a
# 'clean_label' column to these objects, and doing that on a bare column
# selection can raise pandas' SettingWithCopyWarning.
df_fcd_grouped = df_fcd_grouped[["Food commodity ITEM", "Mean"]].copy()
df_wf = df_wf[["Food commodity ITEM", "Water Footprint liters water/kg o liter of food ITEM"]].copy()
df_cf = df_cf[["Food commodity ITEM", "Carbon Footprint kg CO2eq/kg or l of food ITEM"]].copy()

In [7]:
# Display the carbon-footprint table after the column selection above
df_cf

Unnamed: 0,Food commodity ITEM,Carbon Footprint kg CO2eq/kg or l of food ITEM
0,BEER IN CAN,0.693000
1,BEER IN GLASS,0.960500
2,BEER MODULAR CAN,0.530600
3,CHOCOLATE OR CREAM FILLED COOKIES**,1.533000
4,SIMPLE COOKIES**,1.390750
...,...,...
319,OCTOPUS,5.800000
320,SQUID,6.910000
321,PRAWNS/SHRIMPS,7.040000
322,OCTOPUS (F),7.878611


In [8]:
# Add a 'clean_label' column (the normalized food name) to each of the three tables
for frame in (df_cf, df_wf, df_fcd_grouped):
    frame['clean_label'] = frame["Food commodity ITEM"].map(normalize_label)

In [9]:
# Stack the normalized labels of all three tables and keep each distinct
# label once (first occurrence wins, so carbon-footprint labels come first).
stacked_labels = pd.concat(
    [frame['clean_label'] for frame in (df_cf, df_wf, df_fcd_grouped)]
)
all_labels = stacked_labels.drop_duplicates().reset_index(drop=True)

# Master table: one row per distinct normalized label
master = pd.DataFrame({'clean_label': all_labels})

master


Unnamed: 0,clean_label
0,beer in can
1,beer in glass
2,beer modular can
3,chocolate or cream filled cookies
4,simple cookies
...,...
1051,yoghurt drinks including sweetened and or flav...
1052,yoghurt cow milk
1053,yoghurt cow milk flavoured
1054,yoghurt cow milk plain


In [10]:
# Greedy fuzzy clustering: each not-yet-assigned label becomes the seed of a
# new group and absorbs every other unassigned label whose token_set_ratio
# against the seed reaches the threshold.
threshold = 90
clusters = []        # list of lists of labels
used = set()         # labels already placed in some group

candidates = master['clean_label'].tolist()

for seed in candidates:
    if seed in used:
        continue
    used.add(seed)
    group = [seed]
    for other in candidates:
        if other in used:
            continue
        if fuzz.token_set_ratio(seed, other) >= threshold:
            group.append(other)
            used.add(other)
    clusters.append(group)


In [11]:
# Inspect the fuzzy-matching result: a list of groups, each a list of labels
print(clusters)

[['beer in can', 'beer'], ['beer in glass'], ['beer modular can'], ['chocolate or cream filled cookies', 'chocolate', 'cream', 'filled chocolate'], ['simple cookies'], ['mineral water', 'natural mineral water'], ['bread multicereal', 'bread'], ['bread plain'], ['bread whole'], ['bread frozen'], ['cornflakes'], ['dark chocolate'], ['milk chocolate'], ['coffee ground'], ['coffee parchment'], ['coffee drip filtered'], ['coffee soluble powder', 'coffee soluble'], ['espresso', 'coffee espresso'], ['flavored crackers'], ['plain crackers'], ['wholegrain crackers'], ['crispbread'], ['ketchup', 'tomato ketchup and related sauces'], ['mayonnaise', 'mayonnaise sauce'], ['tempe'], ['egg pasta', 'pasta', 'dried egg pasta', 'fresh egg pasta', 'fresh stuffed egg pasta'], ['raspberries'], ['apple juice', 'apple', 'juice apple'], ['blueberry juice', 'blueberry'], ['cranberry juice', 'cranberry'], ['fig juice', 'fig'], ['kiwi juice', 'kiwi'], ['mango juice', 'mango'], ['pineapple juice', 'pineapple', 'j

In [12]:
# Second attempt: embed every label with a sentence-transformer model so that
# grouping can be based on embedding similarity instead of token overlap.
labels = list(master["clean_label"])
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")  # Or use multilingual for non-English labels
embeddings = model.encode(labels, convert_to_tensor=True)

In [13]:
import torch
from sentence_transformers.util import cos_sim

# Cosine similarity of the label embeddings against themselves: `embeddings`
# is passed for both arguments, so every label is compared with every label.
similarity_matrix = cos_sim(embeddings, embeddings)

In [14]:
def build_small_clusters(sim_matrix, threshold=0.85):
    """Greedily group row indices of a square similarity matrix.

    Indices are scanned in order. Each index not yet assigned opens a new
    cluster and pulls in every later, still-unassigned index whose similarity
    to it is strictly greater than ``threshold``. Members are compared with
    the cluster's first index only, never with each other.

    Parameters
    ----------
    sim_matrix : 2-D array-like exposing ``.shape`` and ``[i][j]`` indexing
        Pairwise similarity scores.
    threshold : float, default 0.85
        Similarity a candidate must exceed to join a cluster.

    Returns
    -------
    list[list[int]]
        Clusters as lists of row indices; every index appears exactly once.
    """
    total = sim_matrix.shape[0]
    assigned = set()
    result = []

    for anchor in range(total):
        if anchor in assigned:
            continue
        assigned.add(anchor)
        members = [anchor]
        anchor_row = sim_matrix[anchor]
        for candidate in range(anchor + 1, total):
            # Cheap membership test first; only then look at the score.
            if candidate in assigned:
                continue
            if anchor_row[candidate] > threshold:
                members.append(candidate)
                assigned.add(candidate)
        result.append(members)

    return result

# Cluster on embedding similarity. NOTE: this rebinds `clusters`; from here on
# it holds lists of integer positions into `labels`, not lists of label strings.
clusters = build_small_clusters(similarity_matrix, threshold=0.85)

In [15]:
# Print every cluster with a 1-based number, translating indices back to labels
for number, members in enumerate(clusters, start=1):
    print(f"Cluster {number}:")
    print([labels[position] for position in members])
    print("------")

Cluster 1:
['beer in can']
------
Cluster 2:
['beer in glass', 'beer']
------
Cluster 3:
['beer modular can']
------
Cluster 4:
['chocolate or cream filled cookies']
------
Cluster 5:
['simple cookies']
------
Cluster 6:
['mineral water', 'natural mineral water']
------
Cluster 7:
['bread multicereal']
------
Cluster 8:
['bread plain', 'bread']
------
Cluster 9:
['bread whole']
------
Cluster 10:
['bread frozen']
------
Cluster 11:
['cornflakes']
------
Cluster 12:
['dark chocolate', 'chocolate', 'bitter sweet chocolate', 'white chocolate']
------
Cluster 13:
['milk chocolate']
------
Cluster 14:
['coffee ground']
------
Cluster 15:
['coffee parchment']
------
Cluster 16:
['coffee drip filtered']
------
Cluster 17:
['coffee soluble powder', 'coffee soluble', 'coffee beverage preparation powder', 'instant coffee powder']
------
Cluster 18:
['espresso']
------
Cluster 19:
['flavored crackers']
------
Cluster 20:
['plain crackers']
------
Cluster 21:
['wholegrain crackers']
------
Cluster

In [16]:
# Map every label to its cluster's representative: the label at the first
# index of the cluster it belongs to.
cluster_map = {
    labels[position]: labels[members[0]]
    for members in clusters
    for position in members
}

# Attach the representative to the master table as 'standard_label'
master["standard_label"] = master["clean_label"].map(cluster_map)

master

Unnamed: 0,clean_label,standard_label
0,beer in can,beer in can
1,beer in glass,beer in glass
2,beer modular can,beer modular can
3,chocolate or cream filled cookies,chocolate or cream filled cookies
4,simple cookies,simple cookies
...,...,...
1051,yoghurt drinks including sweetened and or flav...,yoghurt drinks including sweetened and or flav...
1052,yoghurt cow milk,yoghurt cow milk
1053,yoghurt cow milk flavoured,yoghurt cow milk
1054,yoghurt cow milk plain,yoghurt cow milk


In [17]:
# Keep only relevant columns from master
label_map = master[["clean_label", "standard_label"]].drop_duplicates()


def _with_standard_label(frame):
    """Return `frame` with a 'standard_label' column looked up via 'clean_label'.

    Any 'standard_label' column already present is dropped first, so running
    this cell again does not produce 'standard_label_x' / 'standard_label_y'
    columns (which would break the later merge on 'standard_label').
    `validate="many_to_one"` makes pandas raise if `label_map` ever holds more
    than one row per clean label, instead of silently duplicating rows.
    """
    return (
        frame
        .drop(columns="standard_label", errors="ignore")
        .merge(label_map, on="clean_label", how="left", validate="many_to_one")
    )


# Map each dataset's clean_label to standard_label
df_cf = _with_standard_label(df_cf)
df_wf = _with_standard_label(df_wf)
df_fcd_grouped = _with_standard_label(df_fcd_grouped)

In [18]:
from functools import reduce


def _outer_join_on_standard_label(left, right):
    """Outer-join two tables on their shared 'standard_label' column."""
    return pd.merge(left, right, on='standard_label', how='outer')


# Tables to combine, in join order: carbon footprint, water footprint, consumption
dfs = [df_cf, df_wf, df_fcd_grouped]

# Fold the list into one table keyed by 'standard_label'.
# NOTE: 'standard_label' is not unique within each table, so a label shared by
# several rows on both sides yields every combination of those rows.
merged = reduce(_outer_join_on_standard_label, dfs)

In [19]:
# Preview the first ten rows of the merged table
merged.head(10)

Unnamed: 0,Food commodity ITEM_x,Carbon Footprint kg CO2eq/kg or l of food ITEM,clean_label_x,standard_label,Food commodity ITEM_y,Water Footprint liters water/kg o liter of food ITEM,clean_label_y,Food commodity ITEM,Mean,clean_label
0,,,,alcoholic sauce,,,,Alcoholic sauce,0.03,alcoholic sauce
1,,,,alcopop and flavoured wine,,,,Alcopop and flavoured wine,1.4,alcopop and flavoured wine
2,ALFONSINO,3.49,alfonsino,alfonsino,,,,,,
3,,,,algae based formulations,,,,"Algae based formulations (e.g. Spirulina, chlo...",0.02,algae based formulations
4,ALMOND,1.88,almond,almond,ALMOND WITH SHELL,8047.0,almond with shell,Almonds sweet,3.75,almonds sweet
5,ALMOND COVERED WITH CHOCOLATE,4.8,almond covered with chocolate,almond covered with chocolate,,,,,,
6,,,,almond drink,,,,Almond drink,3.68,almond drink
7,ALMOND MILK,0.417903,almond milk,almond milk,,,,,,
8,ALMOND PASTE,2.98,almond paste,almond paste,,,,,,
9,,,,almond shelled,ALMOND SHELLED,16095.0,almond shelled,,,


In [20]:
# Keep a row only if none of its columns is missing, i.e. the food has
# carbon-footprint, water-footprint and consumption values.
complete_rows = merged.dropna(axis=0, how="any")

In [21]:
# Display the rows of `merged` that have no missing values
complete_rows

Unnamed: 0,Food commodity ITEM_x,Carbon Footprint kg CO2eq/kg or l of food ITEM,clean_label_x,standard_label,Food commodity ITEM_y,Water Footprint liters water/kg o liter of food ITEM,clean_label_y,Food commodity ITEM,Mean,clean_label
4,ALMOND,1.880000,almond,almond,ALMOND WITH SHELL,8047.0,almond with shell,Almonds sweet,3.75,almonds sweet
16,APPLE JUICE (I),2.840000,apple juice,apple juice,APPLE JUICE,1140.0,apple juice,"Juice, apple",0.65,juice apple
19,APRICOT,0.296000,apricot,apricot,APRICOT,1287.0,apricot,Apricots,24.89,apricots
21,ARTICHOKE,0.330000,artichoke,artichoke,ARTICHOKE,818.0,artichoke,Globe artichokes,7.85,globe artichokes
23,ASPARAGUS,0.830000,asparagus,asparagus,ASPARAGUS*,2150.0,asparagus,Asparagus,1.03,asparagus
...,...,...,...,...,...,...,...,...,...,...
1032,WATERMELON,0.520000,watermelon,watermelon,WATERMELON,185.0,watermelon,Watermelons,39.34,watermelons
1042,WHEAT PLAIN FLOUR,0.520000,wheat plain flour,wheat plain flour,WHEAT FLOUR,1849.0,wheat flour,Wheat flour white,9.63,wheat flour white
1043,WHEAT PLAIN FLOUR,0.520000,wheat plain flour,wheat plain flour,WHEAT FLOUR,1849.0,wheat flour,Wheat wholemeal flour,0.05,wheat wholemeal flour
1044,WHOLE WHEAT FLOUR,0.364716,whole wheat flour,wheat plain flour,WHEAT FLOUR,1849.0,wheat flour,Wheat flour white,9.63,wheat flour white


In [None]:
# plot the top 15 consumed products

