# 21 – Índice nutricional de productos OFF (UE)

En este notebook se construye un índice nutricional sintético a partir del dataset:

- Entrada: `data_processed/openfoodfacts/openfoodfacts_eu_nutri.parquet`
- Cobertura: productos comercializados en la UE-27 (2015–2023) con 4 nutrientes clave completos.
- Nutrientes considerados (por 100 g):
  - `energy_100g`
  - `sugars_100g`
  - `saturated-fat_100g`
  - `sodium_100g`

Objetivos:

1. Explorar rangos básicos de estos 4 nutrientes.
2. Calcular percentiles 5 y 95 de cada nutriente (para evitar el efecto de valores extremos).
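3. Definir un índice relativo de "densidad de nutrientes críticos" en escala 0–100 (a mayor valor, mayor densidad de nutrientes críticos):
   - Se normaliza cada nutriente entre 0 y 1 usando los percentiles 5–95: el percentil 5 corresponde a 0, el percentil 95 a 1, y los valores fuera de ese rango se recortan a 0 o 1.
   - El índice es la media de los cuatro valores normalizados multiplicada por 100.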
4. Guardar un nuevo fichero Parquet con el índice por producto, que se usará en las agregaciones por país, categoría y año.

In [1]:
from pathlib import Path
import duckdb
import pandas as pd

# Project root: the parent of the directory one level above the working directory.
parent_of_cwd = Path("..").resolve()
ROOT_DIR = parent_of_cwd.parent

# Folder holding the processed Open Food Facts (OFF) data.
DATA_PROCESSED = ROOT_DIR.joinpath("data_processed", "openfoodfacts")

# EU-filtered dataset with complete nutrient values.
PARQUET_EU = DATA_PROCESSED.joinpath("openfoodfacts_eu_nutri.parquet")

# Display the resolved locations and whether the input file is present.
ROOT_DIR, PARQUET_EU.exists(), PARQUET_EU

(WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm'),
 True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/openfoodfacts/openfoodfacts_eu_nutri.parquet'))

In [2]:
# Open an in-memory DuckDB database; the parquet file is read directly by the queries.
con = duckdb.connect(database=":memory:")

# Query returning five rows with a handful of descriptive columns plus the four
# nutrient columns and the year.
preview_sql = f"""
    SELECT 
        code,
        product_name,
        countries,
        pnns_groups_1,
        pnns_groups_2,
        energy_100g,
        sugars_100g,
        "saturated-fat_100g",
        sodium_100g,
        year
    FROM read_parquet('{PARQUET_EU}')
    LIMIT 5
"""
preview = con.execute(preview_sql).fetchdf()

preview

Unnamed: 0,code,product_name,countries,pnns_groups_1,pnns_groups_2,energy_100g,sugars_100g,saturated-fat_100g,sodium_100g,year
0,215,Riso rosso,en:it,unknown,unknown,724.0,3.2,0.3,0.488,2022
1,369,Queso cabra,en:es,unknown,unknown,1406.0,0.12,16.6,0.76,2020
2,623,Amandes enrobées de chocolat,en:fr,unknown,unknown,2452.0,32.7,32.7,0.008,2019
3,633,Flocons d'epeautre,en:fr,unknown,unknown,1460.0,1.0,0.2,0.012,2020
4,789,Cordon bleu de volaille bio,en:fr,Fish Meat Eggs,Meat,979.0,0.6,2.8,0.56,2021


In [3]:
# Basic statistics (min, p05, p95, max, mean) for the four key nutrients.
#
# The per-nutrient SELECT is generated from a single list instead of being
# copied by hand four times, so adding or removing a nutrient is a one-line change.
# The labels are the parquet column names; they end up in the `nutrient` column
# of the result, which the following cells use as lookup keys.
nutrient_columns = ["energy_100g", "sugars_100g", "saturated-fat_100g", "sodium_100g"]


def _stats_branch(column: str) -> str:
    """Return the SELECT that yields the one-row summary for ``column``.

    Every branch carries the same output aliases (nutrient, min_val, p05, p95,
    max_val, mean_val); the UNION ALL takes its column names from the first one.
    """
    quoted = f'"{column}"'  # quoting is required for the hyphenated column name
    return f"""
    SELECT
        '{column}' AS nutrient,
        MIN({quoted}) AS min_val,
        QUANTILE({quoted}, 0.05) AS p05,
        QUANTILE({quoted}, 0.95) AS p95,
        MAX({quoted}) AS max_val,
        AVG({quoted}) AS mean_val
    FROM base"""


# Only the four nutrient columns are projected into the shared CTE.
base_columns = ", ".join(f'"{column}"' for column in nutrient_columns)
stats_sql = (
    f"WITH base AS (SELECT {base_columns} FROM read_parquet('{PARQUET_EU}'))"
    + "\n    UNION ALL".join(_stats_branch(column) for column in nutrient_columns)
)

# One row per nutrient, in the order of `nutrient_columns`.
# MIN/MAX/AVG are computed on unfiltered values, so extreme outliers show up here;
# only p05 and p95 are used to build the index.
stats_nutrientes = con.execute(stats_sql).fetchdf()

stats_nutrientes

Unnamed: 0,nutrient,min_val,p05,p95,max_val,mean_val
0,energy_100g,0.0,84.0,2452.0,5.95e+16,74628740000.0
1,sugars_100g,0.0,0.0,56.0,1350.0,12.4392
2,saturated-fat_100g,0.0,0.0,20.9,2376.0,5.503459
3,sodium_100g,0.0,0.0,1.612,1708.333,0.5394499


In [4]:
# Build the lookup {nutrient: {"p05": ..., "p95": ...}} from the statistics table,
# walking the three relevant columns in parallel.
quantiles = {}
for nutrient_label, lower_bound, upper_bound in zip(
    stats_nutrientes["nutrient"],
    stats_nutrientes["p05"],
    stats_nutrientes["p95"],
):
    quantiles[nutrient_label] = {"p05": lower_bound, "p95": upper_bound}

quantiles

{'energy_100g': {'p05': 84.0, 'p95': 2452.0},
 'sugars_100g': {'p05': 0.0, 'p95': 56.0},
 'saturated-fat_100g': {'p05': 0.0, 'p95': 20.9},
 'sodium_100g': {'p05': 0.0, 'p95': 1.612}}

In [5]:
# Unpack the (p05, p95) pair of every nutrient into flat variables that the
# index query interpolates.
bound_keys = ("p05", "p95")

e_p05, e_p95 = (quantiles["energy_100g"][key] for key in bound_keys)
s_p05, s_p95 = (quantiles["sugars_100g"][key] for key in bound_keys)
sf_p05, sf_p95 = (quantiles["saturated-fat_100g"][key] for key in bound_keys)
na_p05, na_p95 = (quantiles["sodium_100g"][key] for key in bound_keys)

e_p05, e_p95, s_p05, s_p95, sf_p05, sf_p95, na_p05, na_p95

(84.0, 2452.0, 0.0, 56.0, 0.0, 20.9, 0.0, 1.612)

In [6]:
# Output file: the input dataset plus one 0–1 score per nutrient and the 0–100 index.
PARQUET_EU_IDX = DATA_PROCESSED / "openfoodfacts_eu_nutri_index.parquet"

PARQUET_EU_IDX.parent.mkdir(parents=True, exist_ok=True)

# Score column -> (SQL column reference, lower bound p05, upper bound p95).
# The dict order is also the order in which the scores are summed.
score_specs = {
    "s_energy": ("energy_100g", e_p05, e_p95),
    "s_sugars": ("sugars_100g", s_p05, s_p95),
    "s_satfat": ('"saturated-fat_100g"', sf_p05, sf_p95),
    "s_sodium": ("sodium_100g", na_p05, na_p95),
}

# Fail fast on a degenerate range: with p95 <= p05 (or NaN bounds) the division
# below has no meaningful result and the written index would be silently wrong.
for score_name, (column_ref, lower, upper) in score_specs.items():
    if not upper > lower:
        raise ValueError(
            f"Invalid percentile range for {column_ref}: p05={lower}, p95={upper}"
        )

# Each score maps p05 -> 0 and p95 -> 1 and clips everything outside to [0, 1].
score_columns_sql = ",\n                ".join(
    f"GREATEST(0.0, LEAST(1.0, "
    f"({column_ref} - {lower}) / NULLIF({upper} - {lower}, 0)"
    f")) AS {score_name}"
    for score_name, (column_ref, lower, upper) in score_specs.items()
)

# The index averages the score columns computed once in the subquery, so the
# score columns and the index can never drift apart.
index_sum_sql = " + ".join(score_specs)
n_scores_sql = float(len(score_specs))

con.execute(f"""
    COPY (
        SELECT
            scored.*,
            -- Synthetic 0-100 index (higher = higher density of critical nutrients)
            100.0 * ({index_sum_sql}) / {n_scores_sql} AS off_nutrient_index
        FROM (
            SELECT
                *,
                {score_columns_sql}
            FROM read_parquet('{PARQUET_EU}')
        ) AS scored
    )
    TO '{PARQUET_EU_IDX}'
    (FORMAT PARQUET)
""")

PARQUET_EU_IDX.exists(), PARQUET_EU_IDX

(True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/openfoodfacts/openfoodfacts_eu_nutri_index.parquet'))

In [7]:
# Read back ten rows of the freshly written file to eyeball the index next to
# the nutrient values it was derived from.
sample_idx_sql = f"""
    SELECT 
        code,
        product_name,
        countries,
        pnns_groups_1,
        energy_100g,
        sugars_100g,
        "saturated-fat_100g",
        sodium_100g,
        off_nutrient_index
    FROM read_parquet('{PARQUET_EU_IDX}')
    LIMIT 10
"""
sample_idx = con.execute(sample_idx_sql).fetchdf()

sample_idx

Unnamed: 0,code,product_name,countries,pnns_groups_1,energy_100g,sugars_100g,saturated-fat_100g,sodium_100g,off_nutrient_index
0,215,Riso rosso,en:it,unknown,724.0,3.2,0.3,0.488,16.112418
1,369,Queso cabra,en:es,unknown,1406.0,0.12,16.6,0.76,45.653557
2,623,Amandes enrobées de chocolat,en:fr,unknown,2452.0,32.7,32.7,0.008,64.722284
3,633,Flocons d'epeautre,en:fr,unknown,1460.0,1.0,0.2,0.012,15.398794
4,789,Cordon bleu de volaille bio,en:fr,Fish Meat Eggs,979.0,0.6,2.8,0.56,21.750905
5,825,Cookie vegan,en:fr,unknown,2201.0,27.0,15.0,0.276,56.626637
6,827,Petit hoppelli noir,Belgium,Cereals and potatoes,2280.0,56.0,20.0,0.072,73.224192
7,956,Caña lomo bellota ibérica,en:es,unknown,1569.0,1.3,8.2,1.36,47.158568
8,110124160,Sucre neige,en:france,unknown,1753.0,91.2,0.0,0.0,42.620355
9,1144,Snacks de mais,en:france,unknown,1824.0,0.0,1.4,0.64,29.970132


In [8]:
# Summary of the index over the whole output file: row count, minimum, mean,
# median, 90th percentile and maximum.
res_stats_sql = f"""
    SELECT 
        COUNT(*) AS n_filas,
        MIN(off_nutrient_index) AS min_idx,
        AVG(off_nutrient_index) AS mean_idx,
        QUANTILE(off_nutrient_index, 0.5) AS med_idx,
        QUANTILE(off_nutrient_index, 0.9) AS p90_idx,
        MAX(off_nutrient_index) AS max_idx
    FROM read_parquet('{PARQUET_EU_IDX}')
"""
res_stats = con.execute(res_stats_sql).fetchdf()

res_stats

Unnamed: 0,n_filas,min_idx,mean_idx,med_idx,p90_idx,max_idx
0,797280,0.0,28.503433,27.340566,52.990833,99.736064


In [9]:
# Column of res_stats -> label shown in the summary table (display order).
stat_labels = {
    "n_filas": "Filas con índice",
    "min_idx": "Índice mínimo",
    "mean_idx": "Índice medio",
    "med_idx": "Índice mediano",
    "p90_idx": "Índice p90",
    "max_idx": "Índice máximo",
}

# The row count is cast to int; every index statistic is rounded to two decimals.
summary_values = []
for stat_column in stat_labels:
    raw_value = res_stats[stat_column][0]
    if stat_column == "n_filas":
        summary_values.append(int(raw_value))
    else:
        summary_values.append(round(float(raw_value), 2))

# Long-format (indicator, value) table for display.
resumen_index = pd.DataFrame({
    "indicador": list(stat_labels.values()),
    "valor": summary_values,
})

resumen_index

Unnamed: 0,indicador,valor
0,Filas con índice,797280.0
1,Índice mínimo,0.0
2,Índice medio,28.5
3,Índice mediano,27.34
4,Índice p90,52.99
5,Índice máximo,99.74
