# 11 – ETL de OpenFoodFacts para la Unión Europea

En este notebook se parte del subconjunto procesado de OpenFoodFacts:

- Entrada: `data_processed/openfoodfacts/openfoodfacts_subset.parquet`
- Origen último: `en.openfoodfacts.org.products.csv` (~11 GB)
- Contenido actual: ~4,21 millones de productos y 23 variables.

**Objetivo de este ETL**

Generar un dataset filtrado y listo para análisis, que cumpla:

1. Solo productos asociados a países de la **Unión Europea (UE-27)**.
2. Periodo temporal aproximado **2015–2023**, según `last_modified_datetime`.
3. Productos con los **4 nutrientes clave** del perfil nutricional informados:
   - `energy_100g`
   - `sugars_100g`
   - `saturated-fat_100g`
   - `sodium_100g`
4. Añadir variables derivadas:
   - `country_iso2` y `country_name` para el país de la UE.
   - `year` (año de última modificación).

El resultado se guardará como:

`data_processed/openfoodfacts/openfoodfacts_eu_nutri.parquet`

In [1]:
from pathlib import Path
import duckdb
import pandas as pd

# Carpeta raíz del proyecto
ROOT_DIR = Path("..").resolve().parent

# Carpeta con datos procesados de OFF
DATA_PROCESSED = ROOT_DIR / "data_processed" / "openfoodfacts"

# Fichero de entrada (subconjunto del pre-ETL)
PARQUET_BASE = DATA_PROCESSED / "openfoodfacts_subset.parquet"

# Fichero de salida (dataset filtrado Unión Europea + nutrientes completos)
PARQUET_EU = DATA_PROCESSED / "openfoodfacts_eu_nutri.parquet"

ROOT_DIR, PARQUET_BASE.exists(), PARQUET_BASE

(WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm'),
 True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/openfoodfacts/openfoodfacts_subset.parquet'))

In [2]:
# Conexión en memoria y vista rápida del subconjunto base
con = duckdb.connect(database=":memory:")

preview = con.execute(f"""
    SELECT *
    FROM read_parquet('{PARQUET_BASE}')
    LIMIT 5
""").fetchdf()

preview

Unnamed: 0,code,product_name,brands,countries,countries_tags,categories,categories_tags,pnns_groups_1,pnns_groups_2,quantity,...,saturated-fat_100g,sodium_100g,salt_100g,fat_100g,carbohydrates_100g,fiber_100g,proteins_100g,nova_group,created_datetime,last_modified_datetime
0,2,,,en:Germany,en:germany,,,,,,...,,,,,,,,,2025-10-19 03:13:03-05:00,2025-10-19 03:13:06-05:00
1,3,,,en:France,en:france,,,,,,...,,,,,,,,,2025-07-14 04:29:48-05:00,2025-07-14 04:29:49-05:00
2,4,,,en:france,en:france,,,,,,...,,,,,,,,,2025-11-19 05:35:27-05:00,2025-11-19 05:55:02-05:00
3,5,,,en:France,en:france,,,,,,...,,,,,,,,,2025-08-04 08:27:01-05:00,2025-08-04 08:27:03-05:00
4,6,,,en:Germany,en:germany,,,,,,...,,,,,,,,,2025-10-11 15:02:55-05:00,2025-10-11 16:42:10-05:00


In [3]:
# Lista de tags para los países UE-27
eu_tags = [
    "en:belgium",
    "en:bulgaria",
    "en:czech-republic",   # si luego vemos 'en:czechia', se añade
    "en:denmark",
    "en:germany",
    "en:estonia",
    "en:ireland",
    "en:greece",
    "en:spain",
    "en:france",
    "en:croatia",
    "en:italy",
    "en:cyprus",
    "en:latvia",
    "en:lithuania",
    "en:luxembourg",
    "en:hungary",
    "en:malta",
    "en:netherlands",
    "en:austria",
    "en:poland",
    "en:portugal",
    "en:romania",
    "en:slovenia",
    "en:slovakia",
    "en:finland",
    "en:sweden",
]

# Construimos una condición SQL del tipo:
# countries_tags ILIKE '%en:france%' OR countries_tags ILIKE '%en:spain%' ...
eu_condition = " OR ".join(
    [f"countries_tags ILIKE '%{tag}%'" for tag in eu_tags]
)

eu_condition

"countries_tags ILIKE '%en:belgium%' OR countries_tags ILIKE '%en:bulgaria%' OR countries_tags ILIKE '%en:czech-republic%' OR countries_tags ILIKE '%en:denmark%' OR countries_tags ILIKE '%en:germany%' OR countries_tags ILIKE '%en:estonia%' OR countries_tags ILIKE '%en:ireland%' OR countries_tags ILIKE '%en:greece%' OR countries_tags ILIKE '%en:spain%' OR countries_tags ILIKE '%en:france%' OR countries_tags ILIKE '%en:croatia%' OR countries_tags ILIKE '%en:italy%' OR countries_tags ILIKE '%en:cyprus%' OR countries_tags ILIKE '%en:latvia%' OR countries_tags ILIKE '%en:lithuania%' OR countries_tags ILIKE '%en:luxembourg%' OR countries_tags ILIKE '%en:hungary%' OR countries_tags ILIKE '%en:malta%' OR countries_tags ILIKE '%en:netherlands%' OR countries_tags ILIKE '%en:austria%' OR countries_tags ILIKE '%en:poland%' OR countries_tags ILIKE '%en:portugal%' OR countries_tags ILIKE '%en:romania%' OR countries_tags ILIKE '%en:slovenia%' OR countries_tags ILIKE '%en:slovakia%' OR countries_tags 

In [4]:
# Cuántos productos del subconjunto base tienen algún país UE en countries_tags
counts_eu = con.execute(f"""
    SELECT 
        COUNT(*) AS n_total_base,
        SUM( ({eu_condition})::INT ) AS n_con_paises_ue
    FROM read_parquet('{PARQUET_BASE}')
""").fetchdf()

counts_eu

Unnamed: 0,n_total_base,n_con_paises_ue
0,4210261,2541674.0


In [5]:
n_total_base = counts_eu["n_total_base"][0]
n_con_ue = counts_eu["n_con_paises_ue"][0]
porc_ue = n_con_ue / n_total_base * 100

n_total_base, n_con_ue, round(porc_ue, 2)

(np.int64(4210261), np.float64(2541674.0), np.float64(60.37))

In [6]:
# Ver rango temporal del subconjunto base
rango_fechas = con.execute(f"""
    SELECT 
        MIN(last_modified_datetime) AS min_fecha,
        MAX(last_modified_datetime) AS max_fecha
    FROM read_parquet('{PARQUET_BASE}')
""").fetchdf()

rango_fechas

Unnamed: 0,min_fecha,max_fecha
0,2012-11-22 05:52:50-05:00,2025-12-09 02:45:08-05:00


In [7]:
# Filtros de nutrientes clave
nutr_filters = """
    energy_100g IS NOT NULL
    AND sugars_100g IS NOT NULL
    AND "saturated-fat_100g" IS NOT NULL
    AND sodium_100g IS NOT NULL
"""

stats_filtrado = con.execute(f"""
    SELECT
        COUNT(*) AS n_total_base,
        SUM( ({eu_condition})::INT ) AS n_con_paises_ue,
        SUM( ( {eu_condition} AND {nutr_filters} )::INT ) AS n_ue_4nutri,
        SUM( ( {eu_condition}
               AND {nutr_filters}
               AND last_modified_datetime >= TIMESTAMP '2015-01-01'
               AND last_modified_datetime <  TIMESTAMP '2024-01-01'
             )::INT ) AS n_ue_4nutri_2015_2023
    FROM read_parquet('{PARQUET_BASE}')
""").fetchdf()

stats_filtrado

Unnamed: 0,n_total_base,n_con_paises_ue,n_ue_4nutri,n_ue_4nutri_2015_2023
0,4210261,2541674.0,2533216.0,2522740.0


In [8]:
row = stats_filtrado.iloc[0]

n_total_base = row["n_total_base"]
n_con_paises_ue = row["n_con_paises_ue"]
n_ue_4nutri = row["n_ue_4nutri"]
n_ue_4nutri_2015_2023 = row["n_ue_4nutri_2015_2023"]

porc_ue = n_con_paises_ue / n_total_base * 100
porc_ue_4nutri = n_ue_4nutri / n_total_base * 100
porc_ue_4nutri_2015_2023 = n_ue_4nutri_2015_2023 / n_total_base * 100

n_total_base, n_con_paises_ue, round(porc_ue, 2), \
n_ue_4nutri, round(porc_ue_4nutri, 2), \
n_ue_4nutri_2015_2023, round(porc_ue_4nutri_2015_2023, 2)

(np.float64(4210261.0),
 np.float64(2541674.0),
 np.float64(60.37),
 np.float64(2533216.0),
 np.float64(60.17),
 np.float64(2522740.0),
 np.float64(59.92))

In [9]:
# Creamos el dataset filtrado: Unión Europea + 4 nutrientes completos + periodo 2015–2023
PARQUET_EU.parent.mkdir(parents=True, exist_ok=True)

con.execute(f"""
    COPY (
        SELECT
            *,
            EXTRACT(YEAR FROM last_modified_datetime) AS year
        FROM read_parquet('{PARQUET_BASE}')
        WHERE ({eu_condition})
          AND {nutr_filters}
          AND last_modified_datetime >= TIMESTAMP '2015-01-01'
          AND last_modified_datetime <  TIMESTAMP '2024-01-01'
    )
    TO '{PARQUET_EU}'
    (FORMAT PARQUET)
""")

PARQUET_EU.exists(), PARQUET_EU

(True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/openfoodfacts/openfoodfacts_eu_nutri.parquet'))

In [10]:
off_eu_sample = con.execute(f"""
    SELECT 
        code,
        product_name,
        countries,
        pnns_groups_1,
        pnns_groups_2,
        energy_100g,
        sugars_100g,
        "saturated-fat_100g",
        sodium_100g,
        proteins_100g,
        last_modified_datetime
    FROM read_parquet('{PARQUET_EU}')
    LIMIT 10
""").fetchdf()

off_eu_sample

Unnamed: 0,code,product_name,countries,pnns_groups_1,pnns_groups_2,energy_100g,sugars_100g,saturated-fat_100g,sodium_100g,proteins_100g,last_modified_datetime
0,215,Riso rosso,en:it,unknown,unknown,724.0,3.2,0.3,0.488,8.8,2022-08-01 05:19:15-05:00
1,369,Queso cabra,en:es,unknown,unknown,1406.0,0.12,16.6,0.76,19.7,2020-06-29 11:56:19-05:00
2,623,Amandes enrobées de chocolat,en:fr,unknown,unknown,2452.0,32.7,32.7,0.008,9.3,2019-07-20 06:38:17-05:00
3,633,Flocons d'epeautre,en:fr,unknown,unknown,1460.0,1.0,0.2,0.012,13.3,2020-08-01 05:33:20-05:00
4,789,Cordon bleu de volaille bio,en:fr,Fish Meat Eggs,Meat,979.0,0.6,2.8,0.56,13.0,2021-01-27 15:40:58-05:00
5,825,Cookie vegan,en:fr,unknown,unknown,2201.0,27.0,15.0,0.276,8.2,2019-12-19 13:10:45-05:00
6,827,Petit hoppelli noir,Belgium,Cereals and potatoes,Cereals,2280.0,56.0,20.0,0.072,6.1,2021-04-04 11:01:12-05:00
7,956,Caña lomo bellota ibérica,en:es,unknown,unknown,1569.0,1.3,8.2,1.36,35.0,2019-12-31 04:45:03-05:00
8,110124160,Sucre neige,en:france,unknown,unknown,1753.0,91.2,0.0,0.0,0.0,2019-01-25 12:18:44-05:00
9,1144,Snacks de mais,en:france,unknown,unknown,1824.0,0.0,1.4,0.64,0.0,2023-04-29 00:13:12-05:00


In [11]:
resumen_filtrado = pd.DataFrame({
    "indicador": [
        "Filas totales (subconjunto base)",
        "Filas con países UE (countries_tags)",
        "Filas UE con 4 nutrientes clave",
        "Filas UE con 4 nutrientes (2015–2023)",
        "% UE sobre total",
        "% UE con 4 nutrientes",
        "% UE 4 nutrientes (2015–2023)",
    ],
    "valor": [
        int(n_total_base),
        int(n_con_paises_ue),
        int(n_ue_4nutri),
        int(n_ue_4nutri_2015_2023),
        round(porc_ue, 2),
        round(porc_ue_4nutri, 2),
        round(porc_ue_4nutri_2015_2023, 2),
    ],
})

resumen_filtrado

Unnamed: 0,indicador,valor
0,Filas totales (subconjunto base),4210261.0
1,Filas con países UE (countries_tags),2541674.0
2,Filas UE con 4 nutrientes clave,2533216.0
3,Filas UE con 4 nutrientes (2015–2023),2522740.0
4,% UE sobre total,60.37
5,% UE con 4 nutrientes,60.17
6,% UE 4 nutrientes (2015–2023),59.92
