# 21 – Panel de indicadores UE

En este notebook se construye una tabla integrada (`panel_coherencia_ue.parquet`)
que combina, a nivel **país–año**, la información procedente de:

- **OpenFoodFacts (OFF)**: índice sintético de nutrientes críticos.
- **Eurostat – HICP**: Índice armonizado de precios al consumo.
- **Eurostat – HBS**: Estructura del gasto de los hogares.
- **FAOSTAT – CoAHD**: Coste y asequibilidad de una dieta saludable.

El resultado será la base de datos que alimentará el cuadro de mando.

In [1]:
from pathlib import Path
import duckdb
import pandas as pd

# Carpeta raíz del proyecto (misma lógica que en el resto de notebooks)
ROOT_DIR = Path("..").resolve().parent

DATA_PROCESSED = ROOT_DIR / "data_processed"

OFF_PARQUET     = DATA_PROCESSED / "openfoodfacts" / "openfoodfacts_eu_nutri_index.parquet"
HICP_PARQUET    = DATA_PROCESSED / "eurostat"     / "hicp_eu27_2015_2023.parquet"
HBS_PARQUET     = DATA_PROCESSED / "eurostat"     / "hbs_eu27_long.parquet"
FAOSTAT_PARQUET = DATA_PROCESSED / "faostat"      / "faostat_cahd_eu27_2017_2023.parquet"

PANEL_DIR     = DATA_PROCESSED / "panel"
PANEL_PARQUET = PANEL_DIR / "panel_coherencia_ue.parquet"

PANEL_DIR.mkdir(parents=True, exist_ok=True)

ROOT_DIR, OFF_PARQUET.exists(), HICP_PARQUET.exists(), HBS_PARQUET.exists(), FAOSTAT_PARQUET.exists(), PANEL_PARQUET

(WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm'),
 True,
 True,
 True,
 True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/panel/panel_coherencia_ue.parquet'))

In [2]:
# Conexión en memoria a DuckDB
con = duckdb.connect(database=":memory:")

con

<_duckdb.DuckDBPyConnection at 0x28d9ceb6230>

In [3]:
hbs_units = con.execute(f"""
    SELECT DISTINCT unit
    FROM read_parquet('{HBS_PARQUET}')
    ORDER BY unit
""").fetchdf()

hbs_units

Unnamed: 0,unit
0,PM


In [4]:
con.execute(f"""
    COPY (
        WITH
        -- 1) OFF: índice nutricional por país-año -----------------------------
        off_base AS (
            SELECT
                CASE
                    WHEN countries_tags ILIKE '%en:austria%'   THEN 'AT'
                    WHEN countries_tags ILIKE '%en:belgium%'   THEN 'BE'
                    WHEN countries_tags ILIKE '%en:bulgaria%'  THEN 'BG'
                    WHEN countries_tags ILIKE '%en:croatia%'   THEN 'HR'
                    WHEN countries_tags ILIKE '%en:cyprus%'    THEN 'CY'
                    WHEN countries_tags ILIKE '%en:czech%'     THEN 'CZ'
                    WHEN countries_tags ILIKE '%en:denmark%'   THEN 'DK'
                    WHEN countries_tags ILIKE '%en:estonia%'   THEN 'EE'
                    WHEN countries_tags ILIKE '%en:finland%'   THEN 'FI'
                    WHEN countries_tags ILIKE '%en:france%'    THEN 'FR'
                    WHEN countries_tags ILIKE '%en:germany%'   THEN 'DE'
                    WHEN countries_tags ILIKE '%en:greece%'    THEN 'EL'
                    WHEN countries_tags ILIKE '%en:hungary%'   THEN 'HU'
                    WHEN countries_tags ILIKE '%en:ireland%'   THEN 'IE'
                    WHEN countries_tags ILIKE '%en:italy%'     THEN 'IT'
                    WHEN countries_tags ILIKE '%en:latvia%'    THEN 'LV'
                    WHEN countries_tags ILIKE '%en:lithuania%' THEN 'LT'
                    WHEN countries_tags ILIKE '%en:luxembourg%' THEN 'LU'
                    WHEN countries_tags ILIKE '%en:malta%'     THEN 'MT'
                    WHEN countries_tags ILIKE '%en:netherlands%' THEN 'NL'
                    WHEN countries_tags ILIKE '%en:poland%'    THEN 'PL'
                    WHEN countries_tags ILIKE '%en:portugal%'  THEN 'PT'
                    WHEN countries_tags ILIKE '%en:romania%'   THEN 'RO'
                    WHEN countries_tags ILIKE '%en:slovakia%'  THEN 'SK'
                    WHEN countries_tags ILIKE '%en:slovenia%'  THEN 'SI'
                    WHEN countries_tags ILIKE '%en:spain%'     THEN 'ES'
                    WHEN countries_tags ILIKE '%en:sweden%'    THEN 'SE'
                END AS geo,
                year,
                off_nutrient_index,
                s_energy,
                s_sugars,
                s_satfat,
                s_sodium
            FROM read_parquet('{OFF_PARQUET}')
        ),
        off_agg AS (
            SELECT
                geo,
                year,
                AVG(off_nutrient_index) AS off_index_mean,
                100.0 * AVG( (off_nutrient_index >= 50)::DOUBLE ) AS share_high_off,
                AVG(s_energy)  AS s_energy_mean,
                AVG(s_sugars)  AS s_sugars_mean,
                AVG(s_satfat)  AS s_satfat_mean,
                AVG(s_sodium)  AS s_sodium_mean
            FROM off_base
            WHERE geo IS NOT NULL
            GROUP BY geo, year
        ),

        -- 2) HICP: índices de precios por país-año ---------------------------
        hicp_agg AS (
            SELECT
                geo,
                year,
                MAX(CASE WHEN coicop = 'CP00' THEN hicp_value END) AS hicp_total,
                MAX(CASE WHEN coicop = 'CP01' THEN hicp_value END) AS hicp_food
            FROM read_parquet('{HICP_PARQUET}')
            WHERE year BETWEEN 2017 AND 2023
            GROUP BY geo, year
        ),

        -- 3) HBS: estructura del gasto alimentario ---------------------------
        hbs_food AS (
            SELECT
                geo,
                year,
                SUM(hbs_share) AS food_budget_share
            FROM read_parquet('{HBS_PARQUET}')
            WHERE unit = 'PM'             -- % del gasto total
              AND coicop LIKE 'CP01%'     -- alimentación + bebidas no alcohólicas
            GROUP BY geo, year
        ),
        hbs_latest AS (
            -- Último año disponible en HBS para cada país
            SELECT
                geo,
                year AS hbs_year,
                food_budget_share
            FROM (
                SELECT
                    geo,
                    year,
                    food_budget_share,
                    ROW_NUMBER() OVER (
                        PARTITION BY geo
                        ORDER BY year DESC
                    ) AS rn
                FROM hbs_food
            )
            WHERE rn = 1
        ),

        -- 4) FAOSTAT CAHD: coste y asequibilidad de dieta saludable ----------
        faostat_base AS (
            SELECT
                geo,
                year,
                item,
                unit,
                value
            FROM read_parquet('{FAOSTAT_PARQUET}')
            WHERE year BETWEEN 2017 AND 2023
        ),
        faostat_agg AS (
            SELECT
                geo,
                year,
                MAX(CASE
                        WHEN item LIKE 'Cost of a healthy diet (CoHD), LCU per person per day%'
                        THEN value
                    END) AS cohd_lcu,
                MAX(CASE
                        WHEN item LIKE 'Cost of a healthy diet (CoHD), PPP dollar per person per day%'
                        THEN value
                    END) AS cohd_ppp,
                MAX(CASE
                        WHEN item LIKE 'Prevalence of unaffordability (PUA), percent%'
                        THEN value
                    END) AS pua,
                MAX(CASE
                        WHEN item LIKE 'Number of people unable to afford a healthy diet%'
                        THEN value
                    END) AS pop_unaffordable
            FROM faostat_base
            GROUP BY geo, year
        ),

        -- 5) Panel conjunto: unión por país-año ------------------------------
        panel AS (
            SELECT
                COALESCE(o.geo, h.geo, f.geo)    AS geo,
                COALESCE(o.year, h.year, f.year) AS year,

                -- OFF
                o.off_index_mean,
                o.share_high_off,
                o.s_energy_mean,
                o.s_sugars_mean,
                o.s_satfat_mean,
                o.s_sodium_mean,

                -- HICP
                h.hicp_total,
                h.hicp_food,

                -- HBS (constante por país; se añade el año de referencia)
                b.food_budget_share,
                b.hbs_year,

                -- FAOSTAT
                f.cohd_lcu,
                f.cohd_ppp,
                f.pua,
                f.pop_unaffordable
            FROM off_agg o
            FULL OUTER JOIN hicp_agg    h
                ON o.geo = h.geo AND o.year = h.year
            FULL OUTER JOIN faostat_agg f
                ON COALESCE(o.geo, h.geo) = f.geo
               AND COALESCE(o.year, h.year) = f.year
            LEFT JOIN hbs_latest b
                ON COALESCE(o.geo, h.geo, f.geo) = b.geo
        )
        SELECT *
        FROM panel
        ORDER BY geo, year
    )
    TO '{PANEL_PARQUET}'
    (FORMAT PARQUET)
""")

PANEL_PARQUET.exists(), PANEL_PARQUET

(True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/panel/panel_coherencia_ue.parquet'))

In [5]:
panel_sample = con.execute(f"""
    SELECT *
    FROM read_parquet('{PANEL_PARQUET}')
    ORDER BY geo, year
    LIMIT 20
""").fetchdf()

panel_sample

Unnamed: 0,geo,year,off_index_mean,share_high_off,s_energy_mean,s_sugars_mean,s_satfat_mean,s_sodium_mean,hicp_total,hicp_food,food_budget_share,hbs_year,cohd_lcu,cohd_ppp,pua,pop_unaffordable
0,AT,2017,27.305814,16.666667,0.529139,0.072024,0.221691,0.26938,103.22,103.04,361.0,2020,1.72,2.06,2.5,0.2
1,AT,2018,23.181746,8.695652,0.386035,0.134682,0.180237,0.226316,105.41,104.59,361.0,2020,1.75,2.11,2.3,0.2
2,AT,2019,27.222276,14.084507,0.434377,0.219073,0.228115,0.207326,106.98,105.74,361.0,2020,1.77,2.19,2.7,0.2
3,AT,2020,29.610361,14.124294,0.457757,0.234592,0.247425,0.244639,108.47,108.19,361.0,2020,1.81,2.3,2.5,0.2
4,AT,2021,28.55871,14.351852,0.431306,0.180786,0.266266,0.26399,111.46,108.94,361.0,2020,1.82,2.42,3.4,0.3
5,AT,2022,26.438366,13.064133,0.412526,0.191649,0.239881,0.213478,121.07,120.63,361.0,2020,2.02,2.75,2.9,0.3
6,AT,2023,27.550979,14.145031,0.435331,0.20293,0.243277,0.220502,130.4,133.92,361.0,2020,2.24,2.91,2.9,0.3
7,BE,2015,31.290754,0.0,0.651182,0.303571,0.023923,0.272953,,,477.0,2020,,,,
8,BE,2016,25.891756,0.0,0.608108,0.258929,0.069378,0.099256,,,477.0,2020,,,,
9,BE,2017,13.688434,0.0,0.254054,0.178929,0.082297,0.032258,104.03,102.82,477.0,2020,1.72,2.0,2.1,0.2


In [6]:
panel_stats = con.execute(f"""
    SELECT
        MIN(year)            AS min_year,
        MAX(year)            AS max_year,
        COUNT(*)             AS n_filas,
        COUNT(DISTINCT geo)  AS n_paises,
        SUM(off_index_mean IS NOT NULL)::INT     AS n_con_off,
        SUM(hicp_total IS NOT NULL)::INT         AS n_con_hicp,
        SUM(food_budget_share IS NOT NULL)::INT  AS n_con_hbs,
        SUM(cohd_ppp IS NOT NULL)::INT           AS n_con_faostat
    FROM read_parquet('{PANEL_PARQUET}')
""").fetchdf()

panel_stats

Unnamed: 0,min_year,max_year,n_filas,n_paises,n_con_off,n_con_hicp,n_con_hbs,n_con_faostat
0,2015,2023,205,27,181,189,197,189
