# 12 – ETL de Eurostat: HICP (UE-27, 2015–2023)

En este notebook se transforma el fichero tabular de Eurostat:

- `data_raw/eurostat/prc_hicp_aind_tabular.tsv`

**Objetivos:**

- Pasar de formato ancho (una columna por año) a formato largo (una fila por año).
- Descomponer la columna compuesta `freq,unit,coicop,geo\TIME_PERIOD` en:
  - `freq` (frecuencia, anual),
  - `unit` (unidad de medida; aquí solo se conserva `INX_A_AVG`),
  - `coicop` (categoría HICP/COICOP),
  - `geo` (país / agregado).
- Filtrar:
  - Frecuencia anual (`freq = 'A'`),
  - Índice medio anual (`unit = 'INX_A_AVG'`),
  - Países de la UE-27,
  - Años 2015–2023.
- Limpiar los valores:
  - `:` → `NULL`,
  - valores con banderas de calidad (`"104.37 b"`, etc.) → usar solo la parte numérica.
- Guardar el resultado en:

`data_processed/eurostat/hicp_eu27_2015_2023.parquet`

In [1]:
from pathlib import Path
import duckdb
import pandas as pd

# Project root: the parent of the directory one level above the working directory.
ROOT_DIR = Path("..").resolve().parent

# Raw-input and processed-output folders for the Eurostat datasets.
DATA_RAW = ROOT_DIR.joinpath("data_raw", "eurostat")
DATA_PROCESSED = ROOT_DIR.joinpath("data_processed", "eurostat")

# Original Eurostat HICP file (tab-separated).
HICP_PATH = DATA_RAW.joinpath("prc_hicp_aind_tabular.tsv")

# Output file (long-format Parquet restricted to the EU-27).
HICP_PARQUET = DATA_PROCESSED.joinpath("hicp_eu27_2015_2023.parquet")

# Show the resolved root and whether the input file is actually present.
ROOT_DIR, HICP_PATH.exists(), HICP_PATH

(WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm'),
 True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_raw/eurostat/prc_hicp_aind_tabular.tsv'))

In [2]:
# In-memory DuckDB connection, reused by the cells below.
con = duckdb.connect(database=":memory:")

# Small preview (first 5 rows) as a reminder of the raw file's structure.
preview_sql = f"""
    SELECT *
    FROM read_csv_auto('{HICP_PATH}', delim='\t', header=TRUE)
    LIMIT 5
"""
hicp_preview = con.execute(preview_sql).fetchdf()

hicp_preview

Unnamed: 0,"freq,unit,coicop,geo\TIME_PERIOD",1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,"A,CID_EA,TOT_X_NRG_FOOD,AT",:,:,:,:,:,:,-0.4,-0.6,-0.3,...,0.7,0.8,1.1,0.8,0.6,1.3,0.8,1.1,2.4,1.1
1,"A,CID_EA,TOT_X_NRG_FOOD,BE",:,:,:,:,:,:,-0.3,-0.3,-0.6,...,0.6,0.9,0.5,0.3,0.5,0.7,-0.2,0.0,1.0,0.5
2,"A,CID_EA,TOT_X_NRG_FOOD,BG",:,:,:,:,:,:,5.3,0.3,0.3,...,-1.7,-1.8,-1.5,1.1,0.8,0.5,-0.1,3.7,4.0,0.3
3,"A,CID_EA,TOT_X_NRG_FOOD,CY",:,:,:,:,:,:,-1.4,0.1,-2.0,...,-1.5,-1.6,-0.6,-0.9,-0.2,-1.5,-0.1,1.0,-1.1,-0.2
4,"A,CID_EA,TOT_X_NRG_FOOD,CZ",:,:,:,:,:,:,0.1,-1.2,0.1,...,-0.3,0.4,0.9,0.7,1.1,2.4,2.1,8.1,4.4,1.2


In [3]:
# Column names and types as DuckDB infers them from the raw TSV.
schema_sql = f"""
    DESCRIBE
    SELECT *
    FROM read_csv_auto('{HICP_PATH}', delim='\t', header=TRUE)
"""
hicp_schema = con.execute(schema_sql).fetchdf()

# Only the first few columns are shown; the rest are the per-year columns.
hicp_schema.head()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,"freq,unit,coicop,geo\TIME_PERIOD",VARCHAR,YES,,,
1,1996,VARCHAR,YES,,,
2,1997,VARCHAR,YES,,,
3,1998,VARCHAR,YES,,,
4,1999,VARCHAR,YES,,,


In [4]:
# Actual name of the first column as read from the file
# (freq,unit,coicop,geo\TIME_PERIOD); it is reused later to build the SQL.
first_col = hicp_preview.columns.tolist()[0]
first_col

'freq,unit,coicop,geo\\TIME_PERIOD'

In [5]:
# Years to keep (2015–2023 inclusive) and their comma-separated form for SQL.
years = [*range(2015, 2024)]
years_sql = ", ".join(map(str, years))

# EU-27 countries, as Eurostat geo codes.
eu27_codes = [
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI",
    "FR", "DE", "EL", "HU", "IE", "IT", "LV", "LT", "LU",
    "MT", "NL", "PL", "PT", "RO", "SK", "SI", "ES", "SE",
]

# The same codes as single-quoted SQL literals, ready for an IN (...) list.
eu27_list_sql = ", ".join("'" + code + "'" for code in eu27_codes)

# Quick visual check of both lists.
years, eu27_codes[:5]

([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023],
 ['AT', 'BE', 'BG', 'HR', 'CY'])

In [6]:
# Make sure the output folder exists before DuckDB writes into it.
HICP_PARQUET.parent.mkdir(parents=True, exist_ok=True)

# The paths are embedded as SQL string literals, so any single quote they
# contain is doubled to keep the statement well-formed.
hicp_path_sql = str(HICP_PATH).replace("'", "''")
hicp_parquet_sql = str(HICP_PARQUET).replace("'", "''")

# One statement reads the TSV, reshapes it to long format, cleans the values
# and writes the Parquet file:
#   1. base  - split the composite first column into freq / unit / coicop / geo.
#   2. long  - UNPIVOT the selected year columns into (year, value) rows.
#   3. final - cast year to INTEGER, turn value into a DOUBLE (':' and empty
#              strings become NULL) and keep annual INX_A_AVG rows for EU-27.
#
# Value cleaning trims the cell BEFORE taking the first space-delimited token.
# Trimming after the split never removes anything (the token has no spaces),
# and a cell with leading whitespace such as ' 104.37 b' would yield an empty
# first token and be silently stored as NULL.
con.execute(f"""
    COPY (
        WITH base AS (
            -- Leemos el TSV y separamos la primera columna en 4 campos
            SELECT
                split_part("{first_col}", ',', 1) AS freq,
                split_part("{first_col}", ',', 2) AS unit,
                split_part("{first_col}", ',', 3) AS coicop,
                split_part("{first_col}", ',', 4) AS geo,
                *
            FROM read_csv_auto('{hicp_path_sql}', delim='\t', header=TRUE)
        ),
        long AS (
            -- Pasamos de columnas por año a filas (UNPIVOT)
            SELECT
                freq,
                unit,
                coicop,
                geo,
                year,
                value
            FROM base
            UNPIVOT (
                value FOR year IN ({years_sql})
            )
        )
        SELECT
            freq,
            unit,
            coicop,
            geo,
            CAST(year AS INTEGER) AS year,
            -- Limpieza de valor HICP:
            --   - nos quedamos con el primer token antes del espacio (ej. '104.37 b' -> '104.37')
            --   - convertimos ':' y cadenas vacías a NULL
            --   - casteamos a DOUBLE
            NULLIF(
                NULLIF(split_part(TRIM(value), ' ', 1), ''),
                ':'
            )::DOUBLE AS hicp_value
        FROM long
        WHERE
            -- Frecuencia anual
            freq = 'A'
            -- Índice medio anual
            AND unit = 'INX_A_AVG'
            -- Países UE-27
            AND geo IN ({eu27_list_sql})
    )
    TO '{hicp_parquet_sql}'
    (FORMAT PARQUET)
""")

# Confirm that the output file was written.
HICP_PARQUET.exists(), HICP_PARQUET

(True,
 WindowsPath('C:/Users/santi/OneDrive - UNIR/UNIR/MASTER ANÁLISIS Y VISUALIZACIÓN BIG DATA/TFM/dashboard-coherencia-ue-tfm/data_processed/eurostat/hicp_eu27_2015_2023.parquet'))

In [7]:
# Read back the first 10 rows of the Parquet output, ordered by country,
# category and year, as a sanity check of the long format.
sample_sql = f"""
    SELECT *
    FROM read_parquet('{HICP_PARQUET}')
    ORDER BY geo, coicop, year
    LIMIT 10
"""
hicp_eu_sample = con.execute(sample_sql).fetchdf()

hicp_eu_sample

Unnamed: 0,freq,unit,coicop,geo,year,hicp_value
0,A,INX_A_AVG,AP,AT,2015,100.0
1,A,INX_A_AVG,AP,AT,2016,102.26
2,A,INX_A_AVG,AP,AT,2017,104.37
3,A,INX_A_AVG,AP,AT,2018,107.28
4,A,INX_A_AVG,AP,AT,2019,109.62
5,A,INX_A_AVG,AP,AT,2020,112.02
6,A,INX_A_AVG,AP,AT,2021,114.04
7,A,INX_A_AVG,AP,AT,2022,117.97
8,A,INX_A_AVG,AP,AT,2023,124.33
9,A,INX_A_AVG,APF,AT,2015,100.0


In [8]:
# Quick summary of the output: year range, row count, and the number of
# distinct countries and COICOP categories.
stats_sql = f"""
    SELECT
        MIN(year) AS min_year,
        MAX(year) AS max_year,
        COUNT(*) AS n_filas,
        COUNT(DISTINCT geo) AS n_paises,
        COUNT(DISTINCT coicop) AS n_coicop
    FROM read_parquet('{HICP_PARQUET}')
"""
hicp_stats = con.execute(stats_sql).fetchdf()

hicp_stats

Unnamed: 0,min_year,max_year,n_filas,n_paises,n_coicop
0,2015,2023,98631,27,465


In [9]:
# Column names and data types of the resulting Parquet file.
schema_out_sql = f"""
    DESCRIBE
    SELECT *
    FROM read_parquet('{HICP_PARQUET}')
"""
hicp_schema_out = con.execute(schema_out_sql).fetchdf()

hicp_schema_out

Unnamed: 0,column_name,column_type,null,key,default,extra
0,freq,VARCHAR,YES,,,
1,unit,VARCHAR,YES,,,
2,coicop,VARCHAR,YES,,,
3,geo,VARCHAR,YES,,,
4,year,INTEGER,YES,,,
5,hicp_value,DOUBLE,YES,,,
