In [None]:
import os
import json
import requests
import pandas as pd
from plotly import graph_objs as go

In [None]:
# Eurostat data browser page for the prc_hicp_midx table:
# https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_midx/default/table?lang=en
# TSV request for prc_hicp_midx with no filter parameters.
# NOTE: a later cell reassigns url_dataset to a filtered CSV query; when the notebook
# is run top to bottom, that later value is the one the download cell uses.
url_dataset = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/prc_hicp_midx?format=TSV&compressed=false"
# Dimension metadata (JSON) that get_dimension_name / find_dimension_ids read labels from.
# NOTE(review): this URL names prc_hicp_manr, while the data URL and the output file
# name below use prc_hicp_midx — confirm the mismatch is intentional.
url_dimensions = "https://ec.europa.eu/eurostat/databrowser-backend/api/card/1.0/LIVE/json/en/prc_hicp_manr/dimensions?stub=false&filtered=true"
# Directory and file names for the downloaded copies.
path_output_root = "data/scraped/eurostat"
path_output_csv = os.path.join(path_output_root, "estat_prc_hicp_midx-custom.csv")
path_output_dimensions = os.path.join(path_output_root, "estat_prc_hicp_midx_dimensions.json")
# User-Agent header value sent with both download requests.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# When False, a file that already exists on disk is not downloaded again.
can_overwrite = False

In [None]:
# Filtered query taken from this custom data browser view:
# https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_midx__custom_10964667/default/table?lang=en
# This reassigns url_dataset, replacing the value set in the configuration cell.
# Filters carried in the query string: c[freq]=M, c[unit]=I15, the CP091... codes in
# c[coicop], an explicit c[geo] list, and c[time] listing months from 2024-03 back to
# 2015-01. The response format is requested as csvdata (formatVersion 2.0).
url_dataset = """https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/prc_hicp_midx/1.0/*.*.*.*?c[freq]=M&c[unit]=I15&c[coicop]=CP091,CP0911,CP09111,CP09112,CP09113,CP09119,CP0912,CP09121,CP09122,CP09123,CP0913,CP09131,CP09132,CP09133,CP09134,CP0914,CP09141,CP09142,CP09149,CP0915&c[geo]=EU,EU27_2020,EU28,EA,EA20,EA19,BE,BG,CZ,DK,DE,EE,IE,EL,ES,FR,HR,IT,CY,LV,LT,LU,HU,MT,NL,AT,PL,PT,RO,SI,SK,FI,SE,EEA,IS,NO,CH,UK,ME,MK,AL,RS,TR,XK,US&compress=false&format=csvdata&formatVersion=2.0&c[time]=2024-03,2024-02,2024-01,2023-12,2023-11,2023-10,2023-09,2023-08,2023-07,2023-06,2023-05,2023-04,2023-03,2023-02,2023-01,2022-12,2022-11,2022-10,2022-09,2022-08,2022-07,2022-06,2022-05,2022-04,2022-03,2022-02,2022-01,2021-12,2021-11,2021-10,2021-09,2021-08,2021-07,2021-06,2021-05,2021-04,2021-03,2021-02,2021-01,2020-12,2020-11,2020-10,2020-09,2020-08,2020-07,2020-06,2020-05,2020-04,2020-03,2020-02,2020-01,2019-12,2019-11,2019-10,2019-09,2019-08,2019-07,2019-06,2019-05,2019-04,2019-03,2019-02,2019-01,2018-12,2018-11,2018-10,2018-09,2018-08,2018-07,2018-06,2018-05,2018-04,2018-03,2018-02,2018-01,2017-12,2017-11,2017-10,2017-09,2017-08,2017-07,2017-06,2017-05,2017-04,2017-03,2017-02,2017-01,2016-12,2016-11,2016-10,2016-09,2016-08,2016-07,2016-06,2016-05,2016-04,2016-03,2016-02,2016-01,2015-12,2015-11,2015-10,2015-09,2015-08,2015-07,2015-06,2015-05,2015-04,2015-03,2015-02,2015-01"""
# The c[time] list is spelled out month by month, so months after 2024-03 are not requested.

In [None]:
# Make sure the cache directory exists (no-op when it is already there).
os.makedirs(path_output_root, exist_ok=True)

# Download the dataset once; later runs reuse the file unless can_overwrite is set.
if not os.path.exists(path_output_csv) or can_overwrite:
	print("Downloading CSV...")
	headers = {
		"User-Agent": user_agent
	}
	# (connect, read) timeouts in seconds, so a stalled connection cannot hang the kernel.
	response = requests.get(url_dataset, headers=headers, timeout=(10, 300))
	# Fail before touching the disk: otherwise an HTTP error body would be saved as the
	# CSV and, because the file then exists, it would never be downloaded again.
	response.raise_for_status()
	with open(path_output_csv, "wb") as file:
		# Write the decoded body back out as UTF-8 bytes.
		file.write(response.text.encode("utf-8"))
	print("Done.")
else:
	print("CSV already exists - set can_overwrite to True to overwrite.")

In [None]:
# Download the dimension metadata once; later runs reuse the file unless can_overwrite is set.
if not os.path.exists(path_output_dimensions) or can_overwrite:
	print("Downloading dimensions JSON...")
	# Create the output directory here as well, so this cell does not depend on the
	# CSV download cell having run first.
	os.makedirs(path_output_root, exist_ok=True)
	headers = {
		"User-Agent": user_agent
	}
	# (connect, read) timeouts in seconds, so a stalled connection cannot hang the kernel.
	response = requests.get(url_dimensions, headers=headers, timeout=(10, 60))
	# Fail before touching the disk: otherwise an HTTP error body would be cached as the
	# JSON file and only surface later as a parse or lookup error.
	response.raise_for_status()
	with open(path_output_dimensions, "w") as file:
		file.write(response.text)
	print("Done.")
else:
	print("Dimensions JSON already exists - set can_overwrite to True to overwrite.")

In [None]:
# Definitions: https://webgate.ec.europa.eu/sdmxregistry/
# Classes: https://showvoc.op.europa.eu/#/datasets/ESTAT_European_Classification_of_Individual_Consumption_according_to_Purpose_%28ECOICOP%29/data?resId=http:%2F%2Fdata.europa.eu%2Fed1%2Fecoicop%2Fclasses

In [None]:
# Read the cached CSV and turn it into a frame indexed by TIME_PERIOD.
df = (
	pd.read_csv(path_output_csv)
	# STRUCTURE, STRUCTURE_ID and OBS_FLAG are not used further down
	.drop(columns=["STRUCTURE", "STRUCTURE_ID", "OBS_FLAG"])
	# empty-string cells become NaN
	.replace("", float("nan"))
	# parse TIME_PERIOD as datetime, then move every timestamp forward by one month
	.assign(TIME_PERIOD=lambda frame: pd.to_datetime(frame["TIME_PERIOD"]) + pd.DateOffset(months=1))
	.set_index("TIME_PERIOD")
)
df.head()

In [None]:
# Load the cached dimension metadata. The context manager closes the file handle;
# a bare open() passed straight to json.load() is never explicitly closed.
with open(path_output_dimensions) as file:
	dimensions = json.load(file)
dimensions.keys()

In [None]:
def get_dimension_name(dimension_id: str) -> str:
	"""Return the label stored for ``dimension_id`` in the dimensions metadata.

	Reads the module-level ``dimensions`` dict and looks the code up under
	``link -> item[2] -> category -> label``. Raises ``KeyError`` when the
	code has no entry in that mapping.
	"""
	# NOTE(review): item[2] is presumably the COICOP dimension — confirm against the JSON.
	label_by_code = dimensions["link"]["item"][2]["category"]["label"]
	return label_by_code[dimension_id]

# Quick check of the lookup: print the label stored for code CP09131.
print(get_dimension_name("CP09131"))

In [None]:
def find_dimension_ids(search_query: str) -> list:
	"""Find dimension codes whose label contains ``search_query``.

	The match is a case-insensitive substring test against every label under
	``link -> item[2] -> category -> label`` of the module-level ``dimensions``
	dict. Returns a list of ``(code, label)`` tuples ordered by code; the list
	is empty when nothing matches.
	"""
	label_by_code = dimensions["link"]["item"][2]["category"]["label"]
	matches = [
		(code, label)
		for code, label in label_by_code.items()
		if search_query.lower() in label.lower()
	]
	# order by the code, i.e. the first element of each pair
	matches.sort(key=lambda pair: pair[0])
	return matches

# Example search: report every code whose label contains the query text.
query = "graph"
res = find_dimension_ids(query)
print(f"Found {len(res)} results for '{query}':", json.dumps(res, indent=2), sep="\n")

In [None]:
# Show the label for CP0913; as the cell's last expression, the return value is displayed.
# Alternative lookup kept for reference:
# get_dimension_name("CPC45230")
get_dimension_name("CP0913")

In [None]:
# Search the labels for "information"; the commented-out calls are alternative queries.
# find_dimension_ids("electronic")
# find_dimension_ids("process")
find_dimension_ids("information")
# Code of interest: CP0913 - information processing equipment

In [None]:
def get_fig(coicop_id: str, countries: list[str], start_month: str, end_month: str) -> go.Figure:
	"""Build a line chart of ``OBS_VALUE`` over time, one trace per entry in ``countries``.

	Rows are taken from the module-level ``df`` and the chart title uses
	``get_dimension_name(coicop_id)``.

	Args:
		coicop_id: value matched against the ``coicop`` column.
		countries: values matched against the ``geo`` column; one trace is added
			per entry, in the given order.
		start_month: lower bound, inclusive, compared against the frame's index.
		end_month: upper bound, inclusive, compared against the frame's index.

	Returns:
		A 1280x720 ``go.Figure`` with the legend shown.
	"""
	category_rows = df.loc[df["coicop"] == coicop_id]
	figure = go.Figure()
	# To plot every geo present for this code, pass category_rows["geo"].unique() as countries.
	for geo_code in countries:
		# keep this geo's rows, restricted to the I15 unit
		series_rows = category_rows.loc[
			(category_rows["geo"] == geo_code) & (category_rows["unit"] == "I15")
		]
		in_window = (series_rows.index >= start_month) & (series_rows.index <= end_month)
		series_rows = series_rows.loc[in_window]
		trace = go.Scatter(
			x=series_rows.index,
			y=series_rows["OBS_VALUE"],
			mode="lines",
			name=geo_code
		)
		figure.add_trace(trace)
	# all layout settings applied in a single call
	figure.update_layout(
		title=f"HICP index values for {get_dimension_name(coicop_id)} ({coicop_id})",
		xaxis_title="Month",
		yaxis_title="Index (2015=100)",
		showlegend=True,
		width=1280,
		height=720,
		margin=dict(l=20, r=10, t=50, b=10)
	)
	return figure

# Chart parameters.
coicop_id = "CP0913"  # another code tried here: "CP09132"
countries = ["EU", "EEA", "SI", "DE", "AT", "HR"]
# Inclusive bounds; get_fig compares them against the TIME_PERIOD index of df,
# which was moved forward by one month when df was built.
start_month, end_month = "2015-01", "2024-01"
fig = get_fig(
	coicop_id=coicop_id,
	countries=countries,
	start_month=start_month,
	end_month=end_month,
)
fig.show()