In [None]:
import json
import os
import warnings

import pandas as pd
import requests
from plotly import graph_objs as go

In [None]:
# Dataset page in the Eurostat data browser:
# https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_midx/default/table?lang=en
# Download URL for the prc_hicp_midx dataset, requested as uncompressed TSV.
url_dataset = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/prc_hicp_midx?format=TSV&compressed=false"
# JSON dimension metadata; later cells read the id -> label mapping from it.
# NOTE(review): this URL targets prc_hicp_manr while the dataset URL and the output
# file name below use prc_hicp_midx - confirm this is intentional.
url_dimensions = "https://ec.europa.eu/eurostat/databrowser-backend/api/card/1.0/LIVE/json/en/prc_hicp_manr/dimensions?stub=false&filtered=true"
# Local cache directory and the two cached file paths.
path_output_root = "data/scraped/eurostat"
# NOTE: despite the "csv" in the variable name, this file is tab-separated (.tsv).
path_output_csv = os.path.join(path_output_root, "estat_prc_hicp_midx.tsv")
path_output_dimensions = os.path.join(path_output_root, "estat_prc_hicp_midx_dimensions.json")
# User-Agent header value sent with both download requests.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# When False, files that already exist on disk are not downloaded again.
can_overwrite = False

In [None]:
# Create the cache directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(path_output_root, exist_ok=True)

# Download the dataset only when it is not cached yet (or overwriting is allowed).
if not os.path.exists(path_output_csv) or can_overwrite:
	print("Downloading CSV...")
	headers = {
		"User-Agent": user_agent
	}
	# (connect, read) timeouts keep the cell from hanging forever on a stalled connection.
	response = requests.get(url_dataset, headers=headers, timeout=(10, 300))
	# Fail loudly on HTTP errors; otherwise an error page would be cached as the
	# dataset and silently reused on every later run.
	response.raise_for_status()
	with open(path_output_csv, "wb") as file:
		file.write(response.text.encode("utf-8"))
	print("Done.")
else:
	print("CSV already exists - set can_overwrite to True to overwrite.")

In [None]:
# Download the dimension metadata only when it is not cached yet (or overwriting is allowed).
if not os.path.exists(path_output_dimensions) or can_overwrite:
	print("Downloading dimensions JSON...")
	headers = {
		"User-Agent": user_agent
	}
	# (connect, read) timeouts keep the cell from hanging forever on a stalled connection.
	response = requests.get(url_dimensions, headers=headers, timeout=(10, 60))
	# Fail loudly on HTTP errors; otherwise an error body would be cached as the
	# metadata file and break the JSON loading cell on every later run.
	response.raise_for_status()
	# NOTE(review): the file is written with the platform default text encoding, which
	# matches how it is read back later - consider switching both sides to UTF-8 together.
	with open(path_output_dimensions, "w") as file:
		file.write(response.text)
	print("Done.")
else:
	print("Dimensions JSON already exists - set can_overwrite to True to overwrite.")

In [None]:
# Definitions: https://webgate.ec.europa.eu/sdmxregistry/

In [None]:
# Prefer the local "-old2" snapshot when it is present (the previous behaviour);
# otherwise fall back to the file downloaded above, so the notebook also runs on a
# fresh checkout instead of failing with FileNotFoundError.
path_input_csv = path_output_csv.replace(".tsv", "-old2.tsv")
if not os.path.exists(path_input_csv):
	path_input_csv = path_output_csv
df = pd.read_csv(path_input_csv, sep="\t")

# Header row looks like: freq,unit,coicop,geo\TIME_PERIOD	1996-01 	1996-02 	...
# The first column packs four comma-separated keys, so split it into four columns.
key_column = "freq,unit,coicop,geo\\TIME_PERIOD"
key_names = ["freq", "unit", "coicop", "geo"]
df[key_names] = df[key_column].str.split(",", expand=True)
df = df.drop(columns=[key_column])
# The new key columns were appended at the end; move them to the front.
df = df[key_names + list(df.columns[:-4])]
# Month headers carry trailing spaces in the TSV.
df.columns = df.columns.str.strip()


def _parse_observations(column: pd.Series) -> pd.Series:
	"""Convert one month column to numbers, keeping values that carry a status flag.

	Plain conversion is tried first. Cells it cannot parse are either missing-value
	placeholders (":") or a number followed by a flag (e.g. "104.2 p"); for those
	the leading number is extracted, so flagged observations are no longer dropped.
	Cells without a leading number end up as NaN.
	"""
	numbers = pd.to_numeric(column, errors="coerce")
	unparsed = numbers.isna() & column.notna()
	if unparsed.any():
		leading_number = column[unparsed].astype(str).str.extract(r"^\s*(-?\d+(?:\.\d+)?)", expand=False)
		numbers = numbers.astype("float64")
		numbers.loc[unparsed] = pd.to_numeric(leading_number, errors="coerce")
	return numbers


# Convert every column except the four key columns to numeric values.
df[df.columns[4:]] = df[df.columns[4:]].apply(_parse_observations)
df.head()

In [None]:
# Load the cached dimension metadata; the context manager closes the file handle,
# which the previous bare open() call left to the garbage collector.
with open(path_output_dimensions) as file:
	dimensions = json.load(file)
dimensions.keys()

In [None]:
def get_dimension_name(dimension_id: str) -> str:
	"""Return the label stored for ``dimension_id`` in the loaded dimension metadata.

	The lookup uses the label mapping of the third entry of
	``dimensions["link"]["item"]`` (presumably the COICOP dimension - confirm
	against the metadata file). Raises ``KeyError`` when the id is not present.
	"""
	third_item = dimensions["link"]["item"][2]
	labels = third_item["category"]["label"]
	return labels[dimension_id]

# Quick check of the lookup with a known id.
sample_label = get_dimension_name("CP09131")
print(sample_label)

In [None]:
def find_dimension_ids(search_query: str) -> list:
	"""Return ``(id, label)`` tuples whose label contains ``search_query``.

	Matching is a case-insensitive substring test against the same label mapping
	that ``get_dimension_name`` reads; results keep the mapping's iteration order.
	"""
	needle = search_query.lower()
	labels = dimensions["link"]["item"][2]["category"]["label"]
	return [(dim_id, label) for dim_id, label in labels.items() if needle in label.lower()]

# Example search: list every id whose label mentions the query text.
query = "graph"
res = find_dimension_ids(search_query=query)
print(f"Found {len(res)} results for '{query}':")
formatted_matches = json.dumps(res, indent=2)
print(formatted_matches)

In [None]:
# Sanity check: the surrounding quotes make any whitespace left over from parsing visible.
leftover_checks = [
	# cell at row 1, column 3
	f"Value = '{df.iloc[1, 3]}'",
	# column name at index 30
	f"Column name = '{df.columns[30]}'",
]
print("\n".join(leftover_checks))

In [None]:
# Category to plot. "CP091" is audio-visual, photographic and information processing
# equipment; use "CP09131" instead to look at personal computers only.
coicp_id = "CP091"

# keep only the rows of the chosen category
df_pc = df.loc[df["coicop"] == coicp_id]

df_pc.head()

In [None]:
# The first four columns are the keys, so the month columns start at position 4.
first_month, last_month = df_pc.columns[4], df_pc.columns[-1]
print("First month:", first_month)
print("Last month:", last_month)

In [None]:
# Boolean masks must be combined with "&" (element-wise), not the Python keyword "and".
is_slovenia = df_pc["geo"] == "SI"
is_unit_i15 = df_pc["unit"] == "I15"
df_pc_si = df_pc[is_slovenia & is_unit_i15]
df_pc_si.head()

In [None]:
# Value in the last column of the first matching row.
last_value_si = df_pc_si.iloc[0, -1]
print(f"Last column value for Slovenia: '{last_value_si}'")

In [None]:
# Column positions of the first and last month of the plotted window.
month_columns = df_pc_si.columns
index_2019_01 = month_columns.get_loc("2019-01")
index_2023_12 = month_columns.get_loc("2023-12")
print(f"Index of 2019-01: '{index_2019_01}'")

In [None]:
# Line chart of the selected category for five geographies, 2019-01 to 2023-12.

# Rows per geography, restricted to unit "I15"; the first matching row is plotted.
is_unit_i15 = df_pc["unit"] == "I15"
df_pc_si = df_pc[(df_pc["geo"] == "SI") & is_unit_i15]
df_pc_de = df_pc[(df_pc["geo"] == "DE") & is_unit_i15]
df_pc_eu = df_pc[(df_pc["geo"] == "EU") & is_unit_i15]
df_pc_at = df_pc[(df_pc["geo"] == "AT") & is_unit_i15]
df_pc_hr = df_pc[(df_pc["geo"] == "HR") & is_unit_i15]

# Column positions of the plotted window; the end month is inclusive.
month_window = slice(index_2019_01, index_2023_12 + 1)

fig = go.Figure()

# One line per geography, added in the order they should appear in the legend.
for trace_name, frame in (
	("Slovenia", df_pc_si),
	("Germany", df_pc_de),
	("EU", df_pc_eu),
	("Austria", df_pc_at),
	("Croatia", df_pc_hr),
):
	fig.add_trace(go.Scatter(
		x=frame.columns[month_window],
		y=frame.iloc[0, month_window],
		mode="lines",
		name=trace_name,
	))

# Title and axis labels.
fig.update_layout(
	title=f"HICP index values for {get_dimension_name(coicp_id)} ({coicp_id})",
	xaxis_title="Month",
	yaxis_title="Index (2015=100)",
)

# Dashed vertical marker at 2020-01.
fig.add_vline(x="2020-01", line_width=1, line_dash="dash", line_color="black")

# Always show the legend.
fig.update_layout(showlegend=True)

fig.show()

In [None]:
# Label lookup for a single id (another id tried earlier: "CPC45230").
get_dimension_name(dimension_id="CP0013")


In [None]:
# Search the labels by keyword (other queries tried: "electronic", "process").
# Relevant hit: CP0913 - information processing equipment
find_dimension_ids(search_query="information")

In [None]:
def get_fig(coicop_id: str, countries: list[str], start_month: str, end_month: str) -> go.Figure:
	"""Build a line chart of unit-"I15" index values for one category.

	Args:
		coicop_id: Value of the ``coicop`` column to plot; also used for the title.
		countries: Values of the ``geo`` column, one line each, in legend order.
		start_month: Name of the first month column to plot (e.g. "2019-01").
		end_month: Name of the last month column to plot (inclusive).

	Returns:
		A 1280x720 figure with one line per geography that has data. Geographies
		without a matching row are skipped with a warning instead of failing with
		an opaque IndexError.

	Raises:
		KeyError: If ``start_month`` or ``end_month`` is not a column of ``df``, or
			if ``coicop_id`` has no label in the dimension metadata.
	"""
	df_id = df[(df["coicop"] == coicop_id) & (df["unit"] == "I15")]
	# Every row shares the same columns, so resolve the month window once
	# instead of repeating the lookups for each geography.
	month_window = slice(df_id.columns.get_loc(start_month), df_id.columns.get_loc(end_month) + 1)
	months = df_id.columns[month_window]
	fig = go.Figure()
	for country in countries:
		df_country = df_id[df_id["geo"] == country]
		if df_country.empty:
			warnings.warn(f"No I15 data for '{country}' in '{coicop_id}' - skipping.", stacklevel=2)
			continue
		fig.add_trace(go.Scatter(
			x=months,
			# plot the first matching row
			y=df_country.iloc[0, month_window],
			mode="lines",
			name=country
		))
	fig.update_layout(
		title=f"HICP index values for {get_dimension_name(coicop_id)} ({coicop_id})",
		xaxis_title="Month",
		yaxis_title="Index (2015=100)",
		showlegend=True,
		width=1280,
		height=720,
		margin=dict(l=20, r=10, t=50, b=10),
	)
	return fig

# Category to plot: "CP0913" (information processing equipment); "CP09132" is an
# alternative. The previous value "CP9013" had two digits transposed, matches no
# row in the data and made this cell fail.
coicop_id = "CP0913"
countries = ["SI", "DE", "EU", "AT", "HR"]
start_month = "2019-01"
end_month = "2023-12"
fig = get_fig(coicop_id, countries, start_month, end_month)
fig.show()