In [None]:
import os
import pandas as pd
import requests
import zipfile
from plotly import graph_objects as go

In [None]:
# Notebook configuration: data source, cache locations and download options.
# Indicator page: https://data.worldbank.org/indicator/FP.CPI.TOTL.ZG
# NOTE(review): the page linked above is the ".ZG" indicator, while the plotting
# cells below treat the values as an index with 2010 = 100 - confirm which
# indicator the downloaded extract actually contains.
url_cpi = (
	"https://databank.worldbank.org/AjaxDownload/FileDownloadHandler.ashx"
	"?filename=P_d567e128-13df-4fdb-9100-3897a79cbdcb.zip"
	"&filetype=CSV&language=en"
	"&displayfile=P_Data_Extract_From_World_Development_Indicators.zip"
)

# Everything downloaded or extracted by this notebook lives under this folder.
path_output_root = "data/scraped/world-bank"
path_zip, path_csv = (
	os.path.join(path_output_root, filename)
	for filename in ("cpi.zip", "cpi.csv")
)

# User-Agent header sent with the download request.
user_agent = (
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	"AppleWebKit/537.36 (KHTML, like Gecko) "
	"Chrome/120.0.0.0 Safari/537.36"
)

# When True, the archive is downloaded and extracted again even if cached copies exist.
can_overwrite = False


In [None]:
# Download the World Bank archive (unless a cached copy exists) and extract
# its data CSV to path_csv. Set can_overwrite to force both steps again.

# Make sure the cache folder exists; exist_ok makes this safe to repeat.
os.makedirs(path_output_root, exist_ok=True)

if not os.path.exists(path_zip) or can_overwrite:
	print("Downloading cpi data from World Bank...")
	headers = {"User-Agent": user_agent}
	# A timeout keeps the cell from hanging forever on a stalled connection.
	response = requests.get(url_cpi, headers=headers, timeout=60)
	# Fail loudly on HTTP errors instead of caching an error page as "cpi.zip".
	response.raise_for_status()
	# Write to a temporary name first so a bad download never replaces or
	# masquerades as a valid cached archive.
	path_zip_partial = path_zip + ".part"
	with open(path_zip_partial, "wb") as f:
		f.write(response.content)
	if not zipfile.is_zipfile(path_zip_partial):
		os.remove(path_zip_partial)
		raise ValueError(f"Response from '{url_cpi}' is not a zip archive.")
	os.replace(path_zip_partial, path_zip)
	print("Done.")
else:
	print("Using cached cpi data.")

if not os.path.exists(path_csv) or can_overwrite:
	print("Extracting cpi data...")
	with zipfile.ZipFile(path_zip) as z:
		namelist = z.namelist()
		print(f"Namelist: {namelist}")
		# The first archive member without "Metadata" in its name is taken as the data file.
		for name in namelist:
			if "Metadata" not in name:
				print(f"Extracting '{name}' to '{path_csv}'...")
				with open(path_csv, "wb") as f:
					f.write(z.read(name))
				break
		else:
			# Nothing was written; stop here rather than report success.
			raise ValueError(f"No data file found in '{path_zip}' (members: {namelist}).")
	print("Done.")
else:
	print("Using already extracted cpi data.")

In [None]:
# Load the extracted CSV into df and print a short summary of its contents.
# - skipfooter=5 drops the last 5 lines of the file (presumably the extract's
#   non-data footer - verify against the CSV); skipfooter needs the python engine.
# - ".." cells are read as NaN.
df = pd.read_csv(path_csv, skipfooter=5, engine="python", na_values="..")

# Strip the bracketed suffix from column names: keep only the text before the
# first "[" (surrounding whitespace removed).
bracket_renames = {
	col: col.split("[")[0].strip()
	for col in df.columns
	if "[" in col
}
df = df.rename(columns=bracket_renames)
original_columns = list(df.columns)

# The first two cells of the first row are reported as indicator name and code.
series_name, series_code = df.iloc[0, 0], df.iloc[0, 1]

# Some layouts carry "Indicator Name"/"Indicator Code" columns; drop them when present.
if "Indicator Name" in df.columns:
	df = df.drop(columns=["Indicator Name", "Indicator Code"])

countries_count = len(df)
# Year columns are those whose name is purely numeric.
year_columns = [col for col in df.columns if col.isnumeric()]

print(f"Indicator name: '{series_name}'")
print(f"Indicator code: '{series_code}'")
print(f"Countries / groups count: {countries_count}")
print(f"Data from year {year_columns[0]} to {year_columns[-1]}")
df.head()

In [None]:
# Summary of df: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the numeric columns of df.
df.describe()

In [None]:
# Line chart of the indicator for Slovenia, from the "2010" column to the last column.
# Position of the "2010" column; later cells reuse it to slice the same year range.
i_2010 = df.columns.get_loc("2010")
x = df.columns[i_2010:]

# First row whose "Country Name" is Slovenia, restricted to the selected year columns.
slovenia_rows = df.loc[df["Country Name"] == "Slovenia"]
y = slovenia_rows.iloc[0, i_2010:]

fig = go.Figure(
	data=[go.Scatter(x=x, y=y, mode="lines+markers", name="Slovenia")]
)
fig.update_layout(
	title=f"{series_name} in Slovenia",
	xaxis_title="Year",
	yaxis_title="CPI",
)
fig.show()

In [None]:
# Same chart as above, with one line per country for a handful of economies.
countries = ["United States", "Germany", "United Kingdom", "Japan"]
x = df.columns[i_2010:]

fig = go.Figure()
for country in countries:
	# First matching row for this country, limited to the columns from 2010 on.
	country_rows = df.loc[df["Country Name"] == country]
	y = country_rows.iloc[0, i_2010:]
	fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name=country))

fig.update_layout(
	title=f"{series_name} in selected countries",
	xaxis_title="Year",
	yaxis_title="CPI",
	# Place the legend at the top-left corner of the plot area.
	legend=dict(x=0, y=1.0),
)
fig.show()

In [None]:
# Deflation example, written as an explicit loop: a sample price series with
# the same nominal value (1000) for every year column from 2010 on, plotted
# as-is and again after adjusting with the United States values from df.
x = df.columns[i_2010:]
y = [1000] * len(x)
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Linear"))

# Row of United States values for the same year columns.
y_i = df.loc[df["Country Name"] == "United States"].iloc[0, i_2010:]
for i in range(len(y)):
	# y_i is labelled by the year column names, so the i-th value has to be
	# fetched by position with .iloc; a plain y_i[i] relies on a deprecated
	# integer-position fallback of Series.__getitem__.
	# adjusted price = nominal price / CPI * 100
	y[i] = y[i] / y_i.iloc[i] * 100

fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Adjusted for US inflation"))
fig.update_layout(title="Linear prices", xaxis_title="Year", yaxis_title="Price")
fig.show()

In [None]:
# Simplified variant of the deflation example: the adjustment is applied to the
# whole series at once instead of looping over the years.

# Sample prices: one constant nominal value per year column from 2010 on.
x = df.columns[i_2010:]
y = [1000] * len(x)
print(f"prices: {y}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Linear"))

# United States values for the same year columns, printed rounded to 2 decimals.
us_rows = df.loc[df["Country Name"] == "United States"]
cpi = us_rows.iloc[0, i_2010:]
print(f"cpi: {[round(value, 2) for value in cpi]}")

# adjusted price = nominal price / CPI * 100, element-wise
# (same result as dividing by cpi / 100).
y = y / cpi * 100
print(f"prices adjusted for US inflation: {[round(price, 2) for price in y]}")

fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Adjusted for US inflation"))
fig.update_layout(
	title="Linear prices and linear prices adjusted for US inflation (2010 = 100)",
	xaxis_title="Year",
	yaxis_title="Price",
)
fig.show()

In [None]:
# Show the first rows of df again as a final check of the loaded table.
df.head()