In [None]:
import os
import pandas as pd
from datetime import datetime
from plotly import graph_objects as go
import plotly.express as px

In [None]:
# Input CSV read by load_cpi()
path_cpi = "data/scraped/world-bank/cpi.csv"
# Directory that receives the exported figures
path_output_root = "data/analysis/cpi"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(path_output_root, exist_ok=True)

In [None]:
def load_cpi(path=None) -> pd.DataFrame:
	"""Load the CPI table from CSV.

	Args:
		path: CSV file to read. Defaults to the notebook-level ``path_cpi``.

	Returns:
		The table with the "Series Name" and "Series Code" columns removed.
		Any column whose header contains "[" is renamed to the stripped text
		before the bracket (e.g. a header like "2010 [YR2010]" would become "2010").

	Raises:
		FileNotFoundError: If the CSV file does not exist.
	"""
	if path is None:
		path = path_cpi
	if not os.path.exists(path):
		raise FileNotFoundError(f"File not found: '{path}'. Try running cpi.ipynb first.")
	# skipfooter drops the last 5 lines of the file (presumably the export's footer - confirm
	# against the file) and is only supported by the python engine; ".." is read as NaN
	df = pd.read_csv(path, skipfooter=5, engine="python", na_values="..")
	# Keep only the text in front of the bracket, leave every other header untouched
	df = df.rename(columns=lambda col: col.split("[")[0].strip() if "[" in col else col)
	return df.drop(columns=["Series Name", "Series Code"])

# def transpose_cpi(df: pd.DataFrame) -> pd.DataFrame:
# 	df = df.drop(columns=["Country Name"])
# 	df = df.set_index("Country Code")
# 	df = df.T
# 	# reset index
# 	df = df.reset_index()
# 	# drop "Country Code" column
# 	# df = df.drop(columns=["Country Code"])
# 	# columns = list(df.columns)
# 	# index = list(df.index)
# 	# df["Date"] = index
# 	# df = df.set_index("Date")
# 	# df.index = pd.to_datetime(df.index)

# 	# # rename columns
# 	df = df.rename(columns={"index": "year"})
# 	# # set year as index
# 	df = df.set_index("year")
# 	# # convert index to datetime
# 	# df.index = pd.to_datetime(df.index)
# 	# reset index 
# 	# df = df.reset_index()
# 	# rename columns
# 	return df

def transpose_cpi(df: pd.DataFrame) -> pd.DataFrame:
	"""Pivot the table so rows are dates and columns are country codes.

	The "Country Name" column is discarded, "Country Code" becomes the column
	labels, and the remaining headers are parsed into a datetime index.
	"""
	by_code = df.drop(columns=["Country Name"]).set_index("Country Code")
	by_date = by_code.T
	# The former column headers are now the row labels; parse them as dates
	by_date.index = pd.to_datetime(by_date.index)
	return by_date

def get_ratio(df: pd.DataFrame) -> pd.DataFrame:
	"""Return a new frame with every value divided by 100."""
	return df.div(100)

# Load and pivot only; get_ratio is deliberately not applied at this stage
# (the plot that follows is titled "2010 = 100")
df_cpi = transpose_cpi(load_cpi())
df_cpi.tail()

In [None]:
# Plot the selected countries
countries = ["USA", "DEU", "GBR", "JPN"]
start_year = 2000 # first year shown (2010 is the alternative that was tried)
df_recent = df_cpi[df_cpi.index.year >= start_year]

fig = go.Figure()
for country in countries:
	fig.add_trace(go.Scatter(x=df_recent.index, y=df_recent[country], mode="lines+markers", name=country))

fig.update_layout(
	title="Consumer Price Index (2010 = 100)",
	xaxis_title="Year",
	yaxis_title="CPI",
	legend=dict(x=0, y=1.0),
	margin=dict(l=0, r=10, b=0, t=50),
	width=1200,
	height=400,
)
fig.show()

# Save a static copy next to the other analysis outputs
path_output = os.path.join(path_output_root, "cpi.png")
fig.write_image(path_output, scale=2)

In [None]:
# Same pipeline as before, now also scaled by get_ratio (values divided by 100)
df_cpi = get_ratio(transpose_cpi(load_cpi()))
df_cpi.tail()

In [None]:
def add_missing_years(df: pd.DataFrame, until_year=None) -> pd.DataFrame:
	"""Extend a date-indexed frame up to ``until_year`` by repeating its last row.

	For every year after the latest year in the index, up to and including
	``until_year``, a row dated January 1st of that year is appended that carries
	the values of the last (most recent) row.

	Args:
		df: Frame with a DatetimeIndex. It is not modified.
		until_year: Last year to add. Defaults to the current calendar year.

	Returns:
		A new frame sorted by index. An empty frame, or a frame that already
		reaches ``until_year``, is returned without added rows.
	"""
	if until_year is None:
		until_year = datetime.now().year
	# sort_index returns a new frame, so the caller's frame is never written to
	df = df.sort_index()
	if df.empty:
		return df
	last_year = int(df.index.year.max())
	missing_years = range(last_year + 1, until_year + 1)
	if not missing_years:
		return df
	# Repeat the last row once per missing year (keeps dtypes and column labels),
	# then relabel the copies with the new dates and append them in one step
	padding = df.iloc[[-1] * len(missing_years)].copy()
	padding.index = pd.DatetimeIndex([pd.Timestamp(year=year, month=1, day=1) for year in missing_years])
	return pd.concat([df, padding])
	

def fill_missing_dates(df: pd.DataFrame) -> pd.DataFrame:
	"""Resample to daily frequency and carry the last value forward.

	Every day inserted between existing rows takes the value of the closest
	earlier row, which gives a step-shaped ("jagged") series.
	"""
	df = df.asfreq("D")
	# NOTE: a more complete approach would be to exclude actual NaN (missing years) values
	#       for specific countries, however for our purposes, the result is the same
	# .ffill() replaces the deprecated fillna(method="ffill")
	return df.ffill()

def fill_missing_dates_interpolate(df: pd.DataFrame) -> pd.DataFrame:
	"""Resample to daily frequency and fill the gaps by linear interpolation."""
	daily = df.asfreq("D")
	# method="time" was the alternative considered; linear is what is used
	return daily.interpolate(method="linear")

# Full pipeline, forward-fill variant: load -> pivot -> ratio -> extend to the current year -> daily
df_cpi = (
	load_cpi()
	.pipe(transpose_cpi)
	.pipe(get_ratio)
	.pipe(add_missing_years)
	.pipe(fill_missing_dates)
)
df_cpi.tail()

In [None]:
# Plot the selected countries
countries = ["USA", "DEU", "GBR", "JPN"]
start_year = 2000 # first year shown (2010 is the alternative that was tried)
df_recent = df_cpi[df_cpi.index.year >= start_year]

fig = go.Figure()
for country in countries:
	fig.add_trace(go.Scatter(x=df_recent.index, y=df_recent[country], mode="lines", name=country))

fig.update_layout(
	title="Consumer Price Index Ratio (2010 = 1)",
	xaxis_title="Year",
	yaxis_title="CPI",
	legend=dict(x=0, y=1.0),
	margin=dict(l=0, r=10, b=0, t=50),
	width=1200,
	height=400,
)
fig.show()

# Save a static copy next to the other analysis outputs
path_output = os.path.join(path_output_root, "cpi-ratio-jagged.png")
fig.write_image(path_output, scale=2)

In [None]:
# Full pipeline, interpolated variant: load -> pivot -> ratio -> extend to the current year -> daily
df_cpi = (
	load_cpi()
	.pipe(transpose_cpi)
	.pipe(get_ratio)
	.pipe(add_missing_years)
	.pipe(fill_missing_dates_interpolate)
)

# Plot the selected countries
countries = ["USA", "DEU", "GBR", "JPN"]
start_year = 2000 # first year shown (2010 is the alternative that was tried)
df_recent = df_cpi[df_cpi.index.year >= start_year]

fig = go.Figure()
for country in countries:
	fig.add_trace(go.Scatter(x=df_recent.index, y=df_recent[country], mode="lines", name=country))

fig.update_layout(
	title="Consumer Price Index Ratio (2010 = 1)",
	xaxis_title="Year",
	yaxis_title="CPI",
	legend=dict(x=0, y=1.0),
	margin=dict(l=0, r=10, b=0, t=50),
	width=1200,
	height=400,
)
fig.show()

# Save a static copy next to the other analysis outputs
path_output = os.path.join(path_output_root, "cpi-ratio-smooth.png")
fig.write_image(path_output, scale=2)



In [None]:
def adjust_for_inflation(df: pd.DataFrame, df_cpi: pd.DataFrame, country: str) -> pd.DataFrame:
	"""Divide every column of ``df`` by the CPI ratio of ``country``, row by row.

	Args:
		df: Date-indexed prices. It is not modified.
		df_cpi: Date-indexed CPI ratios, one column per country code.
		country: Column of ``df_cpi`` to use.

	Returns:
		A new frame with exactly the index and columns of ``df``. Rows whose
		date has no CPI value are NaN.

	Raises:
		KeyError: If ``country`` is not a column of ``df_cpi``.
	"""
	# NOTE: df_cpi has already been divided by 100
	# Align the CPI onto the price dates first: dividing by the unaligned series
	# would outer-join the two indexes and add an all-NaN row for every CPI date
	# that is not present in df
	cpi = df_cpi[country].reindex(df.index)
	return df.div(cpi, axis=0)

def get_cpi_df_processed() -> pd.DataFrame:
	"""Run the whole CPI pipeline and return the daily, interpolated ratio frame.

	Steps: load_cpi -> transpose_cpi -> get_ratio -> add_missing_years ->
	fill_missing_dates_interpolate (fill_missing_dates is the forward-fill
	alternative, which is not used here).
	"""
	pipeline = (transpose_cpi, get_ratio, add_missing_years, fill_missing_dates_interpolate)
	result = load_cpi()
	for step in pipeline:
		result = step(result)
	return result

# Get the CPI dataframe first: the end point of the synthetic "Linear" series is derived
# from it, so it must not come from whatever df_cpi an earlier cell happened to leave behind
df_cpi = get_cpi_df_processed()
df_cpi = df_cpi[df_cpi.index.year >= 2019] # speeds up the process of adjusting for inflation

# Synthetic dataframe with prices
dates = pd.date_range(start="2019-01-01", end="2023-12-31", freq="D")
df_prices = pd.DataFrame(index=dates, data={"Constant": 1000, "Linear": float("nan")})
# "Linear" runs in a straight line from 1000 on the first day to an end value derived from
# the first CPI ratio of the window; only the two end points are set and the rest is interpolated.
# .loc on the frame is used instead of chained df[col].iloc[...] assignment, which does not
# write through to the frame under pandas Copy-on-Write
df_prices.loc[dates[0], "Linear"] = 1000
df_prices.loc[dates[-1], "Linear"] = 1000 + (1000 - 1000 / df_cpi["USA"].iloc[0])
df_prices = df_prices.interpolate(method="linear")
# Adjust the prices for inflation
df_prices_adjusted = adjust_for_inflation(df_prices, df_cpi, "USA")

# Plot prices df
fig = go.Figure()
# for each column, plot the raw series as a line and the adjusted series as markers in the same color
for i, col in enumerate(df_prices.columns):
	color = px.colors.qualitative.Plotly[i]
	fig.add_trace(go.Scatter(x=df_prices.index, y=df_prices[col], mode="lines", name=col, line=dict(color=color)))
	fig.add_trace(go.Scatter(x=df_prices_adjusted.index, y=df_prices_adjusted[col], mode="markers", name=f"{col} (adjusted)", line=dict(color=color)))

# add dashed black h-line at y=df_prices_adjusted.iloc[1, 0]
fig.add_shape(type="line", x0=df_prices_adjusted.index[0], y0=df_prices_adjusted.iloc[1, 0], x1=df_prices_adjusted.index[-1], y1=df_prices_adjusted.iloc[1, 0], line=dict(color="black", width=3, dash="dash"))

fig.update_layout(title="CPI adjustment demonstration", xaxis_title="Year", yaxis_title="Price")
fig.update_layout(legend=dict(x=0, y=1.0))
fig.update_layout(margin=dict(l=0, r=10, b=0, t=50))
fig.update_layout(width=1200, height=400)
fig.show()

path_output = os.path.join(path_output_root, "cpi-adjustment-demonstration.png")
fig.write_image(path_output, scale=2)
	

In [None]:
# DONE: export the functions into a utility module (cpi_adjust)

# Demo using the external utility

In [None]:
import os
import sys
import importlib

# Import the module
# The working directory goes on sys.path so the "src" package can be resolved;
# the membership check keeps re-runs of this cell from appending duplicates
cwd = os.getcwd()
if cwd not in sys.path:
	sys.path.append(cwd)
# import_module takes the dotted path as a string: "world-bank" contains a hyphen,
# so the package cannot be named in a regular import statement
cpi_adjust = importlib.import_module("src.py.scraping.world-bank.cpi_adjust")

# Initialize the module
# NOTE(review): presumably date_cutoff / jagged mirror the year filter and the
# forward-fill vs. interpolate choice made earlier in this notebook - confirm in cpi_adjust
cpi_adjust.initialize_cpi(date_cutoff="2019-01-01", jagged=False)

In [None]:
import pandas as pd
from plotly import graph_objects as go
import plotly.express as px

# Two constant synthetic price series on a daily index
dates = pd.date_range(start="2019-01-01", end="2023-12-31", freq="D")
df_prices = pd.DataFrame(index=dates, data={"Product1": 1000, "Product2": 1100})
# Inflation-adjust both columns through the external utility
df_prices_adjusted = cpi_adjust.adjust_for_inflation(df_prices, "USA", columns=["Product1", "Product2"])

# Raw and adjusted series of one product share a color
fig = go.Figure()
for color, col in zip(px.colors.qualitative.Plotly, df_prices.columns):
	line_style = dict(color=color)
	fig.add_trace(go.Scatter(x=df_prices.index, y=df_prices[col], mode="lines", name=col, line=line_style))
	fig.add_trace(go.Scatter(x=df_prices_adjusted.index, y=df_prices_adjusted[col], mode="lines", name=f"{col} (adjusted)", line=line_style))

fig.update_layout(title="Prices", xaxis_title="Year", yaxis_title="Price")
fig.show()