In [None]:
# World Economic Outlook Database
import os
import requests
import json
import zipfile
from io import BytesIO
import pandas as pd
import numpy as np
from typing import Optional

In [None]:
# Landing page of the release this notebook downloads:
# https://www.imf.org/en/Publications/WEO/weo-database/2023/October/download-entire-database
# Release identifiers; both are interpolated into the download URLs below.
year = 2023
month = "Oct"

# All release files share this base; the trailing comments give the approximate size and the real file type
# hidden behind each ".ashx" endpoint (as noted by the author).
url_base = f"https://www.imf.org/-/media/Files/Publications/WEO/WEO-Database/{year}"
url_by_countries = f"{url_base}/WEO{month}{year}all.ashx" # 9 MB ; .xls
url_by_country_groups = f"{url_base}/WEO{month}{year}alla.ashx" # 577 KB ; .xls
url_sdmx_data = f"{url_base}/WEO{month}{year}-SDMXData.ashx" # 24 MB ; .zip
# note the lower-case "weo" prefix and lower-cased month in this one
url_sdmx_data_structure_definition = f"{url_base}/weo{month.lower()}{year}-sdmx-dsd.ashx" # 1 MB ; .xml
# Custom-query report URL. It is a fixed string (not derived from `year`/`month`) covering sy=1980 to ey=2028.
# NOTE(review): presumably `c` lists the country-group codes and `s` the subject codes, and the `wsid` token is
# session-specific — confirm and regenerate the URL when switching to another release.
url_custom_query_country_groups_all = "https://www.imf.org/imf/weodatabase/downloadreport?a=1&c=001,110,163,119,123,998,510,200,505,903,205,400,603,&s=NGDP_RPCH,NGDP_RPCHMK,NGDPD,PPPGDP,NGDP_D,NGDPRPPPPC,PPPPC,NGAP_NPGDP,PPPSH,NID_NGDP,NGSD_NGDP,PCPIPCH,PCPIEPCH,TRADEPCH,TM_RPCH,TMG_RPCH,TX_RPCH,TXG_RPCH,TTPCH,TTTPCH,TXGM_D,TXGM_DPCH,LUR,LE,GGR_NGDP,GGX_NGDP,GGXCNL_NGDP,GGSB_NPGDP,GGXONLB_NGDP,GGXWDN_NGDP,GGXWDG_NGDP,BCA,BCA_NGDPD,BM,BX,BF,BFD,BFP,BFF,BFO,BFRA,D,D_NGDPD,D_BX,DS,DS_NGDPD,DS_BX,DSI,DSI_NGDPD,DSI_BX,DSP,DSP_NGDPD,DSP_BX,PALLFNFW,PNFUELW,PINDUW,POILAPSP,POILBRE,POILDUB,POILWTI,PNRGW,POILAPSPW,PNGASW,PNGASEU,PNGASJP,PNGASUS,PCOALW,PCOALAU,PCOALSA,PFANDBW,PFOODW,PCEREW,PWHEAMT,PMAIZMT,PRICENPQ,PBARL,PVOILW,PSOYB,PSMEA,PSOIL,PROIL,PPOIL,PSUNO,POLVOIL,PFISH,PGNUTS,PMEATW,PBEEF,PLAMB,PPORK,PPOULT,PSEAFW,PSALM,PSHRI,PSUGAW,PSUGAISA,PSUGAUSA,PBANSOP,PORANG,PBEVEW,PCOFFW,PCOFFOTM,PCOFFROB,PCOCO,PTEA,PRAWMW,PTIMBW,PHARDW,PLOGSK,PSAWMAL,PSOFTW,PLOGORE,PSAWORE,PCOTTIND,PWOOLW,PWOOLF,PWOOLC,PRUBB,PHIDE,PMETAW,PCOPP,PALUM,PIORECR,PTIN,PNICK,PZINC,PLEAD,PURAN,&sy=1980&ey=2028&ssm=1&scsm=1&scc=1&ssd=1&ssc=1&sic=0&sort=country&ds=.&br=1&wsid=69dccfdb-9723-44a6-99d3-6a0c1eac4300" # 1 MB ; .xls

# Output locations, relative to the working directory of the notebook.
path_output_root = "data/scraped/imf"
# every WEO download and extracted file goes into this sub-directory
path_output_weo = os.path.join(path_output_root, "weo")


In [None]:
def download_file_ashx(url: str, destination_directory: str, filename: Optional[str] = None) -> str:
	'''
		Download a file served from an ".ashx" endpoint to a destination directory and return the filepath.

		The real file type is taken from the Content-Type of a HEAD request. If the local
		filename ends in ".ashx", that extension is swapped for the matching one
		(.xls, .xlsx, .zip or .xml). If the resulting file already exists, nothing is downloaded.

		Parameters:
			url: address of the file to download
			destination_directory: directory to store the file in (created if missing)
			filename: local filename; defaults to the last path segment of the url
				(without query string or fragment)

		Returns:
			path of the local file (freshly downloaded or already present)

		Raises:
			Exception: if the content type is not supported or the server does not answer with status 200
	'''
	timeout_seconds = 60 # never hang forever on a stalled connection
	extensions_by_content_type = (
		("application/vnd.ms-excel", ".xls"),
		("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ".xlsx"),
		("application/zip", ".zip"),
		("application/x-zip-compressed", ".zip"),
		("application/xml", ".xml"),
		("text/xml", ".xml"),
	)

	os.makedirs(destination_directory, exist_ok=True)
	if filename is None:
		# keep "?query" and "#fragment" out of the local filename
		filename = url.split("#")[0].split("?")[0].rstrip("/").split("/")[-1]
	filepath = os.path.join(destination_directory, filename)

	# requests.head() does not follow redirects unless asked to; without this the
	# content type of the redirect response would be inspected instead of the file's
	head = requests.head(url, allow_redirects=True, timeout=timeout_seconds)
	content_type = head.headers.get("content-type", "")
	content_type_lower = content_type.lower()

	extension = next(
		(candidate for marker, candidate in extensions_by_content_type if marker in content_type_lower),
		None,
	)
	if extension is None:
		raise Exception(f"Unsupported content type: {content_type}")

	# swap only a trailing ".ashx"; a ".ashx" elsewhere in the path (e.g. in a directory name) is left alone
	root, current_extension = os.path.splitext(filepath)
	if current_extension.lower() == ".ashx":
		filepath = root + extension

	if os.path.exists(filepath):
		print(f"File already exists: '{filepath}'")
		return filepath

	response = requests.get(url, timeout=timeout_seconds)
	if response.status_code != 200:
		raise Exception(f"Failed to download file: {url}")

	# write under a temporary name and rename afterwards, so an interrupted write
	# is never mistaken for a finished download by the existence check above
	filepath_partial = filepath + ".part"
	with open(filepath_partial, "wb") as f:
		f.write(response.content)
	os.replace(filepath_partial, filepath)
	print(f"Downloaded file: '{filepath}'")
	return filepath

def extract_file(filepath: str, destination: str) -> list[str]:
	'''
		Unpack a .zip archive, member by member, into a destination directory.

		The destination directory is created if needed. A member that cannot be
		extracted is reported and counted, but does not stop the remaining members.

		Returns the names of the members that were extracted successfully.
		Raises an Exception if filepath does not end in ".zip".
	'''
	os.makedirs(destination, exist_ok=True)

	is_zip_archive = filepath.endswith(".zip")
	if not is_zip_archive:
		raise Exception(f"Unsupported file type: {filepath}")

	extracted_names = []
	failures = 0
	with zipfile.ZipFile(filepath, "r") as archive:
		for member in archive.namelist():
			try:
				archive.extract(member, destination)
			except Exception as error:
				# best effort: report the member and carry on with the next one
				print(f"Failed to extract file '{member}' from '{filepath}' to '{destination}': {error}")
				failures += 1
			else:
				print(f"Extracted file '{member}' from '{filepath}' to '{destination}'")
				extracted_names.append(member)
		print(f"Extracted {len(extracted_names)} files. Failed to extract {failures} files.")

	return extracted_names

def fix_xls(filepath: str) -> None:
	'''
		Fix an ".xls" export that is really UTF-16 text by rewriting it in place without ordinal 0 characters.

		The file is handled as bytes, because reading it in text mode
			- decodes it with the platform default encoding, which fails on (or corrupts) non-ASCII characters, and
			- applies universal newlines, which turns the UTF-16 sequence "\\r\\0\\n\\0" into two line breaks.

		Content that starts with a UTF-16 byte order mark, or that looks like UTF-16-LE (even length, and at
		least half of the high bytes are zero), is decoded and written back as UTF-8. Anything else just
		has its NUL bytes removed. A file without NUL bytes and without a UTF-16 byte order mark is left untouched,
		so calling this function again on a fixed file is a no-op.
	'''
	with open(filepath, "rb") as f:
		raw = f.read()

	has_utf16_bom = raw.startswith((b"\xff\xfe", b"\xfe\xff"))
	if not has_utf16_bom and b"\x00" not in raw:
		return # nothing to fix

	codec = None
	if has_utf16_bom:
		codec = "utf-16" # picks the byte order from the mark and drops the mark
	elif len(raw) % 2 == 0 and raw[1::2].count(b"\x00") * 2 >= len(raw) // 2:
		codec = "utf-16-le"

	fixed = None
	if codec is not None:
		try:
			fixed = raw.decode(codec).replace("\x00", "").encode("utf-8")
		except UnicodeDecodeError:
			fixed = None # not valid UTF-16 after all; fall back to plain NUL removal
	if fixed is None:
		fixed = raw.replace(b"\x00", b"")

	with open(filepath, "wb") as f:
		f.write(fixed)


In [None]:
# Fetch every release file into path_output_weo (files that are already present are not downloaded again)
filepath_by_countries = download_file_ashx(url=url_by_countries, destination_directory=path_output_weo)
filepath_by_country_groups = download_file_ashx(url=url_by_country_groups, destination_directory=path_output_weo)
filepath_sdmx_data = download_file_ashx(url=url_sdmx_data, destination_directory=path_output_weo)
filepath_sdmx_data_structure_definition = download_file_ashx(url=url_sdmx_data_structure_definition, destination_directory=path_output_weo)
# the custom query url has no usable last path segment, so the local filename is given explicitly
filepath_custom_query_country_groups_all = download_file_ashx(
	url=url_custom_query_country_groups_all,
	destination_directory=path_output_weo,
	filename="weo-custom-query-country-groups-all.xls",
)

In [None]:
# Unpack the SDMX archive next to the other downloads; unzipped_files holds the extracted member names
unzipped_files = extract_file(filepath=filepath_sdmx_data, destination=path_output_weo)

In [None]:
# Load the path with pandas (it's an .xls file)
# df = pd.read_excel(filepath_custom_query_country_groups_all, sheet_name="WEO October 2023 By Country Groups (custom query)", header=0, engine="xlrd")
# gives error: XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'W\x00E\x00O\x00 \x00'
# Try loading as csv with tab delimiter
# The ".csv" file is a byte-for-byte copy of the download; create it if it is missing so that
# this cell also works on a fresh run (nothing else in the notebook creates it).
filepath_custom_query_xls = os.path.splitext(filepath_custom_query_country_groups_all)[0] + ".xls"
filepath_custom_query_country_groups_all = filepath_custom_query_country_groups_all.replace(".xls", ".csv")
if not os.path.exists(filepath_custom_query_country_groups_all) and os.path.exists(filepath_custom_query_xls):
	with open(filepath_custom_query_xls, "rb") as f_source, open(filepath_custom_query_country_groups_all, "wb") as f_copy:
		f_copy.write(f_source.read())
df = pd.read_csv(filepath_custom_query_country_groups_all, sep="\t", header=0, engine="python")
# keep every other row (positions 0, 2, 4, ...) and drop the rows in between
df = df.iloc[::2]
# reindex
df = df.reset_index(drop=True)
# all text in all fields apparently has a NUL character after every character
# (see the bytes quoted in the XLRDError above), which suggests UTF-16 encoded text
# print header
print(df.head())

In [None]:
# Inspect the column labels that were parsed from the header row
print(df.columns)

In [None]:
# Plain-text preview of the first rows of df
print(df.head())

In [None]:
# point at the ".csv" copy (this is a no-op once the path no longer contains ".xls")
filepath_custom_query_country_groups_all = filepath_custom_query_country_groups_all.replace(".xls", ".csv")
# Read the whole file as text and remove all ordinal 0 characters.
# (Printing ord(c) for the first ~100 characters is a quick way to see the NULs between the letters.)
text = ""
with open(filepath_custom_query_country_groups_all, "r") as file_handle:
	text = file_handle.read().replace("\x00", "")
# preview of the cleaned text
print(text[:1000])

In [None]:
# load the text as a csv (dataframe with tab delimiter)
df = pd.read_csv(BytesIO(text.encode()), sep="\t", header=0)
# iterate over columns and change all of the columns with years to floats
for column in df.columns:
	if not column.isnumeric():
		continue
	if pd.api.types.is_numeric_dtype(df[column]):
		# no value in this column had a thousands separator, so pandas already parsed it as numbers;
		# the .str accessor would raise AttributeError on such a column
		df[column] = df[column].astype(float)
	else:
		# remove the thousands separators ("1,234.5" -> "1234.5") before converting
		df[column] = df[column].str.replace(",", "", regex=False).astype(float)
df.head()

In [None]:
# clean the file on disk, then load it from there
fix_xls(filepath_custom_query_country_groups_all)
# thousands="," lets pandas parse values such as "1,234.5" as numbers, so the year columns
# are numeric here as well (without it they are re-read as strings with commas)
df = pd.read_csv(filepath_custom_query_country_groups_all, sep="\t", header=0, thousands=",")
df.head()

In [None]:
# Column labels of the current df
print(df.columns)

In [None]:
# Plain-text preview of the first rows of the current df
print(df.head())

In [None]:
def get_dict_from_df(df: pd.DataFrame) -> dict:
	'''
		Convert a WEO "by country groups" dataframe to a nested dictionary.

		Expected columns: "WEO Country Group Code", "Country Group Name", "WEO Subject Code",
		"Subject Descriptor", "Subject Notes", "Units", "Scale" and one column per year
		(every column whose label consists of digits only is treated as a year column).

		Returns a dictionary with the keys
			"subjects": subject code -> {"code", "descriptor", "notes", "units", "scale"}
			"country_groups": country group code -> {"code", "name", "subjects": subject code -> list of values, one per year}
			"years": the year column labels, in dataframe order
			"year_start" / "year_end": first / last year column as int (None if there are no year columns)

		Missing values (NaN) are stored as None. Rows whose country group code is missing or is not
		3 characters long (e.g. the footer line with the database name) are skipped.
	'''
	def none_if_missing(value):
		# NaN -> None; everything else unchanged
		return None if pd.isna(value) else value

	# str() so that non-string column labels do not break the year detection
	year_columns = [column for column in df.columns if str(column).isnumeric()]
	data = {
		"subjects": {}, # translation table from subject code to subject descriptor, subject notes, units, and scale
		"country_groups": {}, # translation table from country group code to country group name, and data over time for each subject
		"years": list(year_columns),
		"year_start": int(year_columns[0]) if year_columns else None,
		"year_end": int(year_columns[-1]) if year_columns else None,
	}

	for _, row in df.iterrows():
		country_group_code = row["WEO Country Group Code"]
		# check for a missing code first: len() of NaN would raise a TypeError.
		# The length check skips the row with "International Monetary Fund, World Economic Outlook Database, October 2023"
		if pd.isna(country_group_code) or len(country_group_code) != 3:
			continue

		subject_code = none_if_missing(row["WEO Subject Code"])
		timeseries = [none_if_missing(value) for value in row[year_columns].values]

		country_group = data["country_groups"].setdefault(country_group_code, {})
		country_group["code"] = country_group_code
		country_group["name"] = none_if_missing(row["Country Group Name"])
		country_group.setdefault("subjects", {})[subject_code] = timeseries

		# the subject metadata is taken from the first row in which the subject appears
		if subject_code not in data["subjects"]:
			data["subjects"][subject_code] = {
				"code": subject_code,
				"descriptor": none_if_missing(row["Subject Descriptor"]),
				"notes": none_if_missing(row["Subject Notes"]),
				"units": none_if_missing(row["Units"]),
				"scale": none_if_missing(row["Scale"]),
			}
	return data

# Build the dictionary, mark the boundary year, and store everything as JSON next to the csv
data = get_dict_from_df(df)
data.update({"current_year": "2023"}) # we have to choose where the data ends and predictions start
filepath_custom_query_country_groups_all_json = filepath_custom_query_country_groups_all.replace(".csv", ".json")
with open(filepath_custom_query_country_groups_all_json, "w") as json_file:
	# 1.1 MB with indent=2, 500 KB without indent... doesn't matter much tbh
	json.dump(obj=data, fp=json_file, indent=2)

In [None]:
def print_subjects(data: dict):
	'''
		Print all subjects: a header with the number of subjects, then for each subject
		its code and descriptor followed by its units, scale and notes.
	'''
	subjects = data["subjects"]
	print(f"Subjects ({len(subjects)}):")
	for code, details in subjects.items():
		print(f"{code}: '{details['descriptor']}'")
		# indented detail lines, always in this order
		for label, key in (("Units", "units"), ("Scale", "scale"), ("Notes", "notes")):
			print(f"    {label}: '{details[key]}'")
# list every subject with its units, scale and notes
print_subjects(data=data)

In [None]:
def print_country_groups(data: dict):
	'''
		Print all country groups in full: a header with the number of groups, then for each group
		its code and name, the number of its subjects, and the timeseries of every subject.
	'''
	groups = data["country_groups"]
	print(f"Country groups ({len(groups)}):")
	for code, group in groups.items():
		print(f"{code}: '{group['name']}'")
		group_subjects = group["subjects"]
		print(f"    Subjects ({len(group_subjects)}):")
		for subject_code in group_subjects:
			print(f"        {subject_code}: {group_subjects[subject_code]}")

# print_country_groups(data)

In [None]:
def print_country_groups_short(data: dict):
	'''
		Print an overview of the country groups: a header with the number of groups,
		then one line per group with its code and name.
	'''
	groups = data["country_groups"]
	print(f"Country groups ({len(groups)}):")
	for code in groups:
		print(f"{code}: '{groups[code]['name']}'")

# one line per country group: code and name
print_country_groups_short(data=data)

In [None]:
def find_subjects_by_descriptor(data: dict, query: str) -> list[str]:
	'''
		Find subjects whose descriptor contains the query (case-insensitive).

		Parameters:
			data: dictionary with a "subjects" table (subject code -> dict with a "descriptor" entry)
			query: text to look for

		Returns:
			the codes of all matching subjects, in the order of the "subjects" table.
			Subjects without a descriptor (None) only match an empty query.
	'''
	needle = query.lower()
	return [
		subject_code
		for subject_code, subject_data in data["subjects"].items()
		# a descriptor can be None (missing value), which has no .lower()
		if needle in (subject_data["descriptor"] or "").lower()
	]

# Look up every subject that mentions inflation and list code + descriptor
matches = find_subjects_by_descriptor(data=data, query="inflation")
print(f"Found {len(matches)} matches:")
for match in matches:
	descriptor = data['subjects'][match]['descriptor']
	print(f"    {match}: '{descriptor}'")

In [None]:
# Print inflation values (code "PCPIPCH") for World (code "001") for the n_years years before "current_year"
# (the slice ends just before "current_year", so that year itself is not included)
n_years = 23
index_current_year = data["years"].index(data["current_year"])
# clamp at 0: a negative start index would make the slices count from the end of the list
index_current_minus_n = max(0, index_current_year - n_years)
values_inflation = data["country_groups"]["001"]["subjects"]["PCPIPCH"][index_current_minus_n:index_current_year]
years_list = data["years"][index_current_minus_n:index_current_year]
# units can be None when the source had no value for it
unit_inflation = (data["subjects"]["PCPIPCH"]["units"] or "").lower()
print(f"World inflation values for the last {len(years_list)} years:")
# the loop variable is deliberately not called `year`: that name holds the release year in the configuration cell
for year_label, value in zip(years_list, values_inflation):
	print(f"    {year_label}: {value} {unit_inflation}")

In [None]:
# TODO: create plots for analysis - for selected subject plot all country groups over time