In [None]:
import os
import json
import requests
import pandas as pd
from plotly import graph_objs as go
import zipfile

In [None]:
# --- Remote source -----------------------------------------------------------
# Download endpoint for the SP.POP.TOTL indicator, and the User-Agent header
# value sent along with the request.
url_population = "https://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv"
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# --- Local cache layout ------------------------------------------------------
# Every file this notebook writes lives under one root directory: the
# downloaded archive, the CSV extracted from it, and the re-exported table.
path_output_root = "data/scraped/world-bank"
path_zip, path_csv, path_output_csv_fixed = (
	os.path.join(path_output_root, file_name)
	for file_name in ("population.zip", "population.csv", "population-fixed.csv")
)

# --- Behaviour ---------------------------------------------------------------
# When True, the download and extraction steps run even if their output files
# already exist on disk.
can_overwrite = False

In [None]:
# Step 1: make sure the cache directory exists before anything is written to it.
# `exist_ok=True` makes this a no-op when the directory is already there.
os.makedirs(path_output_root, exist_ok=True)

# Step 2: download the archive unless a cached copy exists (or overwriting is forced).
if not os.path.exists(path_zip) or can_overwrite:
	print("Downloading population data from World Bank...")
	headers = {"User-Agent": user_agent}
	# Without a timeout `requests` waits forever on a stalled connection.
	response = requests.get(url_population, headers=headers, timeout=60)
	# Fail *before* writing the cache file: otherwise an HTTP error body would be
	# stored as "population.zip" and reused as a cached download on later runs.
	response.raise_for_status()
	with open(path_zip, "wb") as f:
		f.write(response.content)
	print("Done.")
else:
	print("Using cached population data.")

# Step 3: extract the data CSV unless it was already extracted (or overwriting is forced).
if not os.path.exists(path_csv) or can_overwrite:
	print("Extracting population data...")
	with zipfile.ZipFile(path_zip) as z:
		namelist = z.namelist()
		print(f"Namelist: {namelist}")
		# The first archive member whose name does not contain "Metadata" is
		# taken as the data file; members with "Metadata" in the name are skipped.
		for name in namelist:
			if "Metadata" not in name:
				print(f"Extracting '{name}' to '{path_csv}'...")
				with open(path_csv, "wb") as f:
					f.write(z.read(name))
				break
		else:
			# No `break` happened: nothing was extracted. Stop here with a clear
			# message instead of failing later when the CSV is read.
			raise FileNotFoundError(
				f"No data file found in '{path_zip}' (members: {namelist})."
			)
	print("Done.")
else:
	print("Using already extracted population data.")


In [None]:
# Parse the extracted CSV, skipping the four lines that precede the header row,
# and discard the last column in the same expression.
# NOTE(review): the last column is presumably an empty artifact of the export
# format — confirm before relying on this positional drop.
df = pd.read_csv(path_csv, skiprows=4).iloc[:, :-1]
df.head()

In [None]:
# Summarise the loaded table: covered year range, latest world total and the
# number of distinct entries in "Country Name".
# The first four columns are descriptive; every column after them is a year.
year_columns = df.columns[4:]
first_year = year_columns[0]

# Select the "World" row in a single `.loc` call (no chained indexing) and fail
# with a clear message if it is missing.
world_rows = df.loc[df["Country Name"] == "World", year_columns]
if world_rows.empty:
	raise ValueError("No 'World' row found in the population data.")

# Use the most recent year that actually has a world total. When the newest
# year column is populated this is simply the last column; when it is still
# empty, `round()` on the NaN value would raise ValueError, so fall back to
# the latest year with data.
world_by_year = world_rows.iloc[0].dropna()
if world_by_year.empty:
	raise ValueError("The 'World' row contains no population values.")
last_year = world_by_year.index[-1]
world_population = world_by_year.iloc[-1]

# NOTE(review): this counts every distinct "Country Name". The table contains
# at least one aggregate row ("World"), so the figure is not a pure country count.
number_of_countries = df["Country Name"].nunique(dropna=False)

print(f"First year: {first_year}")
print(f"Last year: {last_year}")
print(f"World population in {last_year}: {round(world_population)}")
print(f"Number of countries: {number_of_countries}")

In [None]:
def get_population(df: pd.DataFrame, country_name: str) -> pd.DataFrame:
	"""Return the yearly population series of a single country.

	Args:
		df: Wide table with one row per country and a "Country Name" column.
			The first four columns are treated as descriptive and skipped;
			every remaining column label must be a four-digit year.
		country_name: Value to look up in the "Country Name" column.

	Returns:
		A DataFrame with a single numeric "Population" column, indexed by a
		DatetimeIndex built from the year column labels.

	Raises:
		ValueError: If ``country_name`` matches no row or more than one row,
			or if a year label or population value cannot be parsed.
	"""
	matches = df.loc[df["Country Name"] == country_name]
	# Exactly one row is required. Previously zero or several matches only
	# surfaced as an opaque "Length mismatch" error at the column assignment.
	if len(matches) != 1:
		raise ValueError(
			f"Expected exactly one row for country '{country_name}', found {len(matches)}."
		)

	# Skip the four descriptive columns and convert explicitly: a row taken from
	# a mixed-type table is `object` dtype, which is not a numeric column.
	population = pd.to_numeric(matches.iloc[0, 4:])
	df_country = population.to_frame(name="Population")

	# Parse the labels strictly as years instead of relying on format inference.
	df_country.index = pd.to_datetime(df_country.index.astype(str), format="%Y")
	return df_country

# # Get population of Slovenia
# df_slovenia = df[df["Country Name"] == "Slovenia"]
# df_slovenia = df_slovenia.T
# df_slovenia = df_slovenia.iloc[4:]
# df_slovenia.columns = ["Population"]
# # df_slovenia.index = pd.to_datetime(df_slovenia.index)
# df_slovenia.head()

# Try the helper on a single country and preview the first rows of the result.
df_slovenia = get_population(df=df, country_name="Slovenia")
df_slovenia.head()

In [None]:
# Compare the total population of Slovenia and Croatia over time on one chart.
df_slovenia = get_population(df, "Slovenia")
df_croatia = get_population(df, "Croatia")

# One line trace per country, in the same order as the legend entries.
fig = go.Figure(
	data=[
		go.Scatter(x=frame.index, y=frame["Population"], name=label)
		for frame, label in (
			(df_slovenia, "Population of Slovenia"),
			(df_croatia, "Population of Croatia"),
		)
	]
)
fig.update_layout(
	title="Total population",
	xaxis_title="Year",
	yaxis_title="Population",
	showlegend=True,
)
fig.show()

In [None]:
# Write the loaded table back to disk as a plain CSV, without the row index.
df.to_csv(path_or_buf=path_output_csv_fixed, index=False)