In [None]:
import os
import pandas as pd
from datetime import datetime
import pycountry
from plotly import graph_objects as go


In [None]:
# Input/output file locations, relative to the current working directory.
# Each path is assembled from its components so os.path.join can apply the
# platform's separator (joining a single pre-built string is a no-op).
path_who_covid_csv = os.path.join("data", "scraped", "who", "who_covid_daily.csv")
path_wb_population_csv = os.path.join("data", "scraped", "world-bank", "population-fixed.csv")
path_output_merged_csv = os.path.join("data", "analysis", "covid", "who_covid_daily_merged.csv")

In [None]:
# Load Covid data.
# keep_default_na=False stops pandas from parsing the literal text "NA" (Namibia's ISO2 code)
# as a missing value, which would make the later dropna step silently remove that country.
# Only empty fields are treated as missing.
df_covid = pd.read_csv(path_who_covid_csv, keep_default_na=False, na_values=[""])

# Set Date_reported as datetime and index
df_covid["Date_reported"] = pd.to_datetime(df_covid["Date_reported"])
df_covid = df_covid.set_index("Date_reported")

# Rename Country_code to ISO2
df_covid = df_covid.rename(columns={"Country_code": "ISO2"})

# Convert New_cases to int (the other count columns keep the dtype chosen by read_csv)
df_covid["New_cases"] = df_covid["New_cases"].astype(int)

df_covid.head()

In [None]:
# Check for missing values: number of NaN entries in each column of df_covid
df_covid.isna().sum()

In [None]:
# Drop every row that has at least one missing value and report how many rows went away
num_rows_before = len(df_covid)
df_covid = df_covid.dropna()
num_rows_after = len(df_covid)
print(f"Removed {num_rows_before - num_rows_after} rows with missing values ({num_rows_after} rows remaining)")

In [None]:
# Build an ISO2 -> ISO3 lookup (via pycountry) for every country code present in df_covid
iso2_codes = list(df_covid["ISO2"].unique())
print(f"Number of unique countries: {len(iso2_codes)}")
print(f"First 3 unique countries: {iso2_codes[:3]}")
print("")

# Every ISO2 code starts out mapped to None; codes whose lookup fails keep that None
countries_2_3_map = {iso2: None for iso2 in iso2_codes}
country_conversion_fails = {}  # ISO2 code -> text of the exception raised by the lookup
for iso2 in iso2_codes:
	try:
		countries_2_3_map[iso2] = pycountry.countries.get(alpha_2=iso2).alpha_3
	except Exception as e:
		country_conversion_fails[iso2] = str(e)

# Report the codes that could not be converted and how many could
num_failed = len(country_conversion_fails)
print(f"{num_failed} countries could not be converted to ISO3:")
for failed_iso2 in country_conversion_fails:
	print(f"- '{failed_iso2}'")
print(f"Number of countries that could be converted: {len(countries_2_3_map) - num_failed}")

In [None]:
# Inspect the rows reported under ISO2 code "XK"
# NOTE(review): presumably Kosovo, a user-assigned code rather than an official ISO 3166-1 entry — confirm
df_xk = df_covid[df_covid["ISO2"] == "XK"]
df_xk

In [None]:
# Remove all rows with ISO2 that could not be converted to ISO3
num_rows_before = df_covid.shape[0]
for i, iso2 in enumerate(country_conversion_fails.keys()):
	print(f"{i+1}/{len(country_conversion_fails)} Removing rows with ISO2 '{iso2}'", end="\r")
	df_covid = df_covid[df_covid["ISO2"] != iso2]
num_rows_after = df_covid.shape[0]
print("")
print(f"Removed {num_rows_before - num_rows_after} rows with ISO2 that could not be converted to ISO3 ({num_rows_after} rows remaining)")


In [None]:
# Add ISO3 column and move it to the front
if "ISO3" not in df_covid.columns: # ensure idempotence
	df_covid.insert(0, "ISO3", None)
	df_covid["ISO3"] = df_covid["ISO2"].map(countries_2_3_map)
df_covid.head()

In [None]:
# Print WHO_region unique values
who_regions = list(df_covid["WHO_region"].unique())
who_regions_countries = {}
for region in who_regions:
	who_regions_countries[region] = list(df_covid[df_covid["WHO_region"] == region]["ISO3"].unique())
# print(f"WHO_region unique values: {who_regions}")
print(f"WHO regions ({len(who_regions)}):")
for region in who_regions:
	print(f"- {region} ({len(who_regions_countries[region])} countries)")

In [None]:
# Get number of dates for each country
countries_rows = {}
for country in df_covid["ISO3"].unique():
	countries_rows[country] = df_covid[df_covid["ISO3"] == country].shape[0]
countries_rows = dict(sorted(countries_rows.items(), key=lambda item: item[1], reverse=True))

# Group into a dictionary by number of rows
countries_rows_groups = {}
for country, rows in countries_rows.items():
	if rows not in countries_rows_groups:
		countries_rows_groups[rows] = []
	countries_rows_groups[rows].append(country)

# Print number of countries for each number of rows
print(f"Number of countries for each number of rows ({len(countries_rows_groups)}):")
for rows, countries in countries_rows_groups.items():
	print(f"- {rows} rows ({len(countries)} countries)")

In [None]:
# NOTE: We could remove countries with fewer than n rows; however, all countries appear to have the same number of rows (210, each date representing one week)

In [None]:
# Load population data
df_population = pd.read_csv(path_wb_population_csv)

# Rename Country Code to ISO3
df_population.rename(columns={"Country Code": "ISO3"}, inplace=True)

df_population.head()

In [None]:
# Get 5-th row
first_year_column_name = df_population.columns[4]
print(f"First year column name: {first_year_column_name}")
# Remove all columns between 1960 and 2018 (inclusive)
for i in range(1960, 2019):
	if str(i) in df_population.columns:
		df_population.drop(str(i), axis=1, inplace=True)
first_year_column_name = df_population.columns[4]
print(f"First year column name (after removing unneeded years): {first_year_column_name}")
df_population.head()

In [None]:
# Convert every column from first_year_column_name through the last column to integers.
# The columns are walked by label: range(<first year>, <number of columns>) starts at a
# year and stops at a column count, so it is empty and converts nothing.
first_year_position = df_population.columns.get_loc(first_year_column_name)
for year_column in df_population.columns[first_year_position:]:
	# Nullable Int64 keeps missing values as <NA> instead of raising (rows with missing
	# values are only dropped in a later cell); round() avoids a failed cast on
	# values that are not exactly integral.
	df_population[year_column] = df_population[year_column].round().astype("Int64")

# Print types
df_population.dtypes

In [None]:
# Preview the first rows of df_population
df_population.head()

In [None]:
# Report the number of rows in df_population (the message labels them countries and country groups)
print(f"Number of countries and country groups: {df_population.shape[0]}")

In [None]:
# Check for missing values: number of NaN entries in each column of df_population
df_population.isna().sum()

In [None]:
# Drop every population row that has at least one missing value and report the difference
num_rows_before = len(df_population)
df_population = df_population.dropna()
num_rows_after = len(df_population)
print(f"Removed {num_rows_before - num_rows_after} rows with missing values ({num_rows_after} rows remaining)")

In [None]:
# Keep only the population rows whose ISO3 code appears among the values of countries_2_3_map
num_rows_before = len(df_population)
iso3_is_mapped = df_population["ISO3"].isin(countries_2_3_map.values())
df_population = df_population.loc[iso3_is_mapped]
num_rows_after = len(df_population)
print(f"Removed {num_rows_before - num_rows_after} rows from df_population with ISO3 that are not in countries_2_3_map ({num_rows_after} rows remaining)")

In [None]:
# Unique ISO3 codes left in df_population, stored as dict keys (key: ISO3, value: None)
df_population_unique_dict = dict.fromkeys(df_population["ISO3"].unique())
print(f"Number of unique countries in df_population: {len(df_population_unique_dict)}")

In [None]:
# Keep only the covid rows whose ISO3 code is a key of df_population_unique_dict
num_rows_before = len(df_covid)
has_population = df_covid["ISO3"].isin(df_population_unique_dict.keys())
df_covid = df_covid.loc[has_population]
num_rows_after = len(df_covid)
print(f"Removed {num_rows_before - num_rows_after} rows from df_covid with ISO3 that are not in df_population_unique_dict ({num_rows_after} rows remaining)")

In [None]:
# Report how many distinct ISO3 codes remain in df_covid after filtering against df_population
print(f"Number of unique (matching) countries in both df_covid and df_population: {len(df_covid['ISO3'].unique())}")

In [None]:
# Sanity check: every ISO3 code used in df_covid must also exist in df_population
population_iso3_codes = df_population["ISO3"].unique()
unmatched_countries = [
	covid_iso3 for covid_iso3 in df_covid["ISO3"].unique()
	if covid_iso3 not in population_iso3_codes
]
for unmatched in unmatched_countries:
	print(f"Country '{unmatched}' in df_covid but not in df_population")
mismatches = len(unmatched_countries)
if not mismatches:
	print("All countries in df_covid are in df_population and can be matched")

In [None]:
# Recount the distinct ISO3 codes per WHO region on the current df_covid.
# who_regions is the region list computed in an earlier cell.
who_regions_countries = {
	region: list(df_covid.loc[df_covid["WHO_region"] == region, "ISO3"].unique())
	for region in who_regions
}

print(f"WHO regions ({len(who_regions)}):")
for region_name in who_regions:
	region_countries = who_regions_countries[region_name]
	print(f"- {region_name} ({len(region_countries)} countries)")

In [None]:
# NOTE: Population data from World Bank represents mid-year estimates

In [None]:
# year_columns: labels of df_population from the first year column through the last column
index_of_first_year_column = df_population.columns.get_loc(first_year_column_name)
year_columns = list(df_population.columns[index_of_first_year_column:])
# years: every calendar year (as text) from the first year column up to the latest date in df_covid
year_max_covid = df_covid.index.max().year
years = [str(calendar_year) for calendar_year in range(int(first_year_column_name), year_max_covid + 1)]
print(f"Year columns ({len(year_columns)}): {year_columns}")
print(f"Years ({len(years)}): {years}")

In [None]:
# Attach a Population column to df_covid: every row receives the population of its country
# for the calendar year of its Date_reported index.
# Nothing happens when the column already exists, so the cell can be re-run.
if "Population" not in df_covid.columns:
	df_covid = df_covid.copy()
	covid_years = df_covid.index.year.astype(str)  # year of each row as text, comparable to column labels
	population_iso3_codes = df_population["ISO3"].unique()
	num_countries = len(population_iso3_codes)
	for i, iso3 in enumerate(population_iso3_codes):
		print(f"{i+1}/{num_countries} Adding population for {iso3}", end="\r")
		country_row = df_population[df_population["ISO3"] == iso3]
		is_country = df_covid["ISO3"] == iso3
		for year in years:
			# years without their own population column fall back to the last available column
			year_query = year if year in year_columns else year_columns[-1]
			country_population = country_row[year_query].values[0]
			# write the value into all rows of this country that belong to this year
			df_covid.loc[is_country & (covid_years == year), "Population"] = country_population
	print("")
df_covid.tail()

In [None]:
# Print missing values: number of NaN entries in each column of df_covid (now including Population)
df_covid.isna().sum()

In [None]:
# Spot check: list the distinct Population values assigned to Slovenia (SVN) for each year
print("Population for Slovenia by years:")
df_slo = df_covid[df_covid["ISO3"] == "SVN"]
slo_row_years = df_slo.index.year.astype(str)
for year in years:
	df_slo_year = df_slo[slo_row_years == year]
	unique_values = df_slo_year["Population"].unique()
	print(f"Year: {year} - unique values: {unique_values}")

In [None]:
# Derive incidence and mortality rates per n people from the raw counts and Population
n = 100000 # 100k
for metric in ("New_cases", "Cumulative_cases", "New_deaths", "Cumulative_deaths"):
	df_covid[f"{metric}_per_100k"] = df_covid[metric] / df_covid["Population"] * n
df_covid.tail()

In [None]:
# Save CSV with the merged data.
# The destination folder is created first so the export does not fail when it is missing.
output_dir = os.path.dirname(path_output_merged_csv)
if output_dir:  # empty when the path has no directory component
	os.makedirs(output_dir, exist_ok=True)
df_covid.to_csv(path_output_merged_csv)

In [None]:
# Load merged data.
# keep_default_na=False prevents a literal "NA" country code in the ISO2 column from being
# parsed as a missing value; only empty fields are treated as missing.
df_covid = pd.read_csv(path_output_merged_csv, keep_default_na=False, na_values=[""])

# Set Date_reported as datetime and index
df_covid["Date_reported"] = pd.to_datetime(df_covid["Date_reported"])
df_covid = df_covid.set_index("Date_reported")

In [None]:
# Preview the first rows of the reloaded merged data
df_covid.head()

In [None]:
# Keep only the rows reported in 2020 or later
df_plot = df_covid.loc[df_covid.index.year >= 2020]

# Example countries: Slovenia, Croatia, United States, Japan, and China
countries = ["SVN", "HRV", "USA", "JPN", "CHN"]

# One line per country showing New_cases_per_100k over the reporting dates
fig = go.Figure()
for country in countries:
	country_rows = df_plot[df_plot["ISO3"] == country]
	country_trace = go.Scatter(
		x=country_rows.index,
		y=country_rows["New_cases_per_100k"],
		name=country
	)
	fig.add_trace(country_trace)

fig.update_layout(
	title="COVID-19 | New Cases per 100.000 people - Example",
	xaxis_title="Date",
	yaxis_title="New cases per 100.000 people"
)

fig.show()

# Export as PNG: 1800x600 (3:1 aspect ratio) rendered at 3x scale
fig.write_image("data/analysis/covid/plot_covid_new_cases_per_100k_example.png", scale=3, width=1800, height=600)