In [None]:
import os
import pandas as pd
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

In [None]:
# Directory that holds the merged WHO table and receives every exported figure.
path_output_root = "data/analysis/covid"
# Merged WHO COVID-19 table. The file name still says "daily", but the data is weekly now.
path_who_covid_csv = "data/analysis/covid/who_covid_daily_merged.csv"

In [None]:
# Load the merged WHO table and index it by reporting date.
df_covid = pd.read_csv(path_who_covid_csv)
df_covid = df_covid.set_index('Date_reported')
df_covid.index = pd.to_datetime(df_covid.index)
# Coerce every column from position 4 onwards to numeric; unparseable entries become NaN.
# Each column is reassigned by label instead of through `df.iloc[:, 4:] = ...`: a positional
# setitem writes into the existing columns in place (pandas 2.x), so an object-dtype column
# would keep its object dtype even though its values were converted.
for numeric_column in df_covid.columns[4:]:
	df_covid[numeric_column] = pd.to_numeric(df_covid[numeric_column], errors='coerce')
# remove dates with year 2024
df_covid = df_covid[df_covid.index.year < 2024]
# remove all rows dated 2023-12-31 as well, as that date appears incomplete
df_covid = df_covid[df_covid.index != pd.to_datetime('2023-12-31')]
df_covid.tail()

In [None]:
# Print the number of null values in each column of the cleaned table
print(df_covid.isnull().sum())

In [None]:
# Print the dtype of every column of the cleaned table
print(df_covid.dtypes)

In [None]:
# Sorted list of WHO region codes. Missing regions are dropped first because sorted()
# cannot compare NaN (a float) with the string codes and would raise TypeError.
regions = sorted(df_covid["WHO_region"].dropna().unique())
# Number of distinct countries reported under each region
num_countries = df_covid.groupby("WHO_region")["Country"].nunique()
print(regions)
print(f"Number of countries per region: {num_countries}")

In [None]:
# Get total population for each WHO_region for a specific year
def get_region_populations(df: pd.DataFrame, year: int) -> dict:
	'''
	Get the total population for each WHO region for a specific year.

	Every (region, country) pair contributes one figure: the first non-null
	"Population" value among its rows dated in `year`. A country therefore
	no longer has to have a row on the very first date of the year.

	Parameters:
		df: frame with a DatetimeIndex and the columns "WHO_region",
			"Country" and "Population".
		year: calendar year to take the population figures from.

	Returns:
		dict mapping each WHO region that has rows in `year` to the sum of
		its countries' populations. Rows with a missing region are ignored,
		and a country whose population is missing for the whole year adds
		nothing to its region.

	Raises:
		IndexError: if `df` has no rows dated in `year`.
	'''
	rows_in_year = df[df.index.year == year] # type: ignore
	if rows_in_year.empty:
		raise IndexError(f"no rows dated in year {year}")
	# One population figure per country: GroupBy.first() skips nulls by default.
	per_country = rows_in_year.groupby(["WHO_region", "Country"], sort=False)["Population"].first()
	# Add the country figures up per region.
	per_region = per_country.groupby(level="WHO_region", sort=False).sum()
	return per_region.to_dict()

# Population of every WHO region, taken from the 2023 rows
region_populations = get_region_populations(df_covid, 2023)
# Sum of the populations of all regions
region_populations_total = sum(region_populations.values())
print(region_populations)
print(f"Total population: {region_populations_total}")

In [None]:
# Sum all data for each region to get total numbers - each region is one row and is the index
def get_df_total_regions(df: pd.DataFrame, populations=None) -> pd.DataFrame:
	'''
	Sum all data for each region to get total numbers - each region is one row and is the index.

	The cumulative totals are the sums of the "New_cases" and "New_deaths"
	columns over all rows of a region.

	Parameters:
		df: frame with the columns "WHO_region", "New_cases" and "New_deaths".
		populations: optional dict mapping region -> population. When omitted,
			the notebook-level `region_populations` dict is used.

	Returns:
		DataFrame indexed by WHO region with the columns "Population",
		"Cumulative cases", "Cumulative deaths", "Cumulative cases per 100k",
		"Cumulative deaths per 100k" and "Survival rate".

	Raises:
		KeyError: if a region in `df` has no entry in `populations`.
	'''
	if populations is None:
		# Fall back to the dict built by get_region_populations() earlier in the notebook.
		populations = region_populations
	# Only the two columns that are actually needed are aggregated.
	totals = df.groupby("WHO_region")[["New_cases", "New_deaths"]].sum()
	totals = totals.rename(columns={"New_cases": "Cumulative_cases", "New_deaths": "Cumulative_deaths"})
	# Add population
	totals["Population"] = [populations[region] for region in totals.index]
	# Add columns for cases and deaths per 100k
	totals["Cumulative_cases_per_100k"] = totals["Cumulative_cases"] / totals["Population"] * 100000
	totals["Cumulative_deaths_per_100k"] = totals["Cumulative_deaths"] / totals["Population"] * 100000
	# NOTE(review): despite its name this is cases divided by deaths (cases per death),
	# not a fraction of survivors; it is inf for a region with zero deaths.
	totals["Survival_rate"] = totals["Cumulative_cases"] / totals["Cumulative_deaths"]
	# Reorder columns
	totals = totals[['Population', 'Cumulative_cases', 'Cumulative_deaths', 'Cumulative_cases_per_100k', 'Cumulative_deaths_per_100k', 'Survival_rate']]
	# Rename columns (replace "_" with " ")
	totals.columns = totals.columns.str.replace("_", " ")
	return totals
	
# One row of totals per WHO region, displayed as the cell output
df_total_regions = get_df_total_regions(df_covid)
df_total_regions

In [None]:
# Treat every value as a circle area and convert it to the matching radius:
# radius = sqrt(value / pi). np.pi replaces the truncated constant 3.14159.
df_total_regions_scaled = np.sqrt(df_total_regions / np.pi)
df_total_regions_scaled

In [None]:
# Divide every column of the scaled table by that column's maximum
column_maxima = df_total_regions_scaled.max()
df_total_regions_normalized = df_total_regions_scaled / column_maxima
df_total_regions_normalized

In [None]:
# Bubble matrix of the per-region totals.
#   x axis      : WHO region (one DataFrame row per region)
#   y axis      : variable (one DataFrame column per variable)
#   marker size : sqrt-scaled value normalised to the column maximum, times 90
#   text        : the unscaled value from df_total_regions
# NOTE(review): with mode="markers" the text is presumably only visible on hover,
# not drawn on top of the dot - confirm if on-dot labels are wanted.
fig = go.Figure()

# One trace per variable; the columns are walked in reverse order.
for i, col in enumerate(reversed(df_total_regions_normalized.columns)):
	fig.add_trace(
		go.Scatter(
			x=df_total_regions_normalized.index,
			y=[col] * len(df_total_regions_normalized.index),
			mode="markers",
			marker=dict(
				size=df_total_regions_normalized[col] * 90,
				# NOTE(review): a scalar number is passed as colour together with a
				# colorscale - confirm this yields the intended colour per variable.
				color=i,
				colorscale="Viridis",
				opacity=1.0,
				line=dict(width=0),
			),
			text=df_total_regions[col],
			name=col,
		)
	)

# Transposed copy (variables as rows, regions as columns). The loop that used to
# walk over it did nothing and was removed; the frame itself is kept.
df_total_regions_normalized_t = df_total_regions_normalized.transpose()

fig.update_layout(
	title="COVID-19 WHO region data summary",
	xaxis_title="WHO region",
	yaxis_title="Variable",
	showlegend=False,
	# white background
	plot_bgcolor='white',
	paper_bgcolor='white',
	# figure size in pixels
	width=880,
	height=720,
	# wide left margin for the variable names
	margin=dict(l=220, r=20, t=80, b=20),
)

# faint (10 % opaque black) grid lines; region labels on top
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,0,0.1)', side="top")
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,0,0.1)')

fig.show()

# Save the plot as png at 2x scale
fig.write_image(os.path.join(path_output_root, "covid_who_region_data_summary.png"), scale=2)

In [None]:
# Annotated heatmap of the per-region totals.
# Cell colour comes from the normalised table, the cell label from the raw totals
# truncated to integers. Rows are the variables in reversed column order,
# columns are the WHO regions.
x = list(df_total_regions_normalized.index)
y = list(df_total_regions_normalized.columns[::-1])
z = df_total_regions_normalized.transpose().iloc[::-1].to_numpy()
annotated_text = df_total_regions.transpose().iloc[::-1].to_numpy().astype(int)

fig = ff.create_annotated_heatmap(
	z=z,
	x=x,
	y=y,
	annotation_text=annotated_text,
	colorscale="Viridis",
	showscale=True,
)

fig.update_layout(
	# wide and flat figure
	width=950,
	height=320,
	title="COVID-19 WHO region data heatmap",
	xaxis_title="WHO region",
	yaxis_title="Variables",
	# wide left margin for the variable names
	margin=dict(l=230, r=20, t=80, b=20),
)

fig.show()

# Save the plot as png at 2x scale
fig.write_image(os.path.join(path_output_root, "covid_who_region_data_heatmap.png"), scale=2)


In [None]:
def get_df_region(df: pd.DataFrame, region: str) -> pd.DataFrame:
	'''
	Filter the dataframe by WHO region and group by date (index).

	All remaining columns are summed per date, then the four per-100k
	columns are recomputed from the summed counts and the summed population.

	The metric columns of the first row are set to a tiny positive value
	(1e-8); the original comment gives the reason as "so zero values won't
	be cut off with histogram". "Population" is deliberately left untouched,
	so the first row keeps its real population.

	Parameters:
		df: frame whose index is named "Date_reported" and which has the
			columns "WHO_region", "ISO3", "ISO2", "Country", "New_cases",
			"Cumulative_cases", "New_deaths", "Cumulative_deaths" and
			"Population".
		region: value of "WHO_region" to keep.

	Returns:
		DataFrame with one row per date, "WHO_region" as the first column
		followed by the summed and derived columns. A region without rows
		gives an empty frame.
	'''
	epsilon = 0.00000001
	df_region = df[df["WHO_region"] == region]
	# Drop the identifier columns so that only summable columns remain
	df_region = df_region.drop(columns=["WHO_region", "ISO3", "ISO2", "Country"])
	df_region = df_region.groupby("Date_reported").sum()
	# Recompute the rates from the regional sums (a sum of per-country rates would be meaningless)
	for measure in ("New_cases", "Cumulative_cases", "New_deaths", "Cumulative_deaths"):
		df_region[f"{measure}_per_100k"] = df_region[measure] / df_region["Population"] * 100000
	# Replace the first row's metrics by epsilon, leaving "Population" as it is
	metric_columns = [col for col in df_region.columns if col != "Population"]
	if len(df_region) > 0:
		# cast first so that writing a float into integer columns is well defined
		df_region[metric_columns] = df_region[metric_columns].astype(float)
		df_region.loc[df_region.index[0], metric_columns] = epsilon
	# Add the region back as the first column
	df_region["WHO_region"] = region
	df_region = df_region[["WHO_region"] + [col for col in df_region.columns if col != "WHO_region"]]
	return df_region

# One per-date frame for every WHO region, keyed by region code
dfs_regions = { region: get_df_region(df_covid, region) for region in regions }

df_euro = dfs_regions["EURO"]

# Print the sum of new cases next to the largest cumulative count for EURO
df_euro_new_casees_total = df_euro["New_cases"].sum()
df_euro_cumulative_cases_total = df_euro["Cumulative_cases"].max()
print(f"Total new cases: {df_euro_new_casees_total}")
print(f"Total cumulative cases: {df_euro_cumulative_cases_total}")

df_euro.tail()

In [None]:
# Stack the per-region frames into one long frame (each row carries its WHO_region)
df_joint = pd.concat(dfs_regions.values())
df_joint.tail()

In [None]:
# Line plot of new cases per 100k: one trace per WHO region
fig = go.Figure()
for region, df_region in dfs_regions.items():
	region_trace = go.Scatter(
		x=df_region.index,
		y=df_region["New_cases_per_100k"],
		name=region,
	)
	fig.add_trace(region_trace)

# All layout settings in a single call
fig.update_layout(
	barmode="stack",
	bargap=0,
	bargroupgap=0,
	title=f"Weekly new COVID-19 cases for {len(regions)} WHO regions",
	xaxis_title="Date",
	yaxis_title="New cases per 100k",
	width=1200,
	height=400,
	margin=dict(l=80, r=20, t=60, b=20),
)
fig.show()

# Save the plot as png at 2x scale
fig.write_image(os.path.join(path_output_root, "covid_who_region_new_cases_per_100k_scatter.png"), scale=2)

In [None]:
# Stacked histogram of new cases per 100k, coloured by WHO region.
# The number of bins is the row count of df_joint divided by the number of regions.
y = "New_cases_per_100k"
fig = px.histogram(
	df_joint,
	x=df_joint.index,
	y=y,
	color="WHO_region",
	nbins=int(len(df_joint.index)/len(regions)),
)

# All layout settings in a single call
fig.update_layout(
	barmode="stack",
	bargap=0,
	bargroupgap=0,
	title=f"Weekly new COVID-19 cases for {len(regions)} WHO regions (same scale, stacked)",
	xaxis_title="Date",
	yaxis_title=y.replace("_", " "),
	width=1200,
	height=400,
	margin=dict(l=80, r=20, t=60, b=20),
)

# dash-dotted vertical line at the start of each year 2020-2023
for i in range(2020, 2024):
	fig.add_vline(x=f"{i}-01-01", line_dash="dashdot", line_color="rgba(0,0,0,0.3)", line_width=1)

fig.show()

# Save the plot as png at 2x scale
fig.write_image(os.path.join(path_output_root, "covid_who_region_new_cases_stacked.png"), scale=2)

In [None]:
# One histogram row per WHO region, all rows sharing the x axis
fig = make_subplots(rows=len(regions), cols=1, shared_xaxes=True, vertical_spacing=0.015)
y = "New_cases_per_100k"
# Build the per-region histogram traces once with plotly express ...
top = px.histogram(df_joint, x=df_joint.index, y=y, color="WHO_region", nbins=int(len(df_joint.index)/len(regions)))
# ... then move the i-th trace into the i-th row and label that row's y axis with the region
for row_number, region in enumerate(regions, start=1):
	fig.add_trace(top.data[row_number - 1], row=row_number, col=1)
	fig.update_yaxes(title_text=region, row=row_number, col=1)

# x axis title on the bottom row only
fig.update_xaxes(title_text="Date", row=len(regions), col=1)

# All layout settings in a single call
fig.update_layout(
	barmode="stack",
	bargap=0,
	bargroupgap=0,
	title=f"Weekly new COVID-19 cases per 100,000 people for {len(regions)} WHO regions",
	showlegend=False,
	height=600,
	width=1200,
	margin=dict(l=10, r=10, t=50, b=10),
)

# dash-dotted vertical line at the start of each year 2020-2023
for i in range(2020, 2024):
	fig.add_vline(x=f"{i}-01-01", line_dash="dashdot", line_color="rgba(0,0,0,0.3)", line_width=1)

fig.show()

# Save the plot as png at 2x scale
fig.write_image(os.path.join(path_output_root, "covid_who_regions_new_cases_per_100k.png"), scale=2)

In [None]:
# Plot new cases per 100k for every country of one WHO region, one line per ISO3 code.
# The title says "Weekly" (it used to say "Daily") to match the weekly data.
who_region = "EURO"
df_region = df_covid[df_covid["WHO_region"] == who_region]
# Distinct country codes of the region, computed once and reused for the title
country_codes = df_region["ISO3"].unique()
fig = go.Figure()
for country in country_codes:
	df_country = df_region[df_region["ISO3"] == country]
	fig.add_trace(go.Scatter(x=df_country.index, y=df_country["New_cases_per_100k"], name=country))
fig.update_layout(title=f"Weekly new COVID-19 cases for {len(country_codes)} countries in '{who_region}' WHO region", xaxis_title="Date", yaxis_title="New cases per 100k")
fig.show()

In [None]:
# Print how many distinct population values the EURO frame holds, then the first one as an integer
who_region = "EURO"
df_region = dfs_regions[who_region]
unique_populations = pd.unique(df_region["Population"])
print(unique_populations.size)
print(int(unique_populations[0]))