# Jenn Allen's Visualizations for WHO Mortality Data
These visualizations are based on World Health Organization (WHO) mortality data for a set of [12 countries](../source_data/filtered_countries.csv).

### Do Imports, Load Data, and Create DataFrames

In [5]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import seaborn as sns

mortality_data_path = Path('.../Resources/source_data/current_who_mortality_2017_2021.csv')
population_data_path = Path('.../Resources/source_data/current_who_population.csv')


# Load the WHO mortality data
df_mort = pd.read_csv(mortality_data_path)

# Load the WHO population data
df_pop = pd.read_csv(population_data_path)

FileNotFoundError: [Errno 2] No such file or directory: '.../Resources/source_data/current_who_mortality_2017_2021.csv'

In [None]:
# Create a function to display the total deaths in a year, with country optional

def total_deaths(df, year=None, country=""):
    # Step 1: Filter the DataFrame for the specified country if provided
    if country:
        filtered_df = df[df['Country Name'] == country]
    else:
        filtered_df = df

    # Step 2: Filter the DataFrame for the specified year if provided
    if year:
        filtered_df = filtered_df[filtered_df['Year'] == year]

    # Step 3: Filter for ICD Code 'AAA'
    filtered_df = filtered_df[filtered_df['ICD Code'] == 'AAA']

    # Step 4: Sum the number of deaths
    total_deaths = filtered_df['Deaths: All Ages'].sum()

    # Step 5: Return the result
    return total_deaths

### Function to Display the Total Deaths (Year and Country are Optional)

In [None]:
# Sum of all deaths for 2017 - 2021
print(f"Total deaths from 2017 to 2021: {total_deaths(df_mort):,}")

# Sum of deaths for 2017
print(f"Total deaths in 2017: {total_deaths(df_mort, 2017):,}")

# Sum of deaths for 2018
print(f"Total deaths in 2018: {total_deaths(df_mort, 2018):,}")

# Sum of deaths for 2019
print(f"Total deaths in 2019: {total_deaths(df_mort, 2019):,}")

# Sum of deaths for 2020
print(f"Total deaths in 2020: {total_deaths(df_mort, 2020):,}")

# Sum of deaths for 2021
print(f"Total deaths in 2021: {total_deaths(df_mort, 2021):,}")

TypeError: 'PosixPath' object is not subscriptable

### Function to Display the Top 10 Causes of Death in a Year (Country is Optional)

In [None]:
# Create a function to display the top 10 causes of death in a year

def top_causes_death_yearly(df, year, country=""):
    # Step 1: Filter the DataFrame for the specified year and country
    if country:
        filtered_df = df[(df['Year'] == year) & (df['Country Name'] == country)]
    else:
        filtered_df = df[df['Year'] == year]

    # Step 2: Group by ICD codes and sum the number of deaths
    icd_deaths = filtered_df.groupby('ICD Code')['Deaths: All Ages'].sum().reset_index()
    icd_deaths.columns = ['ICD Code', 'Deaths: All Ages']

    # Step 3: Exclude the 'AAA' ICD code since it applies to total deaths from all causes
    icd_deaths = icd_deaths[icd_deaths['ICD Code'] != 'AAA']

    # Step 4: Select the top 10 ICD codes based on the total number of deaths
    top_10_icd_codes = icd_deaths.nlargest(10, 'Deaths: All Ages')

    # Step 5: Merge with the ICD Category column
    top_10_with_category = top_10_icd_codes.merge(filtered_df[['ICD Code', 'ICD Category']].drop_duplicates(), on='ICD Code', how='left')

    # Step 6: Return the result
    return top_10_with_category

### Top Causes of Death in 2020 and 2021
This shows deaths for each year in our dataset (2017 - 2021). Reminder: our dataset is filtered to a set of [12 countries](../source_data/filtered_countries.csv).

In [None]:
# Top causes of death for 2017
top_causes_2017 = top_causes_death_yearly(df_mort, 2017)
print(f"The top causes of death for 2017:")
display(top_causes_2017)
print("***" * 50)

# Top causes of death for 2018
top_causes_2018 = top_causes_death_yearly(df_mort, 2018)
print(f"The top causes of death for 2018:")
display(top_causes_2018)
print("***" * 50)

# Top causes of death for 2019
top_causes_2019 = top_causes_death_yearly(df_mort, 2019)
print(f"The top causes of death for 2019:")
display(top_causes_2019)
print("***" * 50)

# Top causes of death for 2020
top_causes_2020 = top_causes_death_yearly(df_mort, 2020)
print(f"The top causes of death for 2020:")
display(top_causes_2020)
print("***" * 50)

# Top causes of death for 2021
top_causes_2021 = top_causes_death_yearly(df_mort, 2021)
print(f"The top causes of death for 2021:")
display(top_causes_2021)

#### Calculate and plot total deaths by Sex, year-over-year

In [None]:

# Group by 'Year' and 'Sex' and sum the deaths
df_filter_icd_aaa = df_mort[df_mort['ICD Code'] != 'AAA']
df_grouped = df_filter_icd_aaa.groupby(['Year', 'Sex'])
sum_yearly_deaths_by_sex = df_grouped[['Deaths: All Ages']].sum().reset_index()

# Sort the data by the total number of deaths
sum_yearly_deaths_by_sex = sum_yearly_deaths_by_sex.sort_values(by='Deaths: All Ages')

# Plot the data
plt.figure(figsize=(10, 6))

# Define the width of each bar
bar_width = 0.35

# Get the unique years and sexes
years = sum_yearly_deaths_by_sex['Year'].unique()
sexes = sum_yearly_deaths_by_sex['Sex'].unique()

# Create an array of positions for the bars
r = np.arange(len(years))

# Plot each sex's data
for i, sex in enumerate(sexes):
    subset = sum_yearly_deaths_by_sex[sum_yearly_deaths_by_sex['Sex'] == sex]
    bars = plt.bar(r + i * bar_width, subset['Deaths: All Ages'], width=bar_width, label=sex)

    # Add text annotations
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, yval, f'{int(yval):,}', ha='center', va='bottom')

# Add labels and title
plt.xlabel('Year')
plt.ylabel('Deaths: All Ages')
plt.title('Total Deaths by Year and Sex')
plt.xticks(r + bar_width / 2, years)
plt.legend(title='Sex')
plt.show()

### Get and Plot Total Deaths for All Countries with Population Data

In [None]:
pd.options.display.float_format = '{:,.0f}'.format

# Group the Population dataframe by 'Pop: All Ages' column
df_pop_filtered = df_pop[df_pop['Year'].between(2017, 2021)]
df_pop_filtered = df_pop_filtered.groupby(['Year', 'Country Name'])['Pop: All Ages'].sum().reset_index()

# Rename the column to 'Total Population'
df_pop_filtered = df_pop_filtered.rename(columns={'Pop: All Ages': 'Total Population'})

# Display the first few rows
display(df_pop_filtered)

In [None]:
# Create a DataFrame of deaths per year

# Filter to the 'AAA' ICD Code since that applies to the total deaths for all ICD Codes
df_filtered_grouped_deaths = df_mort[df_mort['ICD Code'] == 'AAA']

# Group by Year + Country and Deaths
df_grouped_deaths = df_filtered_grouped_deaths.groupby(['Year', 'Country Name'])['Deaths: All Ages'].sum().reset_index()

# Rename columns
df_grouped_deaths = df_grouped_deaths.rename(columns={'Deaths: All Ages':'All Causes of Death'})

# Convert the 'All Deaths' column to integer
df_grouped_deaths['All Causes of Death'] = df_grouped_deaths['All Causes of Death'].fillna(0).astype(int)

# Apply comma formatting
# df_grouped_deaths['All Causes of Death'] = df_grouped_deaths['All Causes of Death'].map(lambda x: f"{x:,}")

# Display first few rows
display(df_grouped_deaths.head())

# Create Scatterplots

In [None]:
# Group by Year + Country and Deaths
df_filtered_grouped_deaths = df_mort[df_mort['ICD Code'] != 'AAA']
df_grouped_deaths = df_filtered_grouped_deaths.groupby(['Year', 'Country Name'])['Deaths: All Ages'].sum().reset_index()

# Rename columns
df_grouped_deaths = df_grouped_deaths.rename(columns={'Deaths: All Ages':'All Causes of Death'})

# Convert the 'All Deaths' column to integer
df_grouped_deaths['All Causes of Death'] = df_grouped_deaths['All Causes of Death'].fillna(0).astype(int)

# Merge the two DataFrames on 'Year' and 'Country Name'
df_merged = pd.merge(df_pop_filtered, df_grouped_deaths, on=['Year', 'Country Name'])

# Calculate the percentage of the population that died in each year
df_merged['Percentage'] = (df_merged['All Causes of Death'] / df_merged['Total Population']) * 100

# Format the percentage column to show two decimal places
df_merged['Percentage'] = df_merged['Percentage'].map(lambda x: f"{x:.2f}")

# Display the merged DataFrame
display(df_merged.head())

In [None]:
# Create a DataFrame to exclude 'AAA' so that we don't double-count. AAA represents deaths from all ICD codes.
df_mort_filtered_no_aaa = df_mort[df_mort['ICD Code'] != 'AAA']
top_5_by_country_year = df_mort_filtered_no_aaa.groupby(['Year', 'Country Name', 'ICD Category']).apply(
    lambda x: x.nlargest(5, 'Deaths: All Ages')
).reset_index(drop=True)

# Create a DataFrame to only include 'AAA' to a summary rollup
df_mort_aaa = df_mort[df_mort['ICD Code'] == 'AAA']

# Show the first few rows for both DataFrames
display(df_mort_filtered_no_aaa.head(2))
display(df_mort_aaa.head(2))

In [None]:
# Create a pivot table to summarize total deaths by ICD code for each country and year
pivot_table = pd.pivot_table(
    df_mort_filtered_no_aaa, 
    values='Deaths: All Ages', 
    index=['Country Name', 'Year'], 
    columns=['ICD Code', 'ICD Category'], 
    aggfunc='sum'
)

# Reset the index to make 'Country Name' and 'Year' columns
pivot_table_reset = pivot_table.reset_index()

# Melt the pivot table to long format
melted = pivot_table_reset.melt(id_vars=['Country Name', 'Year'], var_name='ICD Code', value_name='Deaths')

# Rank the ICD codes within each country and year based on the number of deaths
melted['Rank'] = melted.groupby(['Country Name', 'Year'])['Deaths'].rank(method='first', ascending=False)

# Filter to include only the top 5 ICD codes for each country and year
top_5 = melted[melted['Rank'] <= 5]

# Sort by country, year, and rank
top_5_sorted = top_5.sort_values(by=['Country Name', 'Year', 'Rank'])

# Display the sorted DataFrame
display(top_5_sorted)

In [None]:
df_pop_filtered.columns

In [None]:
# Assuming df_pop_filtered and top_5_sorted are already defined and filtered

# Merge the population data with the deaths data
df_merged = pd.merge(df_pop_filtered, top_5_sorted, on=['Year', 'Country Name'])

# Filter out Brazil
df_merged = df_merged[df_merged['Country Name'] != 'Brazil']

# Convert 'Total Population' to millions
df_merged['Total Population'] = df_merged['Total Population'] / 1_000_000

# Create the bubble plot
plt.figure(figsize=(14, 8))
bubble_plot = sns.scatterplot(
    data=df_merged, 
    x='Total Population', 
    y='Deaths', 
    hue='ICD Code', 
    style='Country Name', 
    size='Total Population', 
    sizes=(20, 200), 
    palette='viridis'
)

# Set the x-axis formatter to ScalarFormatter and disable scientific notation
bubble_plot.xaxis.set_major_formatter(ScalarFormatter())
bubble_plot.ticklabel_format(style='plain', axis='x')

# Customize the plot
bubble_plot.set_title('Number of Deaths vs Population Size for Top 5 ICD Codes')
bubble_plot.set_xlabel('Population Size (Millions)')
bubble_plot.set_ylabel('Number of Deaths')
plt.legend(bbox_to_anchor=(0.5, -0.1), loc='upper center', ncol=3)

# Show the plot
plt.show()

In [None]:

# Merge the population data with the deaths data
df_merged = pd.merge(df_pop_filtered, top_5_sorted, on=['Year', 'Country Name'])

# Sum up deaths across all ICD codes
total_deaths = df_merged.groupby(['Year', 'Country Name', 'Total Population'])['Deaths'].sum().reset_index()
total_deaths['ICD Code'] = 'All Deaths'

# Filter the DataFrame to get COVID-19 specific codes
covid_codes = ['U071', 'U072']  # Example COVID-19 ICD codes
covid_deaths = df_merged[df_merged['ICD Code'].isin(covid_codes)].groupby(['Year', 'Country Name', 'Total Population'])['Deaths'].sum().reset_index()
covid_deaths['ICD Code'] = 'COVID-19'

# Combine the total deaths and COVID-19 deaths
comparison_df = pd.concat([total_deaths, covid_deaths])

# Ensure 'Total Population' and 'Deaths' are numeric
comparison_df['Total Population'] = pd.to_numeric(comparison_df['Total Population'], errors='coerce')
comparison_df['Deaths'] = pd.to_numeric(comparison_df['Deaths'], errors='coerce')

# Drop rows with NaN values in 'Total Population' or 'Deaths'
comparison_df.dropna(subset=['Total Population', 'Deaths'], inplace=True)

# Filter out Brazil
comparison_df = comparison_df[comparison_df['Country Name'] != 'Brazil']

# Convert 'Total Population' to millions
comparison_df['Total Population'] = comparison_df['Total Population'] / 1_000_000

# Create the bubble plot
plt.figure(figsize=(14, 8))
bubble_plot = sns.scatterplot(
    data=comparison_df, 
    x='Total Population', 
    y='Deaths', 
    hue='ICD Code', 
    style='Country Name', 
    size='Total Population', 
    sizes=(40, 400), 
    palette='viridis'
)

# Set the x-axis formatter to ScalarFormatter and disable scientific notation
bubble_plot.xaxis.set_major_formatter(ScalarFormatter())
bubble_plot.ticklabel_format(style='plain', axis='x')

# Add labels to each cluster
# for line in range(0, comparison_df.shape[0]):
#    plt.annotate(
#        comparison_df['Country Name'].iloc[line],
#        (comparison_df['Total Population'].iloc[line], comparison_df['Deaths'].iloc[line]),
#        textcoords="offset points",
#        xytext=(5,5),
#        ha='left'
#    )

# Customize the plot
bubble_plot.set_title('Number of Deaths vs Population Size for COVID-19 and All Deaths')
bubble_plot.set_xlabel('Population Size (Millions)')
bubble_plot.set_ylabel('Number of Deaths')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the plot
plt.show()

In [None]:

# Filter the DataFrame to include only the years 2020-2021
df_filtered = df_merged[df_merged['Year'].between(2020, 2021)]

# Check if the required columns exist in the filtered DataFrame
required_columns = ['Year', 'Country Name', 'Total Population', 'All Causes of Death', 'Percentage']
for col in required_columns:
    if col not in df_filtered.columns:
        raise ValueError(f"Column '{col}' not found in DataFrame")

# Print the filtered DataFrame to check the raw data
print("Filtered DataFrame:")
print(df_filtered.head(30))

# Group by 'Country Name' and aggregate the relevant columns
df_aggregated = df_filtered.groupby('Country Name').agg({
    'Total Population': lambda x: x.astype(str).str.replace(',', '').astype(float).sum(),
    'All Causes of Death': lambda x: x.astype(str).str.replace(',', '').astype(int).sum(),
    'Percentage': lambda x: x.astype(str).str.replace('%', '').astype(float).mean()
}).reset_index()

# Print the aggregated DataFrame to check the processed data
print("Aggregated DataFrame:")
print(df_aggregated.head())

# Extract the relevant columns from the aggregated DataFrame
countries = df_aggregated['Country Name']
total_population = df_aggregated['Total Population']
all_causes_of_death = df_aggregated['All Causes of Death']
percentage = df_aggregated['Percentage']

# Scale the percentage for bubble size and multiply by 2
bubble_size = percentage * 100 * 2  # Adjust the scaling factor as needed

# Create the scatterplot
plt.figure(figsize=(10, 6))
scatter = plt.scatter(total_population, all_causes_of_death, s=bubble_size, c=percentage, cmap='magma', alpha=0.6, edgecolors='w', linewidth=0.5)

# Add color bar
cbar = plt.colorbar(scatter)
cbar.set_label('Percentage of Population Died')

# Add titles and labels
plt.title('Scatterplot of Total Population vs. All Causes of Death (2020-2021)')
plt.xlabel('Total Population')
plt.ylabel('All Causes of Death')

# Annotate each point with the country name at the bottom right of the bubble
for i, country in enumerate(countries):
    plt.annotate(country, 
                 (total_population[i], all_causes_of_death[i]), 
                 xytext=(5, -5),  # Offset position
                 textcoords='offset points', 
                 fontsize=7, 
                 alpha=0.8,
                 ha='left',  # Horizontal alignment
                 va='top')   # Vertical alignment

# Show the plot
plt.show()