## This notebook includes pre-processing and EDA steps for the World Happiness Report and WHO SDG datasets:

# Choropleth and Stacked Bar Chart Visualizations Dataset:

The pre-processing steps outlined here are essential for creating the choropleth and stacked bar chart visualizations in the final project. The datasets utilized include the 2024 World Happiness Report, which provides happiness scores for 143 countries (predominantly sovereign nations), and a GeoJSON World dataset, which supplies the geographic attributes required for the map visualization.

The decision to merge these datasets stems from the Happiness Report’s limited coverage of 143 countries. To ensure a comprehensive representation, we aligned the data with a list of sovereign countries, merging based on standardized country names. For countries not included in the Happiness Report, we assigned a happiness score of 0. This approach allows the choropleth map to accurately reflect the available data while visually distinguishing between countries with reported happiness scores and those without.

In [32]:
# Loading in the necessary libraries
import pandas as pd
import geopandas as gpd

# Load data
# The happiness table is read from a local Excel workbook; the country
# geometries are fetched over the network from the geo-countries repository.
df_happiness = pd.read_excel("figure1.xlsx")
# NOTE(review): this URL tracks the repository's master branch, so the file's
# schema can change over time -- confirm it still exposes the "ADMIN" property
# that the name-standardization step reads.
geojson_url = "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson"
gdf_geo = gpd.read_file(geojson_url)

# Names of the sovereign countries to keep on the map. The GeoJSON features
# are filtered against this list after their names have been standardized, so
# every spelling here must match the standardized spelling exactly.
sovereign_countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina",
    "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados",
    "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana",
    "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon",
    "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo (Brazzaville)",
    "Congo (Kinshasa)", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti",
    "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea",
    "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany",
    "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras",
    "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast",
    "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Laos",
    "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
    "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania",
    "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco",
    "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua",
    "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman", "Pakistan", "Palau",
    "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal",
    "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines",
    "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles",
    "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa",
    "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland",
    "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago",
    "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
    "United States of America", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam",
    "Yemen", "Zambia", "Zimbabwe"
]

# The GeoJSON file spells some countries differently from the sovereign list.
# Keys are the spellings used in this notebook; values are the spellings that
# are looked up in the GeoJSON name column.
geojson_name_mapping = {
    "Bahamas": "The Bahamas",
    "Cabo Verde": "Cape Verde",
    "Guinea-Bissau": "Guinea Bissau",
    "Micronesia": "Federated States of Micronesia",
    "Timor-Leste": "East Timor",
    "Vatican City": "Vatican",
    "Serbia": "Republic of Serbia",
    "Congo (Kinshasa)": "Democratic Republic of the Congo",
    "Tanzania": "United Republic of Tanzania",
    "Congo (Brazzaville)": "Republic of Congo",
    "North Macedonia": "Macedonia",
    "Eswatini": "Swaziland",
}

## Start AI

# Invert the mapping (GeoJSON spelling -> notebook spelling) so that it can be
# applied directly to the GeoJSON names.
reverse_geojson_mapping = dict(
    zip(geojson_name_mapping.values(), geojson_name_mapping.keys())
)

# Translate GeoJSON spellings into the notebook's spellings; names without a
# mapping entry pass through unchanged.
standardized_admin = gdf_geo["ADMIN"].replace(to_replace=reverse_geojson_mapping)
gdf_geo["ADMIN_standardized"] = standardized_admin

# Keep only the features whose standardized name appears in the sovereign
# list. How many rows survive depends on how many list entries actually match
# a GeoJSON name. The explicit copy makes the result independent of gdf_geo.
is_sovereign = gdf_geo["ADMIN_standardized"].isin(sovereign_countries)
gdf_geo_sovereign = gdf_geo.loc[is_sovereign].copy()

# Happiness data name mappings (report spelling -> notebook spelling).
# Entries that map a name to itself have no effect on the replacement.
happiness_name_mapping = {
    "Czechia": "Czech Republic",
    "United States": "United States of America",
    "Taiwan Province of China": "Taiwan",
    "Serbia": "Serbia",
    "South Korea": "South Korea",
    "North Macedonia": "North Macedonia",
    "Hong Kong S.A.R. of China": "Hong Kong S.A.R.",
    "Congo (Brazzaville)": "Congo (Brazzaville)",
    "Turkiye": "Turkey",
    "State of Palestine": "Palestine",
    "Gambia": "Gambia",
    "Tanzania": "Tanzania",
    "Eswatini": "Eswatini",
    "Congo (Kinshasa)": "Congo (Kinshasa)",
}

# Add a standardized-name column next to the original report names; names
# without a mapping entry are carried over as-is.
report_names = df_happiness["Country name"]
df_happiness["Country_standardized"] = report_names.replace(
    to_replace=happiness_name_mapping
)

## End AI

# Left join on the standardized names so that every retained country geometry
# survives, whether or not the happiness data has a row for it.
merged_gdf = gdf_geo_sovereign.merge(
    df_happiness,
    how="left",
    left_on="ADMIN_standardized",
    right_on="Country_standardized",
)

# Countries without a happiness row get a ladder score of 0 so they can be
# told apart from countries with a reported score.
ladder_scores = merged_gdf["Ladder score"]
merged_gdf["Ladder score"] = ladder_scores.fillna(0)

# Export as GeoJSON for D3
merged_gdf.to_file("merged_data_195.geojson", driver="GeoJSON")

# Time Series Dataset for the Stacked Area Chart in D3:

For the stacked area chart in D3, I utilized the World Health Organization’s Sustainable Development Goals (SDG) dataset to track how countries evolve over time in terms of key demographic and economic indicators. This visualization builds on the previous choropleth and stacked bar charts, which explored happiness scores, by adding a temporal perspective to assess trends in well-being-related factors.

The preprocessing was challenging due to inconsistent data across 2015–2023 for many countries. My initial plan was to align with the top and bottom 10 happiest countries from earlier visualizations, but data gaps made it difficult to find three indicators consistently available for a diverse socioeconomic mix. Instead, I analyzed which countries had the most complete data for under-five mortality rate, secondary education rate, and unemployment rate. This process identified Finland, Iceland, Israel, Norway, Sweden, and Switzerland as the top six countries with robust data. Although this group does not include any of the bottom 10 countries by happiness score, it provides a representative sample of high-performing nations, ensuring the stacked area chart effectively illustrates trends across this group.

In [35]:
# Load necessary libraries
import pandas as pd
import json

# Load the WHO dataset
# The steps that follow read the columns "indicator_abbr", "setting", "date",
# "subgroup" and "estimate" from this workbook.
sdg_data = pd.read_excel("data.xlsx")

# Indicators to chart, keyed by their code in the dataset.
indicator_descriptions = {
    "SH_DYN_MORT": "Under-five mortality rate",
    "SE_ADT_EDUCTRN": "Secondary education",
    "SL_TLF_UEM": "Unemployment rate",
}
selected_indicator_codes = list(indicator_descriptions)

# Candidate countries, chosen to align with the top and bottom ten happiness
# scores discussed above.
selected_countries = [
    "Norway",
    "Denmark",
    "Sweden",
    "Finland",
    "Luxembourg",
    "Switzerland",
    "Iceland",
    "Netherlands",
    "Australia",
    "Israel",
    "Botswana",
    "Zambia",
    "Swaziland",
    "Lebanon",
    "Zimbabwe",
    "Lesotho",
    "Malawi",
    "Congo",
    "Sierra Leone",
    "Afghanistan",
]

# Keep only the rows for the chosen indicators and countries whose date falls
# within 2015-2023 (both ends included).
wanted_indicator = sdg_data["indicator_abbr"].isin(selected_indicator_codes)
wanted_country = sdg_data["setting"].isin(selected_countries)
wanted_year = sdg_data["date"].between(2015, 2023)
filtered_data = sdg_data.loc[wanted_indicator & wanted_country & wanted_year]

# Of those rows, keep only the Female and Male subgroups.
sex_subgroups = ["Female", "Male"]
filtered_data = filtered_data.loc[filtered_data["subgroup"].isin(sex_subgroups)]

# Pivot to one row per (country, year) with one column per indicator. Each
# cell is the mean of the subgroup estimates, giving a single representative
# value for the country in that year.
#
# The indicator columns are renamed by CODE rather than by position:
# pivot_table returns its columns in sorted order (SE_ADT_EDUCTRN,
# SH_DYN_MORT, SL_TLF_UEM), so assigning a list of labels positionally would
# attach the mortality label to the education column and vice versa.
indicator_labels = {
    "SH_DYN_MORT": "UnderFiveMortality",
    "SE_ADT_EDUCTRN": "SecondaryEducation",
    "SL_TLF_UEM": "UnemploymentRate",
}

pivoted_data = (
    filtered_data.pivot_table(
        index=["setting", "date"],
        columns="indicator_abbr",
        values="estimate",
        aggfunc="mean",
    )
    .rename(columns=indicator_labels)
    .rename_axis(columns=None)
    .reset_index()
    .rename(columns={"setting": "Country", "date": "Year"})
    # Fix the column order and guarantee that all three indicator columns
    # exist (filled with NaN) even if an indicator has no rows after filtering.
    .reindex(columns=["Country", "Year", *indicator_labels.values()])
)

# For every country, count the non-missing yearly values of each indicator.
counted_columns = ["UnderFiveMortality", "SecondaryEducation", "UnemploymentRate"]
data_counts = (
    pivoted_data.groupby("Country")[counted_columns].count().reset_index()
)

# Total number of available data points across the three indicators.
data_counts["TotalDataPoints"] = data_counts[counted_columns].sum(axis=1)

# Rank countries by data completeness and keep the six best-covered ones.
ranked_counts = data_counts.sort_values(by="TotalDataPoints", ascending=False)
top_countries = ranked_counts.iloc[:6]

# Restrict the pivoted table to those six countries, ordered by country and
# then chronologically within each country.
top_country_list = top_countries["Country"].tolist()
in_top_six = pivoted_data["Country"].isin(top_country_list)
final_data = pivoted_data.loc[in_top_six].sort_values(by=["Country", "Year"])

# Start AI:
# Impute missing values with ffill and bfill per country.
# Both fills run inside the per-country transform. Calling .bfill() on the
# result of groupby(...).ffill() would operate on the whole frame, so a
# country with no leading value would silently borrow the next country's
# numbers. An indicator that is missing for every year of a country stays
# missing and is written as null in the JSON export.
imputed_columns = ["UnderFiveMortality", "SecondaryEducation", "UnemploymentRate"]
final_data = final_data.copy()  # own the frame before assigning into it
final_data[imputed_columns] = final_data.groupby("Country")[
    imputed_columns
].transform(lambda series: series.ffill().bfill())
# End AI

# Save the data to JSON for D3:
# orient="records" writes one JSON object per row, with the column names as
# keys; indent=4 pretty-prints the file.
final_data.to_json("top_6_data_countries.json", orient="records", indent=4)