# This notebook builds a consolidated, curated five-year World Happiness dataset covering 2020 through 2024.

In [2]:
import pandas as pd
import requests
import os
import shutil
import kagglehub

# ================================ #
#           CONFIGURATION          #
# ================================ #

# Folder that receives the renamed Kaggle downloads, and the report years
# the pipeline covers.
DOWNLOAD_PATH = "../data/raw"
DATA_YEARS = range(2020, 2025)  # 2020..2024 inclusive; the stop value is excluded

# One raw and one staging CSV path per report year, in DATA_YEARS order.
RAW_FILES = [
    os.path.join(DOWNLOAD_PATH, f"raw_{report_year}.csv")
    for report_year in DATA_YEARS
]
STAGING_FILES = [
    f"../data/staging/stg_{report_year}.csv"
    for report_year in DATA_YEARS
]

# Files written by the integration and reporting steps.
INTEGRATED_FILE = "../data/processed/int_happiness_dataset.csv"
REPORTING_FILE = "../data/processed/reporting_happiness_dataset.csv"

# Not referenced by any of the visible notebook cells.
COUNTRY_TO_CONTINENT_FILE = "country_to_continent.csv"

# ================================ #
#           SOURCING DATA          #
# ================================ #

cache_base_path = os.path.expanduser("~/.cache/kagglehub/datasets")

# Kaggle dataset handle for each report year. Keeping year and handle in a
# single mapping means the cache cleanup, the download list and the year
# labels cannot drift apart.
DATASET_HANDLES = {
    2024: "ajaypalsinghlo/world-happiness-report-2024",
    2023: "ajaypalsinghlo/world-happiness-report-2023",
    2022: "ajaypalsinghlo/world-happiness-report-2022",
    2021: "ajaypalsinghlo/world-happiness-report-2021",
    2020: "londeen/world-happiness-report-2020",
}

# Names kept for any later cell that refers to them.
years = list(DATASET_HANDLES)
datasets = list(DATASET_HANDLES.values())

# Remove cached copies before downloading. The move step below takes files
# out of the folder returned by kagglehub, so a folder left over from an
# earlier run is presumably incomplete (NOTE(review): confirm that the
# returned path lives under cache_base_path).
for dataset_folder in datasets:
    dataset_path = os.path.join(cache_base_path, dataset_folder)
    if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)  # Delete the entire folder and its contents
        print(f"✅ Deleted cache for {dataset_folder}")
    else:
        print(f"❌ Cache folder {dataset_folder} not found.")

# Download every dataset; each entry is the local folder kagglehub reports.
paths = [kagglehub.dataset_download(handle) for handle in datasets]

# Create the destination once instead of re-checking it for every dataset.
os.makedirs(DOWNLOAD_PATH, exist_ok=True)

file_year_pairs = list(zip(years, paths))
for year, path in file_year_pairs:
    if not os.path.isdir(path):
        print(f"Dataset not found at {path}")
        continue

    # Only CSV files are candidates, in sorted order so the choice does not
    # depend on the arbitrary ordering of os.listdir.
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )
    if not csv_files:
        print(f"No CSV file found for {year} in {path}")
        continue

    # Moving every file to the same raw_{year}.csv target would leave
    # whichever file happened to be listed last. Pick exactly one: prefer a
    # file whose name mentions the report year, otherwise the first by name.
    year_named = [name for name in csv_files if str(year) in name]
    chosen = (year_named or csv_files)[0]
    if len(csv_files) > 1:
        print(f"⚠️ {year}: found {csv_files}; using {chosen}")

    new_name = f"raw_{year}.csv"
    shutil.move(os.path.join(path, chosen), os.path.join(DOWNLOAD_PATH, new_name))
    print(f"Files from {year} have been renamed and moved.")

✅ Deleted cache for ajaypalsinghlo/world-happiness-report-2024
✅ Deleted cache for ajaypalsinghlo/world-happiness-report-2023
✅ Deleted cache for ajaypalsinghlo/world-happiness-report-2022
✅ Deleted cache for ajaypalsinghlo/world-happiness-report-2021
✅ Deleted cache for londeen/world-happiness-report-2020
Downloading from https://www.kaggle.com/api/v1/datasets/download/ajaypalsinghlo/world-happiness-report-2024?dataset_version_number=1...


100%|██████████████████████████████████████| 4.69k/4.69k [00:00<00:00, 2.08MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ajaypalsinghlo/world-happiness-report-2023?dataset_version_number=1...


100%|██████████████████████████████████████| 7.15k/7.15k [00:00<00:00, 3.15MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ajaypalsinghlo/world-happiness-report-2022?dataset_version_number=1...


100%|██████████████████████████████████████| 5.20k/5.20k [00:00<00:00, 2.50MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ajaypalsinghlo/world-happiness-report-2021?dataset_version_number=2...


100%|██████████████████████████████████████| 55.2k/55.2k [00:00<00:00, 3.95MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/londeen/world-happiness-report-2020?dataset_version_number=1...


100%|██████████████████████████████████████| 17.1k/17.1k [00:00<00:00, 6.24MB/s]

Extracting files...
Files from 2024 have been renamed and moved.
Files from 2023 have been renamed and moved.
Files from 2022 have been renamed and moved.
Files from 2021 have been renamed and moved.
Files from 2020 have been renamed and moved.





In [11]:
# ================================ #
#           STAGING DATA           #
# ================================ #

STAGING_PATH = '../data/staging'

# Source column name -> standardized column name. Different report years
# label the same indicator differently, so several source names share one
# target. Because of that, a single file that carries two aliases of the
# same target ends up with duplicate labels after renaming;
# clean_dataframe() resolves this by keeping the first occurrence.
COLUMN_MAPPING = {
    'Country name': 'Country',
    'Regional indicator': 'Region',
    'Ladder score': 'Happiness Score',
    'Life Ladder': 'Happiness Score',
    'Happiness score': 'Happiness Score',
    'Log GDP per capita': 'GDP per Capita',
    'Explained by: Log GDP per capita': 'GDP per Capita',  # 2024
    'Explained by: GDP per capita': 'GDP per Capita',  # 2022
    'Social support': 'Social Support',
    'Explained by: Social support': 'Social Support',  # 2022, 2024
    'Healthy life expectancy at birth': 'Healthy Life Expectancy',
    'Healthy life expectancy': 'Healthy Life Expectancy',
    'Explained by: Healthy life expectancy': 'Healthy Life Expectancy',
    'Freedom to make life choices': 'Freedom to Make Life Choices',
    'Explained by: Freedom to make life choices': 'Freedom to Make Life Choices',
    'Generosity': 'Generosity',
    'Explained by: Generosity': 'Generosity',
    'Perceptions of corruption': 'Perceptions of Corruption',
    'Explained by: Perceptions of corruption': 'Perceptions of Corruption',
    'Year': 'Year'
}

# Standardized columns kept in the staging output, in output order.
# 'Region' is deliberately not listed, so it never reaches the staging files.
REQUIRED_COLUMNS = [
    'Year', 'Country', 'Happiness Score', 'GDP per Capita',
    'Social Support', 'Healthy Life Expectancy', 'Freedom to Make Life Choices',
    'Generosity', 'Perceptions of Corruption'
]


def clean_dataframe(df):
    """Standardize one raw yearly frame for staging.

    Steps, in order:
      1. Rename columns according to COLUMN_MAPPING.
      2. If several source columns were renamed to the same target, keep only
         the first one so every column label is unique.
      3. Keep the REQUIRED_COLUMNS that are present, in REQUIRED_COLUMNS order.
      4. Drop rows that have a missing value in any kept column.
      5. In text columns, remove surrounding whitespace and trailing
         asterisks (e.g. " Finland* " and "Denmark *" become "Finland" and
         "Denmark").

    Args:
        df: Raw DataFrame as read from a yearly CSV. It is not modified.

    Returns:
        A new DataFrame holding only the cleaned, standardized columns.
    """
    df = df.rename(columns=COLUMN_MAPPING)

    # Duplicate labels would otherwise be carried into the staging CSV and
    # come back from read_csv as extra, suffixed columns.
    df = df.loc[:, ~df.columns.duplicated()]

    present = [col for col in REQUIRED_COLUMNS if col in df.columns]

    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame.
    df = df[present].dropna().copy()

    # Cover both object-dtype and dedicated string-dtype text columns.
    text_cols = df.select_dtypes(include=["object", "string"]).columns
    for col in text_cols:
        # The final strip removes whitespace that sat between the value and
        # its trailing asterisk ("Denmark *" -> "Denmark ", then "Denmark").
        df[col] = df[col].str.strip().str.rstrip('*').str.strip()
    return df

# Stage each yearly raw file: clean it, stamp the report year, and write it
# to the staging folder.
os.makedirs(STAGING_PATH, exist_ok=True)

for year, raw_file in zip(DATA_YEARS, RAW_FILES):
    if not os.path.exists(raw_file):
        # Report the gap instead of skipping silently, so a missing year is
        # visible before the integration step combines the staged files.
        print(f"⚠️ Skipped {year}: {raw_file} not found")
        continue

    df = clean_dataframe(pd.read_csv(raw_file))
    # The report year comes from the file name, overriding any 'Year'
    # column the raw file may have carried.
    df["Year"] = year
    df.to_csv(os.path.join(STAGING_PATH, f"stg_{year}.csv"), index=False)
    print(f"✅ Processed: stg_{year}.csv")

✅ Processed: stg_2020.csv
✅ Processed: stg_2021.csv
✅ Processed: stg_2022.csv
✅ Processed: stg_2023.csv
✅ Processed: stg_2024.csv


In [12]:
# ================================ #
#         INTEGRATION STEP         #
# ================================ #

import country_converter as coco

# Load every staging file that exists and stack them into one frame.
dataframes = [pd.read_csv(file) for file in STAGING_FILES if os.path.exists(file)]
if not dataframes:
    # pd.concat would fail on an empty list with a generic message; say what
    # is actually wrong.
    raise FileNotFoundError(
        "No staging files found; run the staging step before integration."
    )
merged_df = pd.concat(dataframes, ignore_index=True)

cc = coco.CountryConverter()

# Resolve each distinct country name once and map the result back onto the
# rows, instead of invoking the converter for every row.
continent_by_country = {
    name: cc.convert(names=name, to='continent')
    for name in merged_df['Country'].unique()
}
merged_df['Continent'] = merged_df['Country'].map(continent_by_country)

# Manual overrides applied after the automatic lookup. Only the "Continent"
# value is written to the data; the "Region" entries are kept for reference.
manual_updates = {
    "Bosnia and Herzegovina": {"Region": "Southern Europe", "Continent": "Europe"},
    "Congo (Brazzaville)": {"Region": "Sub-Saharan Africa", "Continent": "Africa"},
    "Hong Kong S.A.R. of China": {"Region": "Eastern Asia", "Continent": "Asia"},
    "Kosovo": {"Region": "Southern Europe", "Continent": "Europe"},
    "North Cyprus": {"Region": "Western Asia", "Continent": "Asia"},
    "North Macedonia": {"Region": "Southern Europe", "Continent": "Europe"}
}
for country, values in manual_updates.items():
    merged_df.loc[merged_df["Country"] == country, "Continent"] = values["Continent"]

# The integrated dataset carries no Region column. Staging files written by
# this notebook do not contain one, so tolerate its absence.
merged_df = merged_df.drop(columns=['Region'], errors='ignore')

# Save final integrated dataset
PROCESSED_PATH = '../data/processed'
os.makedirs(PROCESSED_PATH, exist_ok=True)
merged_df.to_csv(INTEGRATED_FILE, index=False)
print(f"✅ Integrated dataset saved as {INTEGRATED_FILE}")

# ================================ #
#    ANALYSIS & REPORTING STEP     #
# ================================ #

# Indicators reported per country and as a per-continent average.
metric_columns = [
    'Happiness Score',
    'GDP per Capita',
    'Social Support',
    'Healthy Life Expectancy',
    'Freedom to Make Life Choices',
    'Generosity',
    'Perceptions of Corruption',
]

# Name of the averaged counterpart of every indicator column.
average_names = {
    metric: f"Average Continent {metric}" for metric in metric_columns
}

# Mean of every indicator for each (Year, Continent) pair, with the columns
# already carrying their "Average Continent ..." names.
group_means = merged_df.groupby(['Year', 'Continent'])[metric_columns].mean()
continent_avg = group_means.reset_index().rename(columns=average_names)

# Attach the continent averages to every country row.
merged_df = merged_df.merge(continent_avg, on=['Year', 'Continent'], how='left')

# Identifier columns first, then each indicator followed directly by its
# continent average.
report_columns = ['Year', 'Country', 'Continent']
for metric in metric_columns:
    report_columns.extend([metric, average_names[metric]])
merged_df = merged_df[report_columns]

# Write the reporting dataset.
merged_df.to_csv(REPORTING_FILE, index=False)
print(f"✅ Reporting dataset saved as {REPORTING_FILE}")

✅ Integrated dataset saved as ../data/processed/int_happiness_dataset.csv
✅ Reporting dataset saved as ../data/processed/reporting_happiness_dataset.csv
