# This notebook creates a consolidated and curated 5-year world happiness data from year 2020 to 2024.

# Import Libraries

In [1]:
import requests
import pandas as pd
import glob
import os

# Sourcing: Download raw data and convert to csv

In [2]:
# List of (URL, new file name) pairs
files_to_download = [
    ("https://happiness-report.s3.amazonaws.com/2020/WHR20_DataForFigure2.1.xls", "srs_file_2020.xls"),
    ("https://happiness-report.s3.amazonaws.com/2021/DataForFigure2.1WHR2021C2.xls", "srs_file_2021.xls"),
    ("https://happiness-report.s3.amazonaws.com/2022/Appendix_2_Data_for_Figure_2.1.xls", "srs_file_2022.xls"),
    ("https://happiness-report.s3.amazonaws.com/2023/DataForFigure2.1WHR2023.xls", "srs_file_2023.xls"),
    ("https://happiness-report.s3.amazonaws.com/2024/DataForFigure2.1+with+sub+bars+2024.xls", "srs_file_2024.xls"),
]

# Function to download files and convert to CSV
def download_and_convert_to_csv(file_list):
    for url, excel_filename in file_list:
        # Step 1: Download the Excel file
        response = requests.get(url, stream=True)
        with open(excel_filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
        print(f"Downloaded: {excel_filename}")
        
        # Step 2: Convert to CSV
        csv_filename = excel_filename.replace(".xls", ".csv").replace(".xlsx", ".csv")
        
        # Automatically select the correct engine
        if excel_filename.endswith(".xls"):
            engine = "xlrd"  # Old Excel format
        else:
            engine = "openpyxl"  # New Excel format
        
        # Load and convert to CSV
        df = pd.read_excel(excel_filename, engine=engine)
        df.to_csv(csv_filename, index=False)
        
        print(f"Converted to CSV: {csv_filename}")

# Run the function
download_and_convert_to_csv(files_to_download)

# List of (file name, year) pairs
files_with_years = [
    ("srs_file_2020.csv", 2020),
    ("srs_file_2021.csv", 2021),
    ("srs_file_2022.csv", 2022),
    ("srs_file_2023.csv", 2023),
    ("srs_file_2024.csv", 2024),
]

# Process each file
for file_name, year in files_with_years:
    # Load the dataset
    df = pd.read_csv(file_name)
    
    # Add YEAR column
    df["Year"] = year
    
    # Remove rows where either 'Ladder score' or 'Happiness score' is empty
    if "Ladder score" in df.columns:
        df = df.dropna(subset=["Ladder score"])
    elif "Happiness score" in df.columns:
        df = df.dropna(subset=["Happiness score"])

    # Save the updated dataset
    df.to_csv(file_name, index=False)
    
    print(f"Updated {file_name} with YEAR column and removed empty Ladder score rows.")

Downloaded: srs_file_2020.xls
Converted to CSV: srs_file_2020.csv
Downloaded: srs_file_2021.xls
Converted to CSV: srs_file_2021.csv
Downloaded: srs_file_2022.xls
Converted to CSV: srs_file_2022.csv
Downloaded: srs_file_2023.xls
Converted to CSV: srs_file_2023.csv
Downloaded: srs_file_2024.xls
Converted to CSV: srs_file_2024.csv
Updated srs_file_2020.csv with YEAR column and removed empty Ladder score rows.
Updated srs_file_2021.csv with YEAR column and removed empty Ladder score rows.
Updated srs_file_2022.csv with YEAR column and removed empty Ladder score rows.
Updated srs_file_2023.csv with YEAR column and removed empty Ladder score rows.
Updated srs_file_2024.csv with YEAR column and removed empty Ladder score rows.


In [3]:
df = pd.read_csv("srs_file_2021.csv")
df["Regional indicator"].unique()

array(['Western Europe', 'North America and ANZ',
       'Middle East and North Africa', 'Latin America and Caribbean',
       'Central and Eastern Europe', 'East Asia', 'Southeast Asia',
       'Commonwealth of Independent States', 'Sub-Saharan Africa',
       'South Asia'], dtype=object)

# Staging: Select only the required columns, standardise coloumn names, and add in regional mapping

In [4]:
# Define standard column mappings for consistency
COLUMN_MAPPING = {
    'Country name': 'Country',
    'Country': 'Country',
    'Ladder score': 'Ladder Score',
    'Happiness score': 'Ladder Score',  # 2022 uses this
    'Logged GDP per capita': 'GDP per Capita',
    'Explained by: Log GDP per capita': 'GDP per Capita',  # 2024
    'Explained by: GDP per capita': 'GDP per Capita',  # 2022
    'Social support': 'Social Support',
    'Explained by: Social support': 'Social Support',  # 2022, 2024
    'Healthy life expectancy': 'Healthy Life Expectancy',
    'Explained by: Healthy life expectancy': 'Healthy Life Expectancy',  # 2022, 2024
    'Freedom to make life choices': 'Freedom to Make Life Choices',
    'Explained by: Freedom to make life choices': 'Freedom to Make Life Choices',  # 2022, 2024
    'Generosity': 'Generosity',
    'Explained by: Generosity': 'Generosity',  # 2022, 2024
    'Perceptions of corruption': 'Perceptions of Corruption',
    'Explained by: Perceptions of corruption': 'Perceptions of Corruption',  # 2022, 2024
    'Year': 'Year'
}

# Define the essential columns to retain
REQUIRED_COLUMNS = ['Year', 'Country', 'Ladder Score', 'GDP per Capita', 
                    'Social Support', 'Healthy Life Expectancy', 'Freedom to Make Life Choices', 
                    'Generosity', 'Perceptions of Corruption']

# Mapping of dataset filenames to corresponding years
files_with_years = {
    2020: "srs_file_2020.csv",
    2021: "srs_file_2021.csv",
    2022: "srs_file_2022.csv",
    2023: "srs_file_2023.csv",
    2024: "srs_file_2024.csv"
}

# Function to clean all string columns by removing leading/trailing spaces and trailing '*'
def clean_dataframe(df):
    str_cols = df.select_dtypes(include=["object"]).columns  # Select only string columns
    df[str_cols] = df[str_cols].map(lambda x: x.strip().rstrip('*') if isinstance(x, str) else x)
    return df

# Step 1: Process datasets
# region_mapping = {}
for year, file_name in files_with_years.items():
    if not os.path.exists(file_name):
        print(f"File {file_name} not found, skipping...")
        continue

    df = pd.read_csv(file_name).rename(columns=COLUMN_MAPPING)
    df = df.loc[:, ~df.columns.duplicated()]
    df = clean_dataframe(df)
    df = df[[col for col in REQUIRED_COLUMNS if col in df.columns]]
    df = df.dropna(subset=REQUIRED_COLUMNS)  # Drop rows with missing required data

    cleaned_file = f"stg_file_{year}.csv"
    df.to_csv(cleaned_file, index=False)
    print(f"Cleaned file saved as {cleaned_file}")

Cleaned file saved as stg_file_2020.csv
Cleaned file saved as stg_file_2021.csv
Cleaned file saved as stg_file_2022.csv
Cleaned file saved as stg_file_2023.csv
Cleaned file saved as stg_file_2024.csv


In [5]:
# List of staging files to check
staging_files = [
    "stg_file_2020.csv",
    "stg_file_2021.csv",
    "stg_file_2022.csv",
    "stg_file_2023.csv",
    "stg_file_2024.csv"
]

# Function to check for 'unknown' values in any column
def find_unknown_values(file_path):
    df = pd.read_csv(file_path)
    unknown_rows = df[df.apply(lambda row: row.astype(str).str.contains("unknown", case=False, na=False).any(), axis=1)]
    return unknown_rows

# Iterate over each staging file and check for 'unknown' values
for file in staging_files:
    if os.path.exists(file):
        unknown_rows = find_unknown_values(file)
        if not unknown_rows.empty:
            print(f"\n⚠️ Rows with 'unknown' values found in {file}:")
            print(unknown_rows.iloc[:, :4])
        else:
            print(f"✅ No 'unknown' values found in {file}.")
    else:
        print(f"❌ File {file} not found, skipping...")

print("\n🔍 Check for 'unknown' values completed!")

✅ No 'unknown' values found in stg_file_2020.csv.
✅ No 'unknown' values found in stg_file_2021.csv.
✅ No 'unknown' values found in stg_file_2022.csv.
✅ No 'unknown' values found in stg_file_2023.csv.
✅ No 'unknown' values found in stg_file_2024.csv.

🔍 Check for 'unknown' values completed!


# Integrate: Combine the 5 datasets, add region & continent mapping, and reoder

In [6]:
# Define file paths
int_dataset_name = "int_happiness_dataset.csv"
continent_region_mapping_file = "country_to_continent.csv"

# List of cleaned dataset files
merged_files = [f"stg_file_{year}.csv" for year in range(2020, 2025)]

# Load and combine all datasets
dataframes = [pd.read_csv(file) for file in merged_files if os.path.exists(file)]
if not dataframes:
    raise ValueError("No valid dataset files found. Please check file paths.")
merged_df = pd.concat(dataframes, ignore_index=True)

# Load country-to-region mapping dataset
if not os.path.exists(continent_region_mapping_file):
    raise FileNotFoundError(f"Mapping file '{continent_region_mapping_file}' not found.")
continent_region_mapping_df = pd.read_csv(continent_region_mapping_file)

# Create mapping dictionaries
country_to_region = dict(zip(continent_region_mapping_df["name"], continent_region_mapping_df["sub-region"]))
region_to_continent = dict(zip(continent_region_mapping_df["sub-region"], continent_region_mapping_df["region"]))

# Ensure "Region" and "Continent" columns exist before transformation
merged_df["Region"] = merged_df.get("Region", "Unknown")
merged_df["Continent"] = merged_df.get("Continent", "Unknown")

# Rename and standardise countries
merged_df["Country"] = merged_df["Country"].replace({
    "Taiwan Province of China": "Taiwan",
    "Eswatini, Kingdom of": "Eswatini",
    "Turkiye": "Turkey",
    "Congo": "Congo (Brazzaville)"
})

# Fill missing "Region" values using country mapping
merged_df["Region"] = merged_df["Country"].map(country_to_region).fillna(merged_df["Region"])

# Fill missing "Continent" values using region mapping
merged_df["Continent"] = merged_df["Region"].map(region_to_continent).fillna(merged_df["Continent"])

# Manually define corrections for missing Region & Continent
manual_updates = {
    "Bosnia and Herzegovina": {"Region": "Southern Europe", "Continent": "Europe"},
    "Congo (Brazzaville)": {"Region": "Sub-Saharan Africa", "Continent": "Africa"},
    "Congo (Kinshasa)": {"Region": "Sub-Saharan Africa", "Continent": "Africa"},
    "Hong Kong S.A.R. of China": {"Region": "Eastern Asia", "Continent": "Asia"},
    "Ivory Coast": {"Region": "Sub-Saharan Africa", "Continent": "Africa"},
    "Kosovo": {"Region": "Southern Europe", "Continent": "Europe"},
    "North Cyprus": {"Region": "Western Asia", "Continent": "Asia"},
    "North Macedonia": {"Region": "Southern Europe", "Continent": "Europe"},
    "Palestinian Territories": {"Region": "Western Asia", "Continent": "Asia"},
    "Swaziland": {"Region": "Sub-Saharan Africa", "Continent": "Africa"},
    "Czechia": {"Region": "Central Europe", "Continent": "Europe"},
}

# Apply manual corrections
for country, values in manual_updates.items():
    merged_df.loc[merged_df["Country"] == country, ["Region", "Continent"]] = values["Region"], values["Continent"]

# Sort and arrange columns for better readability
merged_df = merged_df.sort_values(by=["Year", "Continent", "Country"])[[
    "Year", "Country", "Region", "Continent", "Ladder Score", "GDP per Capita", 
    "Social Support", "Healthy Life Expectancy", "Freedom to Make Life Choices", 
    "Generosity", "Perceptions of Corruption"
]]

# Save the final dataset
merged_df.to_csv(int_dataset_name, index=False)
print(f"✅ Combined dataset saved as {int_dataset_name}")

✅ Combined dataset saved as int_happiness_dataset.csv


In [7]:
df = pd.read_csv("int_happiness_dataset.csv")
# Assuming df is your DataFrame
unique_combinations = df.groupby(["Continent", "Region"]).size().reset_index()[["Continent", "Region"]]

print(unique_combinations)

   Continent                           Region
0     Africa                  Northern Africa
1     Africa               Sub-Saharan Africa
2   Americas  Latin America and the Caribbean
3   Americas                 Northern America
4       Asia                     Central Asia
5       Asia                     Eastern Asia
6       Asia               South-eastern Asia
7       Asia                    Southern Asia
8       Asia                     Western Asia
9     Europe                   Central Europe
10    Europe                   Eastern Europe
11    Europe                  Northern Europe
12    Europe                  Southern Europe
13    Europe                   Western Europe
14   Oceania        Australia and New Zealand


In [8]:
def find_unknown_values(file_path):
    """
    Checks for 'unknown' values (case-insensitive) in any column of a CSV file.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing rows with 'unknown' values, or an empty DataFrame if none are found.
    """
    try:
        df = pd.read_csv(file_path)
        unknown_rows = df[df.apply(lambda row: row.astype(str).str.contains("unknown", case=False, na=False).any(), axis=1)]
        return unknown_rows
    except FileNotFoundError:
        return None # Return None when file not found.
    except Exception as e:
        print(f"An error occurred: {e}")
        return None # Return None in case of other errors.

# File path
file = "int_happiness_dataset.csv"

# Check for 'unknown' values
unknown_rows = find_unknown_values(file)

if unknown_rows is None:
    if not os.path.exists(file):
        print(f"❌ File {file} not found, skipping...")
    else:
        print(f"❌ An error occurred while processing {file}")
elif unknown_rows.empty:
    print(f"✅ No 'unknown' values found in {file}.")
else:
    print(f"\n⚠️ Rows with 'unknown' values found in {file}:")
    print(unknown_rows.iloc[:, :4])

print("\n🔍 Check for 'unknown' values completed!")

✅ No 'unknown' values found in int_happiness_dataset.csv.

🔍 Check for 'unknown' values completed!


In [9]:
# Find unique values in a specific column (e.g., "Region")
unique_values = merged_df["Region"].unique()

# Print unique values
print("Unique values in 'Region' column:", unique_values)


Unique values in 'Region' column: ['Northern Africa' 'Sub-Saharan Africa' 'Latin America and the Caribbean'
 'Northern America' 'Southern Asia' 'Western Asia' 'South-eastern Asia'
 'Eastern Asia' 'Central Asia' 'Southern Europe' 'Western Europe'
 'Eastern Europe' 'Northern Europe' 'Australia and New Zealand'
 'Central Europe']


# Reporting: Curate the dataset to include KPIs such as average regional ladder score.

In [10]:
# Define file paths
int_dataset_path = "int_happiness_dataset.csv"  # Input dataset
reporting_dataset_path = "reporting_happiness_dataset.csv"  # Output dataset

# Load the merged dataset
if not os.path.exists(int_dataset_path):
    raise FileNotFoundError(f"File '{int_dataset_path}' not found. Please check the file path.")

merged_df = pd.read_csv(int_dataset_path)

# Convert relevant columns to numeric (handling errors by coercing non-numeric values to NaN)
merged_df['Ladder Score'] = pd.to_numeric(merged_df['Ladder Score'], errors='coerce')
merged_df['GDP per Capita'] = pd.to_numeric(merged_df['GDP per Capita'], errors='coerce')
merged_df['Social Support'] = pd.to_numeric(merged_df['Social Support'], errors='coerce')
merged_df['Healthy Life Expectancy'] = pd.to_numeric(merged_df['Healthy Life Expectancy'], errors='coerce')
merged_df['Freedom to Make Life Choices'] = pd.to_numeric(merged_df['Freedom to Make Life Choices'], errors='coerce')
merged_df['Generosity'] = pd.to_numeric(merged_df['Generosity'], errors='coerce')
merged_df['Perceptions of Corruption'] = pd.to_numeric(merged_df['Perceptions of Corruption'], errors='coerce')

# Calculate average regional Ladder Score and GDP per Capita
regional_avg = merged_df.groupby(['Year', 'Region'])[['Ladder Score', 'GDP per Capita', 'Social Support', 
                                                      'Healthy Life Expectancy', 'Freedom to Make Life Choices', 
                                                      'Generosity', 'Perceptions of Corruption']].mean().reset_index()

# Rename columns to match requirements
regional_avg.rename(columns={
    'Ladder Score': 'Average Regional Ladder Score',
    'GDP per Capita': 'Average Regional GDP per Capita',
    'Social Support': 'Average Regional Social Support',
    'Healthy Life Expectancy': 'Average Regional Healthy Life Expectancy', 
    'Freedom to Make Life Choices': 'Average Regional Freedom to Make Life Choices', 
    'Generosity': 'Average Regional Generosity', 
    'Perceptions of Corruption': 'Average Regional Perceptions of Corruption'
}, inplace=True)

# Merge the calculated averages back into the original dataset
merged_df = merged_df.merge(regional_avg, on=['Year', 'Region'], how='left')

# Ensure correct column order
merged_df = merged_df[['Year', 'Country', 'Region', 'Continent', 
                       'Ladder Score', 'Average Regional Ladder Score',
                       'GDP per Capita', 'Average Regional GDP per Capita',
                       'Social Support', 'Average Regional Social Support',
                       'Healthy Life Expectancy', 'Average Regional Healthy Life Expectancy',
                       'Freedom to Make Life Choices', 'Average Regional Freedom to Make Life Choices', 
                       'Generosity', 'Average Regional Generosity',
                       'Perceptions of Corruption', 'Average Regional Perceptions of Corruption']]

# Save the updated dataset
merged_df.to_csv(reporting_dataset_path, index=False)

print(f"Updated dataset saved as '{reporting_dataset_path}'")


Updated dataset saved as 'reporting_happiness_dataset.csv'


In [11]:
def find_unknown_values(file_path):
    """
    Checks for 'unknown' values (case-insensitive) in any column of a CSV file.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pandas.DataFrame: A DataFrame containing rows with 'unknown' values, or an empty DataFrame if none are found.
    """
    try:
        df = pd.read_csv(file_path)
        unknown_rows = df[df.apply(lambda row: row.astype(str).str.contains("unknown", case=False, na=False).any(), axis=1)]
        return unknown_rows
    except FileNotFoundError:
        return None # Return None when file not found.
    except Exception as e:
        print(f"An error occurred: {e}")
        return None # Return None in case of other errors.

# File path
file = "reporting_happiness_dataset.csv"

# Check for 'unknown' values
unknown_rows = find_unknown_values(file)

if unknown_rows is None:
    if not os.path.exists(file):
        print(f"❌ File {file} not found, skipping...")
    else:
        print(f"❌ An error occurred while processing {file}")
elif unknown_rows.empty:
    print(f"✅ No 'unknown' values found in {file}.")
else:
    print(f"\n⚠️ Rows with 'unknown' values found in {file}:")
    print(unknown_rows.iloc[:, :4])

print("\n🔍 Check for 'unknown' values completed!")

✅ No 'unknown' values found in reporting_happiness_dataset.csv.

🔍 Check for 'unknown' values completed!
