In [45]:
import pandas as pd
import os

def load_yearly_happiness_data(target_directory):
    """Load every ``.csv`` file found directly in a directory.

    Each file is read with ``pd.read_csv`` and stored under the part of its
    file name that precedes the first dot (``"2015.csv"`` -> ``"2015"``).
    Entries whose name does not end in ``.csv`` are ignored.

    Args:
        target_directory: Path of the directory to scan (not recursive).

    Returns:
        dict mapping each key to its DataFrame, inserted in sorted
        file-name order.
    """
    yearly_df_dict = {}

    # os.listdir() returns names in arbitrary order. Sorting makes the dict
    # order, and anything built by iterating it, the same on every run.
    for file_name in sorted(os.listdir(target_directory)):
        if not file_name.endswith(".csv"):
            continue

        year_key = file_name.split(".")[0]
        full_path = os.path.join(target_directory, file_name)
        yearly_df_dict[year_key] = pd.read_csv(full_path)

    return yearly_df_dict

# Directory holding the per-year CSV files, given as a relative path.
data_dir_path = "../data"
happiness_data_dict = load_yearly_happiness_data(target_directory=data_dir_path)

# Preview the first rows of one year's table as loaded from disk.
sample_df = happiness_data_dict["2015"]
display(sample_df.head(n=5))

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [46]:
def standardize_and_merge_data(happiness_data_dict):
    """Rename each yearly table to a shared schema and stack them together.

    For every ``(year_key, DataFrame)`` pair the known source headers are
    renamed to standard names, only the standard columns that are present
    are kept, and a ``year`` column holding ``int(year_key)`` is added.
    The per-year results are concatenated in dict order with a fresh index.

    Args:
        happiness_data_dict: dict mapping a year string to that year's
            DataFrame. The input frames are not modified.

    Returns:
        A single DataFrame containing the rows of every year.
    """
    # Standard column name -> every source header spelling that maps to it.
    # The key order doubles as the column order used when selecting.
    source_headers_by_standard_name = {
        "country": ("Country", "Country or region", "Country name"),
        "happiness_rank": ("Happiness Rank", "Happiness.Rank", "Overall rank"),
        "happiness_score": (
            "Happiness Score",
            "Happiness.Score",
            "Score",
            "Happiness score",
        ),
        "gdp_per_capita": (
            "Economy (GDP per Capita)",
            "Economy..GDP.per.Capita.",
            "GDP per capita",
            "Economy (GDP per Capita)\t",
        ),
        "social_support": ("Family", "Social support"),
        "healthy_life_expectancy": (
            "Health (Life Expectancy)",
            "Health..Life.Expectancy.",
            "Healthy life expectancy",
        ),
        "freedom": ("Freedom", "Freedom to make life choices"),
        "generosity": ("Generosity",),
        "corruption_perception": (
            "Trust (Government Corruption)",
            "Trust..Government.Corruption.",
            "Perceptions of corruption",
        ),
    }

    # Flatten to the {source header: standard name} form that rename() expects.
    rename_map = {
        source_header: standard_name
        for standard_name, source_headers in source_headers_by_standard_name.items()
        for source_header in source_headers
    }
    standard_names = list(source_headers_by_standard_name)

    frames_with_year = []
    for year_key, raw_frame in happiness_data_dict.items():
        renamed = raw_frame.rename(columns=rename_map)
        # Years that lack a column simply contribute no values for it.
        present = [name for name in standard_names if name in renamed.columns]
        frames_with_year.append(renamed[present].assign(year=int(year_key)))

    return pd.concat(frames_with_year, ignore_index=True)

merged_happiness_df = standardize_and_merge_data(happiness_data_dict)

display(merged_happiness_df.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() renders a stray "None" after the report.
merged_happiness_df.info()

Unnamed: 0,country,happiness_rank,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,generosity,corruption_perception,year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015


<class 'pandas.DataFrame'>
RangeIndex: 1510 entries, 0 to 1509
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  1510 non-null   str    
 1   happiness_rank           1510 non-null   int64  
 2   happiness_score          1510 non-null   float64
 3   gdp_per_capita           1507 non-null   float64
 4   social_support           1507 non-null   float64
 5   healthy_life_expectancy  1506 non-null   float64
 6   freedom                  1507 non-null   float64
 7   generosity               1507 non-null   float64
 8   corruption_perception    1506 non-null   float64
 9   year                     1510 non-null   int64  
dtypes: float64(7), int64(2), str(1)
memory usage: 118.1 KB


None

In [49]:
import pandas as pd

def handle_missing_values(merged_df, numeric_columns=None):
    """Fill missing numeric values with per-country, then global, medians.

    For each column, a missing value is first replaced by the median of that
    column over the rows sharing the same ``country``. Values still missing
    afterwards (for example a country with no observed value at all) are
    replaced by the column-wide median.

    Args:
        merged_df: DataFrame with a ``country`` column. It is not modified;
            the work is done on a copy.
        numeric_columns: Optional iterable of column names to impute.
            Defaults to the six indicator columns listed below. Names that
            are not columns of ``merged_df`` are skipped.

    Returns:
        A new DataFrame with the selected columns imputed.
    """
    default_numeric_columns = (
        "gdp_per_capita",
        "social_support",
        "healthy_life_expectancy",
        "freedom",
        "generosity",
        "corruption_perception",
    )
    if numeric_columns is None:
        numeric_columns = default_numeric_columns

    imputed_df = merged_df.copy()

    for col in numeric_columns:
        # A column may be absent when no source table provided it; skip it
        # rather than fail with a KeyError.
        if col not in imputed_df.columns:
            continue

        # transform() returns a Series aligned with the frame's rows, so it
        # can be handed straight to fillna().
        country_medians = imputed_df.groupby("country")[col].transform("median")
        imputed_df[col] = imputed_df[col].fillna(country_medians)

        # Computed after the per-country fill, so it includes the values
        # imputed in the step above.
        global_median = imputed_df[col].median()
        imputed_df[col] = imputed_df[col].fillna(global_median)

    return imputed_df

# Impute the merged table; the result is bound to a new name.
processed_happiness_df = handle_missing_values(merged_df=merged_happiness_df)

# Per-column count of values that are still missing after imputation.
display(processed_happiness_df.isna().sum())

country                    0
happiness_rank             0
happiness_score            0
gdp_per_capita             0
social_support             0
healthy_life_expectancy    0
freedom                    0
generosity                 0
corruption_perception      0
year                       0
dtype: int64