In [33]:
import pandas as pd
import difflib
import json
import warnings
warnings.filterwarnings("ignore")

### Investigação de países com nomes diferentes ou que não existem, entre 2019 e os dados do Flourish

In [178]:
# Load the 2019 happiness table and the Flourish regions table.
df_happiness = pd.read_csv("./raw_data/world_happiness_2019.csv")
df_regions_flourish = pd.read_csv("./raw_data/regions_flourish.csv")

list_countries_happiness = df_happiness["Country name"].values
list_countries_flourish = df_regions_flourish["Country name"].values

# Country names present in the happiness data but absent from the Flourish data.
list_in_happiness = list(set(list_countries_happiness) - set(list_countries_flourish))

# For each unmatched name, store either a single Flourish name (when one name
# contains the other, case-insensitively) or the ten closest candidates for
# manual inspection.
dict_country_name_errors_2019 = {}
for country in list_in_happiness:
    # cutoff=0 disables the similarity threshold, so up to 20 candidates are
    # always returned, ordered from most to least similar.
    ranked_candidates = difflib.get_close_matches(
        country, list_countries_flourish, n=20, cutoff=0
    )

    country_lowered = country.lower()
    for candidate in ranked_candidates:
        candidate_lowered = candidate.lower()
        if candidate_lowered in country_lowered or country_lowered in candidate_lowered:
            dict_country_name_errors_2019[country] = candidate
            break
    else:
        # No candidate contains (or is contained in) the name: keep a shortlist.
        dict_country_name_errors_2019[country] = ranked_candidates[:10]

for name, suggestion in dict_country_name_errors_2019.items():
    print(f"{name} = {suggestion}")

Swaziland = ['Thailand', 'Switzerland', 'Finland', 'New Zealand', 'Spain', 'Midway Island', 'Jarvis Island', 'Rwanda', 'Poland', 'Brazil']
DR Congo = ['Togo', 'Mongolia', 'Republic of the Congo', 'Hong Kong', 'Tonga', 'Montenegro', 'Monaco', 'Angola', 'Romania', 'Democratic Republic of the Congo']
North Cyprus = Cyprus
Czechia = ['China', 'Czech Republic', 'Chad', 'Chile', 'Uzbekistan', 'Serbia', 'French Polynesia', 'Nigeria', 'Liberia', 'Georgia']
Congo = Republic of the Congo
State of Palestine = Palestine
Viet Nam = ['Vietnam', 'The Gambia', 'Guinea', 'The Bahamas', 'Saint Lucia', 'Sint Maarten', 'Saint Martin', 'Saint Helena', 'Namibia', 'Liberia']
Russian Federation = Russia
Taiwan Province of China = China
Lao PDR = ['Laos', 'Lebanon', 'Costa Rica', 'Samoa', 'Puerto Rico', 'Libya', 'Gabon', 'Kingman Reef', 'Latvia', 'Faroe Islands']
Gambia = The Gambia
Türkiye = ['Turkey', 'Ukraine', 'Tokelau', 'Eritrea', 'Turkmenistan', 'Suriname', 'Niue', 'Tajikistan', 'Niger', 'Libya']
Republi

In [179]:
# Manual mapping from a country name as spelled in the happiness CSVs (key) to
# the spelling it should be replaced with (value). It is applied to the
# "Country name" column with Series.replace, and the result is then compared
# against the Flourish country list.
dict_sub_country_from_happiness_2019 = {
    'Swaziland': 'Eswatini',
    'DR Congo': 'Democratic Republic of the Congo',
    'Czechia': 'Czech Republic',
    'Congo': 'Republic of the Congo',
    'State of Palestine': 'Palestine',
    'Viet Nam': 'Vietnam',
    'Russian Federation': 'Russia',
    'Lao PDR': 'Laos',
    'Gambia': 'The Gambia',
    'Türkiye': 'Turkey',
    'Republic of Moldova': 'Moldova',
    'Republic of Korea': 'South Korea',
    'United States': 'United States of America',
    # The key below contains the typographic apostrophe (U+2019), not ASCII "'".
    'Côte d’Ivoire': 'Ivory Coast',
    'Macedonia': 'North Macedonia',
    'Hong Kong SAR of China': 'Hong Kong'
}

In [181]:
# Apply the manual name mapping to a copy of the 2019 data.
# The column is reassigned explicitly: calling `.replace(..., inplace=True)` on
# `df["Country name"]` is a chained in-place update, which does not write back
# to the DataFrame when pandas Copy-on-Write is active.
df_happiness_cp = df_happiness.copy()
df_happiness_cp["Country name"] = df_happiness_cp["Country name"].replace(
    dict_sub_country_from_happiness_2019
)

# Names that still have no counterpart in the Flourish list after the mapping
# are dropped from the 2019 data.
list_new_countries_happiness = df_happiness_cp["Country name"].values
list_to_delete_countries_2019 = list(set(list_new_countries_happiness) - set(list_countries_flourish))
print(f"Countries to delete from 2019 data: {list_to_delete_countries_2019}")

df_happiness_cp_cleaned = df_happiness_cp[~df_happiness_cp["Country name"].isin(list_to_delete_countries_2019)]
print(f"Final total number of countries in 2019: {len(df_happiness_cp_cleaned)}")

Countries to delete from 2019 data: ['Taiwan Province of China', 'North Cyprus']
Final total number of countries in 2019: 151


In [182]:
# Persist the cleaned 2019 country names, one per line, as the reference list
# used later to validate the other years.
with open("./raw_data/common_countries_2019_flourish.txt", "w") as f:
    f.writelines(
        country + "\n" for country in df_happiness_cp_cleaned["Country name"].values
    )

### Processamento dos conjuntos de acordo com a relação encontrada entre 2019 e dados do Flourish

In [183]:
def normalize_countries(df_happiness: pd.DataFrame) -> pd.DataFrame:
    """Drop unwanted countries and harmonise the remaining country names.

    Rows whose "Country name" is listed in the module-level
    ``list_to_delete_countries_2019`` are removed, then the names are mapped
    through the module-level ``dict_sub_country_from_happiness_2019``.

    Args:
        df_happiness: Frame with a "Country name" column. It is not modified.

    Returns:
        A new DataFrame (original index labels preserved) with the rows
        filtered and the country names replaced.

    Raises:
        Exception: Any error raised while processing is printed and then
            re-raised, so the caller never receives ``None`` in place of a
            DataFrame.
    """
    try:
        keep_rows = ~df_happiness["Country name"].isin(list_to_delete_countries_2019)
        # `.copy()` makes the filtered result an independent frame, so the
        # assignment below neither touches the caller's data nor triggers
        # SettingWithCopyWarning.
        df_happiness_cp_cleaned = df_happiness.loc[keep_rows].copy()
        # Explicit assignment instead of `df[col].replace(..., inplace=True)`:
        # the chained in-place form does not update the frame under pandas
        # Copy-on-Write.
        df_happiness_cp_cleaned["Country name"] = df_happiness_cp_cleaned["Country name"].replace(
            dict_sub_country_from_happiness_2019
        )

        return df_happiness_cp_cleaned
    except Exception as err:
        print(f"Erro: {err}")
        # Re-raise: silently returning None only moves the failure downstream.
        raise

def check_result(df_happiness_cp_cleaned: pd.DataFrame):
    """Print how the frame's countries compare with the saved 2019 reference list.

    Reads ./raw_data/common_countries_2019_flourish.txt (one name per line) and
    prints the number of rows whose "Country name" is in that list, the number
    that are not, and, when there are any, the names that were not found.

    Args:
        df_happiness_cp_cleaned: Frame with a "Country name" column.
    """
    with open("./raw_data/common_countries_2019_flourish.txt", 'r') as reference_file:
        reference_countries = [raw_line.strip() for raw_line in reference_file]

    country_names = df_happiness_cp_cleaned["Country name"]
    is_in_reference = country_names.isin(reference_countries)
    names_not_found = country_names[~is_in_reference].tolist()

    print(f"# common countries: {int(is_in_reference.sum())}")
    print(f"# not found contries: {len(names_not_found)}")

    if names_not_found:
        print(f"-> {names_not_found}")

def preprocess_dataframe(year: int) -> pd.DataFrame:
    """Load one year's raw happiness CSV and normalise its country names.

    Args:
        year: Year used to build the path ./raw_data/world_happiness_{year}.csv.

    Returns:
        Whatever ``normalize_countries`` returns for the loaded frame.
    """
    raw_year_frame = pd.read_csv(f"./raw_data/world_happiness_{year}.csv")
    return normalize_countries(df_happiness=raw_year_frame)

In [184]:
list_years = [2019, 2020, 2021, 2022, 2023, 2024]

# Normalise each year, report how it compares with the 2019 reference list,
# and write the result to ./norm_data.
for year in list_years:
    print(f"Processing data from {year}")
    df_happiness_cp_cleaned = preprocess_dataframe(year=year)
    check_result(df_happiness_cp_cleaned=df_happiness_cp_cleaned)

    print("Saving dataframe\n")
    normalized_csv_path = f'./norm_data/world_happiness_{year}.csv'
    df_happiness_cp_cleaned.to_csv(normalized_csv_path, index=False)

Processing data from 2019
# common countries: 151
# not found contries: 0
Saving dataframe

Processing data from 2020
# common countries: 147
# not found contries: 0
Saving dataframe

Processing data from 2021
# common countries: 144
# not found contries: 0
Saving dataframe

Processing data from 2022
# common countries: 136
# not found contries: 0
Saving dataframe

Processing data from 2023
# common countries: 142
# not found contries: 0
Saving dataframe

Processing data from 2024
# common countries: 143
# not found contries: 3
-> ['Belize', 'Oman', 'Somalia']
Saving dataframe



In [185]:
# Remove the countries reported as "not found" for 2024 and save the result
# as a separate "_v2" file.
countries_not_found_2024 = ['Belize', 'Oman', 'Somalia']

df_happiness_2024 = pd.read_csv("./norm_data/world_happiness_2024.csv")
removal_not_found = ~df_happiness_2024["Country name"].isin(countries_not_found_2024)
df_happiness_2024 = df_happiness_2024.loc[removal_not_found]
df_happiness_2024.to_csv('./norm_data/world_happiness_2024_v2.csv', index=False)
df_happiness_2024.shape[0]

143

In [186]:
# 2022 has the fewest countries in common (136), so its country list is used
# as the reference for filtering the other years.
df_happiness_2022 = pd.read_csv("./norm_data/world_happiness_2022.csv")
list_countries_2022 = df_happiness_2022["Country name"].values
print(list_countries_2022.size)

136


In [187]:
# Keep, for every year, only the countries present in the 2022 list and write
# the result to ./final_data.
# 2022 itself is included in the loop (the filter leaves it unchanged) so that
# ./final_data/world_happiness_2022.csv, which is read further down, is
# actually produced by this notebook on a fresh run.
for year in [2019, 2020, 2021, 2022, 2023, 2024]:
    # For 2024 the input is the "_v2" file written after removing the
    # countries that were not found.
    file_suffix = "_v2" if year == 2024 else ""
    df_happiness_year = pd.read_csv(f"./norm_data/world_happiness_{year}{file_suffix}.csv")

    filtered_countries = df_happiness_year["Country name"].isin(list_countries_2022)
    df_happiness_year = df_happiness_year[filtered_countries]
    df_happiness_year.to_csv(f'./final_data/world_happiness_{year}.csv', index=False)

In [188]:
# Check the countries in common: reload every year's final file.
final_frames_by_year = {
    final_year: pd.read_csv(f"./final_data/world_happiness_{final_year}.csv")
    for final_year in range(2019, 2025)
}

df_happiness_2019 = final_frames_by_year[2019]
df_happiness_2020 = final_frames_by_year[2020]
df_happiness_2021 = final_frames_by_year[2021]
df_happiness_2022 = final_frames_by_year[2022]
df_happiness_2023 = final_frames_by_year[2023]
df_happiness_2024 = final_frames_by_year[2024]

In [189]:
# Distinct country names of each year's final frame.
set_2019, set_2020, set_2021, set_2022, set_2023, set_2024 = [
    set(year_frame["Country name"])
    for year_frame in (
        df_happiness_2019,
        df_happiness_2020,
        df_happiness_2021,
        df_happiness_2022,
        df_happiness_2023,
        df_happiness_2024,
    )
]

In [190]:
def remove_dr_congo(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the "Democratic Republic of the Congo" rows.

    Args:
        df: Frame with a "Country name" column. It is left untouched.

    Returns:
        A new DataFrame holding every other row, with the original index labels.
    """
    is_dr_congo = df["Country name"].eq("Democratic Republic of the Congo")
    return df.copy()[~is_dr_congo]

# Drop the DR Congo rows from every year's frame.
(
    df_happiness_2019,
    df_happiness_2020,
    df_happiness_2021,
    df_happiness_2022,
    df_happiness_2023,
    df_happiness_2024,
) = [
    remove_dr_congo(year_frame)
    for year_frame in (
        df_happiness_2019,
        df_happiness_2020,
        df_happiness_2021,
        df_happiness_2022,
        df_happiness_2023,
        df_happiness_2024,
    )
]

In [165]:
# Display the 2020 country names in alphabetical order.
df_happiness_2020["Country name"].sort_values().values

array(['Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada',
       'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica',
       'Croatia', 'Cyprus', 'Cyprus', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia',
       'Ethiopia', 'Finland', 'France', 'Gabon', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Honduras', 'Hong Kong',
       'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq',
       'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan',
       'Jordan', 'Kazakhstan', 'Kenya', 'Kosovo', 'Kyrgyzstan', 'Laos',
       'Latvia', 'Lebanon', 'Liberia', 'Lithuania', 'Luxembourg',
       'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Malta', 'Mauritania',
       'Mauri

In [200]:
# Build one table with the "Explained by: Log GDP per capita" value of every
# year, one column per year, labelled with the 2019 country names.
# NOTE(review): the values are combined by row position, not by country name,
# so this presumably relies on every yearly frame having the same countries in
# the same order - confirm.
gdp_frames_by_year = {
    "2019": df_happiness_2019,
    "2020": df_happiness_2020,
    "2021": df_happiness_2021,
    "2022": df_happiness_2022,
    "2023": df_happiness_2023,
    "2024": df_happiness_2024,
}

df_cloropletic_map = pd.DataFrame()
df_cloropletic_map["Country name"] = df_happiness_2019["Country name"].to_numpy()
for year_label, year_frame in gdp_frames_by_year.items():
    df_cloropletic_map[year_label] = year_frame["Explained by: Log GDP per capita"].to_numpy()
df_cloropletic_map

Unnamed: 0,Country name,2019,2020,2021,2022,2023,2024
0,Afghanistan,0301,0370,0758,0645,0628,1749
1,Albania,0907,1008,1439,1449,1438,1825
2,Algeria,0944,0946,1363,1353,1324,1799
3,Argentina,1028,1162,1592,1590,1562,1783
4,Armenia,0808,0996,1434,1466,1444,1822
...,...,...,...,...,...,...,...
130,Uzbekistan,0697,0769,1219,1227,1212,0827
131,Venezuela,0770,0852,0000,0000,0000,0588
132,Vietnam,0718,0817,1252,1349,1331,1223
133,Zambia,0537,0528,0930,0914,0899,0786


In [201]:
# Convert the yearly columns (every column after "Country name") to floats.
# The dtype guard makes the cell safe to re-run: `.str` raises AttributeError
# on a column that is already numeric.
for col in df_cloropletic_map.columns[1:]:
    year_values = df_cloropletic_map[col]
    if not pd.api.types.is_numeric_dtype(year_values):
        # The values use a decimal comma; swap it for a decimal point.
        year_values = year_values.str.replace(',', '.', regex=False)
    # Missing values are written as 0.0 in the output table.
    df_cloropletic_map[col] = year_values.astype(float).fillna(0.0)
df_cloropletic_map

Unnamed: 0,Country name,2019,2020,2021,2022,2023,2024
0,Afghanistan,0.301,0.370,0.758,0.645,0.628,1.749
1,Albania,0.907,1.008,1.439,1.449,1.438,1.825
2,Algeria,0.944,0.946,1.363,1.353,1.324,1.799
3,Argentina,1.028,1.162,1.592,1.590,1.562,1.783
4,Armenia,0.808,0.996,1.434,1.466,1.444,1.822
...,...,...,...,...,...,...,...
130,Uzbekistan,0.697,0.769,1.219,1.227,1.212,0.827
131,Venezuela,0.770,0.852,0.000,0.000,0.000,0.588
132,Vietnam,0.718,0.817,1.252,1.349,1.331,1.223
133,Zambia,0.537,0.528,0.930,0.914,0.899,0.786


In [202]:
# Write the per-year GDP table to disk without the index column.
gdp_map_csv_path = "./final_data/world_happiness_gdp.csv"
df_cloropletic_map.to_csv(gdp_map_csv_path, index=False)