## Table of Contents

  - Importing data
  - Selecting data for analysis
    - Exporting data for external analysis
  - Merging the datasets
  - Exporting data

## Importing data

In [29]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [31]:
# Root folder of the project; replace the placeholder with your local path before running.
# All later reads and writes are joined onto this folder via os.path.join(path, '01 Data', ...).
path  = r'INSERT-YOUR-PATH'

In [51]:
# Import the COVID stringency averages as df_covid.
# The file is semicolon-delimited; index_col=False forces pandas not to use the
# first column as the index.
# NOTE(review): the previewed Covid_Stringency values (e.g. 4194569061) look like
# numbers whose decimal separator was lost. decimal=',' makes pandas parse comma
# decimals (common in ';'-separated exports) as floats and changes nothing if the
# file contains no comma decimals -- confirm against the raw CSV.
df_covid = pd.read_csv(
    os.path.join(path, '01 Data', 'Original Data', 'owid-covid-data_stringency_avg.csv'),
    sep=';',
    decimal=',',
    index_col=False,
)

In [53]:
# Preview the first rows to confirm the stringency data loaded as expected
df_covid.head()

Unnamed: 0,Country,Attribute,Covid_Stringency
0,Afghanistan,2020,4194569061
1,Afghanistan,2021,2924221918
2,Afghanistan,2022,1141564384
3,Albania,2020,558060221
4,Albania,2021,4859090411


In [None]:
# Rename 'Attribute' to 'Year' so the column carries the same name as the
# 'Year' key of the happiness data used in the merge.
# The result is reassigned instead of using inplace=True, so the frame is not
# mutated in place.
df_covid = df_covid.rename(columns={'Attribute': 'Year'})

In [125]:
# Check that the column is now called 'Year'
df_covid.head()

Unnamed: 0,Country,Year,Covid_Stringency
0,Afghanistan,2020,4194569061
1,Afghanistan,2021,2924221918
2,Afghanistan,2022,1141564384
3,Albania,2020,558060221
4,Albania,2021,4859090411


In [71]:
# Row/column count of the stringency data, for comparison after later filtering
df_covid.shape

(540, 3)

In [55]:
# Import the prepared World Happiness data as df_wh.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created or trust.
df_wh = pd.read_pickle(os.path.join(path, '01 Data', 'Prepared Data', 'dataset-2024-12-02.pkl'))

In [59]:
# Preview the first rows of the happiness data
df_wh.head()

Unnamed: 0,Country,Year,Happiness,GDP_log,Social_support,Life_expectancy,Freedom,Generosity,Corruption,Pos_affect,Neg_affect,Democracy,Inflation,_merge
0,Afghanistan,2008,3.724,7.35,0.451,50.5,0.718,0.164,0.882,0.414,0.258,3.02,26.418664,both
1,Afghanistan,2009,4.402,7.509,0.552,50.8,0.679,0.187,0.85,0.481,0.237,,,left_only
2,Afghanistan,2010,4.758,7.614,0.539,51.1,0.6,0.118,0.707,0.517,0.275,2.48,2.178538,both
3,Afghanistan,2011,3.832,7.581,0.521,51.4,0.496,0.16,0.731,0.48,0.267,2.48,11.804186,both
4,Afghanistan,2012,3.783,7.661,0.521,51.7,0.531,0.234,0.776,0.614,0.268,2.48,6.441213,both


## Selecting data for analysis

In [65]:
# Keep only the two key columns and the happiness score; the remaining
# indicators are not needed for the stringency comparison.
df_wh_2 = df_wh.loc[:, ['Country', 'Year', 'Happiness']]

In [63]:
# Preview the reduced happiness frame
df_wh_2.head()

Unnamed: 0,Country,Year,Happiness
0,Afghanistan,2008,3.724
1,Afghanistan,2009,4.402
2,Afghanistan,2010,4.758
3,Afghanistan,2011,3.832
4,Afghanistan,2012,3.783


In [78]:
# Row/column count before restricting to the COVID years
df_wh_2.shape

(2363, 3)

In [88]:
# Limit the happiness data to the COVID years (2020-2022).
# The boolean mask is built from df_wh_2 itself rather than from df_wh, so the
# filter does not depend on the two frames sharing an identical index.
df_wh_3 = df_wh_2[df_wh_2['Year'].isin([2020, 2021, 2022])]

In [84]:
# Preview the COVID-years subset (the original row index is kept)
df_wh_3.head()

Unnamed: 0,Country,Year,Happiness
12,Afghanistan,2021,2.436
13,Afghanistan,2022,1.281
27,Albania,2020,5.365
28,Albania,2021,5.255
29,Albania,2022,5.212


In [86]:
# Row/column count of the COVID-years subset
df_wh_3.shape

(378, 3)

### Exporting data for external analysis

In [90]:
# Export the COVID-years happiness subset as a pickle into the 'Prepared Data' folder
df_wh_3.to_pickle(os.path.join(path, '01 Data','Prepared Data', 'wh_covid_years.pkl'))

In [93]:
# Export the stringency data as a pickle into the 'Prepared Data' folder.
# This cell sits before the country-name standardization further down, so the
# names written here are the ones held by df_covid at this point.
df_covid.to_pickle(os.path.join(path, '01 Data','Prepared Data', 'stringency_covid_years.pkl'))

I will use AI to identify country names that are spelled differently or otherwise mismatched between the `Country` columns of the two datasets.

## Merging the datasets

In [97]:
# Lookup table: Stringency-dataset country name -> Happiness Report country name.
# A value of None marks an entry that should be dropped from the Stringency data.
country_mapping = {
    # Same country/territory, spelled differently in the two datasets
    **{
        'Turkey': 'Turkiye',
        'Hong Kong': 'Hong Kong S.A.R. of China',
        'Taiwan': 'Taiwan Province of China',
        'Palestine': 'State of Palestine',
        'Congo': 'Congo (Brazzaville)',
        'Democratic Republic of Congo': 'Congo (Kinshasa)',
    },
    # Entries of the Stringency dataset to exclude (not in the Happiness Report)
    **dict.fromkeys(
        [
            'Aruba',
            'Bermuda',
            'Brunei',
            'Dominica',
            'East Timor',
            'Faroe Islands',
            'Greenland',
            'Kiribati',
            'Liechtenstein',
            'Macao',
            'Monaco',
            'San Marino',
            'Seychelles',
            'Solomon Islands',
            'Vanuatu',
        ],
        None,
    ),
}

In [99]:
# Helper that translates a single country name via country_mapping
def standardize_country_name(country_name):
    """
    Translate a country name to the spelling used in the Happiness Report.

    Names listed in ``country_mapping`` are replaced by their mapped value;
    any other name is passed through unchanged.

    Args:
        country_name (str): Country name to look up

    Returns:
        str or None: The mapped name, the input itself when no mapping exists,
        or None when the mapping marks the country for exclusion
    """
    if country_name in country_mapping:
        return country_mapping[country_name]
    return country_name

In [103]:
# Row/column count before the excluded countries are removed
df_covid.shape

(540, 3)

In [105]:
# Apply the name standardization to every entry of the 'Country' column;
# countries marked for exclusion become None here and are dropped afterwards.
df_covid['Country'] = df_covid['Country'].map(standardize_country_name)

In [115]:
# Drop the rows whose country name is missing, i.e. those mapped to None
df_covid = df_covid.dropna(subset=['Country'])

In [117]:
# Row/column count after the excluded countries were removed
df_covid.shape

(498, 3)

In [127]:
# Merge on both 'Country' and 'Year'. A left join keeps every happiness row and
# adds the stringency value where a matching country/year exists.
merged_df = df_wh_3.merge(df_covid, how='left', on=['Country', 'Year'])

In [129]:
# Preview the merged frame: happiness score plus stringency per country and year
merged_df.head()

Unnamed: 0,Country,Year,Happiness,Covid_Stringency
0,Afghanistan,2021,2.436,2924221918
1,Afghanistan,2022,1.281,1141564384
2,Albania,2020,5.365,558060221
3,Albania,2021,5.255,4859090411
4,Albania,2022,5.212,2152254795


In [131]:
# The row count should equal that of df_wh_3, since the left join keeps every happiness row
merged_df.shape

(378, 4)

## Exporting data

In [137]:
# Export the merged happiness/stringency data as a pickle into the 'Prepared Data' folder
merged_df.to_pickle(os.path.join(path, '01 Data','Prepared Data', 'covid_happiness_2025_01_09.pkl'))