In [1]:
import pandas as pd

# LINKS

World Happines Report: https://worldhappiness.report/data/

OECD data explorer: https://data-explorer.oecd.org/

# 1) Data ingestion

## 1.1) GDP data

### 1.1.1) (OECD) GDP 1970-2023 deflated

In [5]:
gdp_1970_deflated_df = pd.read_csv("oecd_gdp_1970_2023_deflated.csv")


## 1.2) Social Policy Expenditure Data

### 1.2.1) Total social policy expenditure as percentage of GDP

In [75]:
expend_on_social_policy_df = pd.read_csv("expenditure_on_social_policy.csv")

# 1.3) Happiness

## 1.3.1) Wellbeing (hand picked features) 2004-2024

In [79]:
current_wellbeing_df = pd.read_csv("current_wellbeing.csv")

## 1.3.2) World Happiness Report 2012

In [148]:
world_happiness_2023 = pd.read_excel("data_happiness_2023.xls")

In [154]:
world_happiness_2023.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


# 2) Data exploration and feature engineering 

## 2.1) GDP data

### 2.1.1) (OECD) GDP 1970-2023 deflated

#### 2.1.1.1) Removing columns to get an overview of the data

In [10]:
gdp_1970_defl_simple = gdp_1970_deflated_df.drop(columns=['STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'COUNTERPART_SECTOR', 'Counterpart institutional sector', 'CURRENCY', 'Currency', 'Decimals', 'Confidentiality status', 'CONF_STATUS', 'INSTR_ASSET', 'Financial instruments and non-financial assets', 'TABLE_IDENTIFIER', 'Table identifier', 'Time period', 'REF_YEAR_PRICE', 'Observation value', 'Price reference year', 'Economic activity', 'ACTIVITY', 'PRICE_BASE', 'TRANSFORMATION', 'Transformation', 'TRANSACTION', 'SECTOR', 'REF_AREA', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'EXPENDITURE', 'Expenditure', 'UNIT_MEASURE'])

In [11]:
gdp_1970_defl_simple.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
1,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2017,559252.338,A,Normal value,Millions
2,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,551733.319,A,Normal value,Millions
3,Colombia,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2018,662368.976,A,Normal value,Millions
4,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,270463.206,E,Estimated value,Millions


In [12]:
gdp_1970_defl_simple.shape

(2624, 10)

In [13]:
gdp_1970_defl_simple.iloc[2435:2445]

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2435,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1975,1207254.975,E,Estimated value,Millions
2436,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,1233027.254,E,Estimated value,Millions
2437,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,1168744.649,E,Estimated value,Millions
2438,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,1091001.392,E,Estimated value,Millions
2439,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,1052171.165,E,Estimated value,Millions
2440,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1033383.143,E,Estimated value,Millions
2441,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2421454.72,A,Normal value,Millions
2442,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2022,2399363.963,A,Normal value,Millions
2443,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2021,2307393.619,A,Normal value,Millions
2444,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,2130355.931,A,Normal value,Millions


In [14]:
gdp_1970_defl_simple.isna().sum()

Reference area          0
Institutional sector    0
Transaction             0
Unit of measure         0
Price base              0
TIME_PERIOD             0
OBS_VALUE               0
OBS_STATUS              0
Observation status      0
Unit multiplier         0
dtype: int64

In [15]:
gdp_1970_defl_simple.sort_values(by="TIME_PERIOD", inplace=False)

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
121,Portugal,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.024354e+05,E,Estimated value,Millions
786,France,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,9.921342e+05,A,Normal value,Millions
2440,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.033383e+06,E,Estimated value,Millions
1814,United States,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,5.173693e+06,A,Normal value,Millions
1049,Iceland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,3.460969e+03,E,Estimated value,Millions
...,...,...,...,...,...,...,...,...,...,...
2183,Greece,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.193293e+05,P,Provisional value,Millions
765,Norway,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.536325e+05,A,Normal value,Millions
2357,Korea,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2.327219e+06,E,Estimated value,Millions
2279,Slovak Republic,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,1.912215e+05,A,Normal value,Millions


In [26]:
ireland_df = gdp_1970_defl_simple[gdp_1970_defl_simple["Reference area"] == "Ireland"]

In [28]:
ire_df_copy = ireland_df.copy()

In [30]:
ire_df_copy.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
11,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,453894.856,A,Normal value,Millions
2298,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,340720.243,A,Normal value,Millions
2299,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2015,336604.783,A,Normal value,Millions
2300,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2014,270114.548,A,Normal value,Millions


#### 2.1.1.2) Moving all OBS_VALUES (gdp) up a year (I need to move it down a year actually)

In [34]:
ire_df_copy.sort_values(by="TIME_PERIOD", inplace=True, ascending=True)

In [36]:
ire_df_copy['OBS_VALUE'] = ire_df_copy['OBS_VALUE'].shift(+1)

In [38]:
ire_df_copy.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2619,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,,E,Estimated value,Millions
2618,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,40517.027,E,Estimated value,Millions
2617,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,41922.922,E,Estimated value,Millions
2616,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,44643.64,E,Estimated value,Millions
2615,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,46751.457,E,Estimated value,Millions


## 2.2) Social Policy Expenditure 1993-2023

### 2.2.1) Total social policy expenditure as percentage of GDP

In [49]:
expend_on_social_policy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 366 non-null    object 
 1   STRUCTURE_ID              366 non-null    object 
 2   STRUCTURE_NAME            366 non-null    object 
 3   ACTION                    366 non-null    object 
 4   REF_AREA                  366 non-null    object 
 5   Reference area            366 non-null    object 
 6   FREQ                      366 non-null    object 
 7   Frequency of observation  366 non-null    object 
 8   MEASURE                   366 non-null    object 
 9   Measure                   366 non-null    object 
 10  UNIT_MEASURE              366 non-null    object 
 11  Unit of measure           366 non-null    object 
 12  EXPEND_SOURCE             366 non-null    object 
 13  Expenditure source        366 non-null    object 
 14  SPENDING_T

#### 2.2.1.1) Removing columns to get an overview of the data

In [60]:
exp_social_df = expend_on_social_policy_df.drop(columns=['PROGRAMME_TYPE', 'SPENDING_TYPE', 'EXPEND_SOURCE', 'MEASURE', 'Price base', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'CURRENCY', 'Currency', 'Decimals',  'Time period', 'Observation value', 'PRICE_BASE', 'REF_AREA', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [62]:
exp_social_df.head()


Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2017,11.803,A,Normal value,Units
1,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2019,12.463,A,Normal value,Units
2,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,9.333,A,Normal value,Units
3,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,10.164,A,Normal value,Units
4,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,12.074,A,Normal value,Units


In [64]:
exp_social_df_DK = exp_social_df[exp_social_df["Reference area"] == "Denmark"]

In [66]:
exp_social_df_DK.head()

Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
224,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units
228,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2013,26.787,A,Normal value,Units


In [70]:
exp_social_df_DK.sort_values(by = "TIME_PERIOD", ascending=True, inplace=False)

Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
232,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1993,23.61,A,Normal value,Units
233,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1995,22.994,A,Normal value,Units
234,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1997,21.604,A,Normal value,Units
235,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1999,21.638,A,Normal value,Units
236,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2001,20.879,A,Normal value,Units
237,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2003,22.152,A,Normal value,Units
224,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units


## 2.3) Happiness

### 2.3.1) Wellbeing (hand picked features) 2004-2024

In [86]:
current_wellbeing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5466 entries, 0 to 5465
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   STRUCTURE           5466 non-null   object 
 1   STRUCTURE_ID        5466 non-null   object 
 2   STRUCTURE_NAME      5466 non-null   object 
 3   ACTION              5466 non-null   object 
 4   REF_AREA            5466 non-null   object 
 5   Reference area      5466 non-null   object 
 6   MEASURE             5466 non-null   object 
 7   Measure             5466 non-null   object 
 8   UNIT_MEASURE        5466 non-null   object 
 9   Unit of measure     5466 non-null   object 
 10  AGE                 5466 non-null   object 
 11  Age                 5466 non-null   object 
 12  SEX                 5466 non-null   object 
 13  Sex                 5466 non-null   object 
 14  EDUCATION_LEV       5466 non-null   object 
 15  Education level     5466 non-null   object 
 16  DOMAIN

#### 2.3.1.1) Removing columns to get an overview of the data

In [94]:
wellbeing_df = current_wellbeing_df.drop(columns=['EDUCATION_LEV', 'MEASURE', 'DOMAIN', 'Domain', 'Education level', 'Sex', 'SEX', 'AGE', 'Age', 'OBS_STATUS', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'Decimals',  'Time period', 'Observation value', 'REF_AREA', 'STRUCTURE_NAME', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [96]:
wellbeing_df.head()

Unnamed: 0,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
0,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2017,44.2,Normal value,Units
1,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2018,46.9,Normal value,Units
2,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2019,47.1,Normal value,Units
3,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2020,49.7,Normal value,Units
4,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2021,49.8,Normal value,Units


In [102]:
unit_of_measure_values = wellbeing_df["Measure"].unique()

print(unit_of_measure_values)

['Perceived health as positive' 'Satisfaction with personal relationships'
 'Life satisfaction' 'Satisfaction with time use' 'Lack of social support'
 'Not feeling safe at night' 'Feeling safe at night'
 'Perceived health as negative' 'Social support'
 'Self-reported depression' 'Homicides'
 'Deaths from suicide, alcohol, drugs']


In [112]:
percivd_health_pos_df = wellbeing_df[wellbeing_df["Measure"] == "Perceived health as positive"]

In [114]:
percivd_health_pos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590 entries, 0 to 5374
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      590 non-null    object 
 1   Measure             590 non-null    object 
 2   Unit of measure     590 non-null    object 
 3   TIME_PERIOD         590 non-null    int64  
 4   OBS_VALUE           590 non-null    float64
 5   Observation status  590 non-null    object 
 6   Unit multiplier     590 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 36.9+ KB


In [116]:
stsfaction_rltnship_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with personal relationships"]

In [118]:
stsfaction_rltnship_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, 31 to 5383
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      93 non-null     object 
 1   Measure             93 non-null     object 
 2   Unit of measure     93 non-null     object 
 3   TIME_PERIOD         93 non-null     int64  
 4   OBS_VALUE           93 non-null     float64
 5   Observation status  93 non-null     object 
 6   Unit multiplier     93 non-null     object 
dtypes: float64(1), int64(1), object(5)
memory usage: 5.8+ KB


In [120]:
life_stsfaction_df = wellbeing_df[wellbeing_df["Measure"] == "Life satisfaction"]

In [122]:
life_stsfaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 33 to 5380
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      189 non-null    object 
 1   Measure             189 non-null    object 
 2   Unit of measure     189 non-null    object 
 3   TIME_PERIOD         189 non-null    int64  
 4   OBS_VALUE           189 non-null    float64
 5   Observation status  189 non-null    object 
 6   Unit multiplier     189 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 11.8+ KB


In [124]:
stsfaction_w_time_use_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with time use"]

In [126]:
stsfaction_w_time_use_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95 entries, 35 to 5385
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      95 non-null     object 
 1   Measure             95 non-null     object 
 2   Unit of measure     95 non-null     object 
 3   TIME_PERIOD         95 non-null     int64  
 4   OBS_VALUE           95 non-null     float64
 5   Observation status  95 non-null     object 
 6   Unit multiplier     95 non-null     object 
dtypes: float64(1), int64(1), object(5)
memory usage: 5.9+ KB


In [128]:
lck_of_social_supprt_df = wellbeing_df[wellbeing_df["Measure"] == "Lack of Social support"]

In [130]:
lck_of_social_supprt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      0 non-null      object 
 1   Measure             0 non-null      object 
 2   Unit of measure     0 non-null      object 
 3   TIME_PERIOD         0 non-null      int64  
 4   OBS_VALUE           0 non-null      float64
 5   Observation status  0 non-null      object 
 6   Unit multiplier     0 non-null      object 
dtypes: float64(1), int64(1), object(5)
memory usage: 0.0+ bytes


In [132]:
homicides_df = wellbeing_df[wellbeing_df["Measure"] == "Homicides"]

In [134]:
homicides_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 3838 to 5461
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      642 non-null    object 
 1   Measure             642 non-null    object 
 2   Unit of measure     642 non-null    object 
 3   TIME_PERIOD         642 non-null    int64  
 4   OBS_VALUE           642 non-null    float64
 5   Observation status  642 non-null    object 
 6   Unit multiplier     642 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 40.1+ KB


In [108]:
percivd_health_pos.describe()

Unnamed: 0,TIME_PERIOD,OBS_VALUE
count,590.0,590.0
mean,2013.084746,68.044068
std,5.259281,12.790538
min,2004.0,29.5
25%,2009.0,59.8
50%,2013.0,70.0
75%,2018.0,76.6
max,2022.0,91.4


In [110]:
percivd_health_pos.head()

Unnamed: 0,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
0,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2017,44.2,Normal value,Units
1,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2018,46.9,Normal value,Units
2,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2019,47.1,Normal value,Units
3,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2020,49.7,Normal value,Units
4,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2021,49.8,Normal value,Units
