In [1]:
import pandas as pd

# LINKS

#### World Happines Report:
- https://worldhappiness.report/data/

#### OECD data explorer:
- https://data-explorer.oecd.org/

SOCIAL EXPENDITURE (EU, USA, AUSTRALIA): https://data-explorer.oecd.org/vis?tm=Social%20expenditure&pg=0&snb=139&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_SOCX_AGG%40DF_SOCX_AGG&df[ag]=OECD.ELS.SPD&df[vs]=1.0&dq=AUS%2BAUT%2BBEL%2BCZE%2BDNK%2BEST%2BFIN%2BDEU%2BFRA%2BGRC%2BHUN%2BISL%2BIRL%2BISR%2BITA%2BLVA%2BLTU%2BLUX%2BNLD%2BNZL%2BNOR%2BPOL%2BPRT%2BSVK%2BSVN%2BESP%2BSWE%2BCHE%2BTUR%2BGBR%2BUSA.A..PT_OTE_S13%2BPT_B1GQ.ES10._T._T.&pd=1989%2C2022&to[TIME_PERIOD]=false&ly[cl]=TIME_PERIOD&ly[rw]=REF_AREA%2CCOMBINED_UNIT_MEASURE


#### European Comission: 

- https://taxation-customs.ec.europa.eu/taxation/economic-analyses/taxation-trends-eu/data-taxation-trends_en
- https://webgate.ec.europa.eu/taxation_customs/redisstat/databrowser/explore/all/DATA_ON_TAX?lang=en&display=card&sort=category

# 1) Data ingestion

## 1.1) GDP data

### 1.1.1) (OECD) GDP 1970-2023 deflated

In [7]:
gdp_1970_deflated_df = pd.read_csv("oecd_gdp_1970_2023_deflated.csv")

In [8]:
gdp_1970_deflated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624 entries, 0 to 2623
Data columns (total 44 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   STRUCTURE                                       2624 non-null   object 
 1   STRUCTURE_ID                                    2624 non-null   object 
 2   STRUCTURE_NAME                                  2624 non-null   object 
 3   ACTION                                          2624 non-null   object 
 4   FREQ                                            2624 non-null   object 
 5   Frequency of observation                        2624 non-null   object 
 6   REF_AREA                                        2624 non-null   object 
 7   Reference area                                  2624 non-null   object 
 8   SECTOR                                          2624 non-null   object 
 9   Institutional sector                     

In [9]:
gdp_years = gdp_1970_deflated_df["TIME_PERIOD"].unique()
gdp_years.sort()
gdp_years

array([1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023],
      dtype=int64)

## 1.2) Social Policy Expenditure Data

### 1.2.1) Total social policy expenditure as percentage of GDP

In [12]:
expend_on_social_policy_df = pd.read_csv("expenditure_on_social_policy.csv")

In [13]:
expend_on_social_policy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 366 non-null    object 
 1   STRUCTURE_ID              366 non-null    object 
 2   STRUCTURE_NAME            366 non-null    object 
 3   ACTION                    366 non-null    object 
 4   REF_AREA                  366 non-null    object 
 5   Reference area            366 non-null    object 
 6   FREQ                      366 non-null    object 
 7   Frequency of observation  366 non-null    object 
 8   MEASURE                   366 non-null    object 
 9   Measure                   366 non-null    object 
 10  UNIT_MEASURE              366 non-null    object 
 11  Unit of measure           366 non-null    object 
 12  EXPEND_SOURCE             366 non-null    object 
 13  Expenditure source        366 non-null    object 
 14  SPENDING_T

In [14]:
expend_on_social_policy_df.isnull().sum()

STRUCTURE                     0
STRUCTURE_ID                  0
STRUCTURE_NAME                0
ACTION                        0
REF_AREA                      0
Reference area                0
FREQ                          0
Frequency of observation      0
MEASURE                       0
Measure                       0
UNIT_MEASURE                  0
Unit of measure               0
EXPEND_SOURCE                 0
Expenditure source            0
SPENDING_TYPE                 0
Spending type                 0
PROGRAMME_TYPE                0
Programme type                0
PRICE_BASE                    0
Price base                    0
TIME_PERIOD                   0
Time period                 366
OBS_VALUE                     0
Observation value           366
OBS_STATUS                    0
Observation status            0
UNIT_MULT                     0
Unit multiplier               0
DECIMALS                      0
Decimals                      0
CURRENCY                      0
Currency

### 1.2.2) Social expenditure eu, usa and australia

Link: https://data-explorer.oecd.org/vis?tm=Social%20expenditure&pg=0&snb=139&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_SOCX_AGG%40DF_SOCX_AGG&df[ag]=OECD.ELS.SPD&df[vs]=1.0&dq=AUS%2BAUT%2BBEL%2BCZE%2BDNK%2BEST%2BFIN%2BDEU%2BFRA%2BGRC%2BHUN%2BISL%2BIRL%2BISR%2BITA%2BLVA%2BLTU%2BLUX%2BNLD%2BNZL%2BNOR%2BPOL%2BPRT%2BSVK%2BSVN%2BESP%2BSWE%2BCHE%2BTUR%2BGBR%2BUSA.A..PT_OTE_S13%2BPT_B1GQ.ES10._T._T.&pd=1989%2C2022&to[TIME_PERIOD]=false&ly[cl]=TIME_PERIOD&ly[rw]=REF_AREA%2CCOMBINED_UNIT_MEASURE

In [16]:
social_policy_eu_usa_australia_df = pd.read_csv("social_expenditure_eu_usa_australia.csv")

In [17]:
social_policy_years = social_policy_eu_usa_australia_df["TIME_PERIOD"].unique()
social_policy_years.sort()
social_policy_years

array([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       2022], dtype=int64)

In [209]:
social_policy_eu_usa_australia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1821 entries, 0 to 1820
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 1821 non-null   object 
 1   STRUCTURE_ID              1821 non-null   object 
 2   STRUCTURE_NAME            1821 non-null   object 
 3   ACTION                    1821 non-null   object 
 4   REF_AREA                  1821 non-null   object 
 5   Reference area            1821 non-null   object 
 6   FREQ                      1821 non-null   object 
 7   Frequency of observation  1821 non-null   object 
 8   MEASURE                   1821 non-null   object 
 9   Measure                   1821 non-null   object 
 10  UNIT_MEASURE              1821 non-null   object 
 11  Unit of measure           1821 non-null   object 
 12  EXPEND_SOURCE             1821 non-null   object 
 13  Expenditure source        1821 non-null   object 
 14  SPENDING

In [213]:
social_exp_eu_usa_aus = social_policy_eu_usa_australia_df.drop(columns=['PROGRAMME_TYPE', 'SPENDING_TYPE', 'EXPEND_SOURCE', 'MEASURE', 'Price base', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'CURRENCY', 'Currency', 'Decimals',  'Time period', 'Observation value', 'PRICE_BASE', 'REF_AREA', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

['Percentage of general government expenditure' 'Percentage of GDP']


## 1.3) Happiness

## 1.3.1) Wellbeing (hand picked features) 2004-2024

In [20]:
current_wellbeing_df = pd.read_csv("current_wellbeing.csv")

In [21]:
current_wellbeing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5466 entries, 0 to 5465
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   STRUCTURE           5466 non-null   object 
 1   STRUCTURE_ID        5466 non-null   object 
 2   STRUCTURE_NAME      5466 non-null   object 
 3   ACTION              5466 non-null   object 
 4   REF_AREA            5466 non-null   object 
 5   Reference area      5466 non-null   object 
 6   MEASURE             5466 non-null   object 
 7   Measure             5466 non-null   object 
 8   UNIT_MEASURE        5466 non-null   object 
 9   Unit of measure     5466 non-null   object 
 10  AGE                 5466 non-null   object 
 11  Age                 5466 non-null   object 
 12  SEX                 5466 non-null   object 
 13  Sex                 5466 non-null   object 
 14  EDUCATION_LEV       5466 non-null   object 
 15  Education level     5466 non-null   object 
 16  DOMAIN

## 1.3.2) World Happiness Report 2012

In [23]:
world_happiness_2023 = pd.read_excel("data_happiness_2023.xls")

In [24]:
world_happiness_2023.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [25]:
world_happiness_2023.describe()

Unnamed: 0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479226,9.389766,0.810679,63.294583,0.747858,9.6e-05,0.745195,0.652143,0.271501
std,4.718736,1.125529,1.153387,0.120952,6.901104,0.14015,0.161083,0.185837,0.105922,0.086875
min,2005.0,1.281271,5.526723,0.228217,6.72,0.257534,-0.337527,0.035198,0.178886,0.082737
25%,2010.0,4.64675,8.499764,0.746609,59.119999,0.656528,-0.112116,0.688139,0.571684,0.20766
50%,2014.0,5.432437,9.498955,0.835535,65.050003,0.769821,-0.022671,0.799654,0.663063,0.260671
75%,2018.0,6.30946,10.373216,0.904792,68.5,0.859382,0.09207,0.868827,0.737936,0.322894
max,2022.0,8.018934,11.663788,0.987343,74.474998,0.985178,0.702708,0.983276,0.883586,0.70459


In [26]:
world_happiness_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


In [27]:
all_years_df = world_happiness_2023["year"].unique()
all_years_df.sort()
print(all_years_df)

[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022]


## 1.4) Taxation

### 1.4.1) European Commission: Taxation as percentage of GDP

In [30]:
total_taxes_df = pd.read_excel("tax-main-aggregates.xlsx", header=2)

In [31]:
total_taxes_df.head()

Unnamed: 0.1,Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


# 2) Data exploration and feature engineering 

## 2.1) GDP data

### 2.1.1) (OECD) GDP 1970-2023 deflated

#### 2.1.1.1) Removing columns to get an overview of the data

In [36]:
gdp_1970_defl_simple = gdp_1970_deflated_df.drop(columns=['STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'COUNTERPART_SECTOR', 'Counterpart institutional sector', 'CURRENCY', 'Currency', 'Decimals', 'Confidentiality status', 'CONF_STATUS', 'INSTR_ASSET', 'Financial instruments and non-financial assets', 'TABLE_IDENTIFIER', 'Table identifier', 'Time period', 'REF_YEAR_PRICE', 'Observation value', 'Price reference year', 'Economic activity', 'ACTIVITY', 'PRICE_BASE', 'TRANSFORMATION', 'Transformation', 'TRANSACTION', 'SECTOR', 'REF_AREA', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'EXPENDITURE', 'Expenditure', 'UNIT_MEASURE'])

In [37]:
gdp_1970_defl_simple.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
1,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2017,559252.338,A,Normal value,Millions
2,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,551733.319,A,Normal value,Millions
3,Colombia,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2018,662368.976,A,Normal value,Millions
4,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,270463.206,E,Estimated value,Millions


In [38]:
gdp_1970_defl_simple.shape

(2624, 10)

In [39]:
gdp_1970_defl_simple.iloc[2435:2445]

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2435,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1975,1207254.975,E,Estimated value,Millions
2436,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,1233027.254,E,Estimated value,Millions
2437,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,1168744.649,E,Estimated value,Millions
2438,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,1091001.392,E,Estimated value,Millions
2439,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,1052171.165,E,Estimated value,Millions
2440,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1033383.143,E,Estimated value,Millions
2441,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2421454.72,A,Normal value,Millions
2442,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2022,2399363.963,A,Normal value,Millions
2443,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2021,2307393.619,A,Normal value,Millions
2444,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,2130355.931,A,Normal value,Millions


In [40]:
gdp_1970_defl_simple.isna().sum()

Reference area          0
Institutional sector    0
Transaction             0
Unit of measure         0
Price base              0
TIME_PERIOD             0
OBS_VALUE               0
OBS_STATUS              0
Observation status      0
Unit multiplier         0
dtype: int64

In [41]:
gdp_1970_defl_simple.sort_values(by="TIME_PERIOD", inplace=False)

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
121,Portugal,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.024354e+05,E,Estimated value,Millions
786,France,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,9.921342e+05,A,Normal value,Millions
2440,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.033383e+06,E,Estimated value,Millions
1814,United States,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,5.173693e+06,A,Normal value,Millions
1049,Iceland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,3.460969e+03,E,Estimated value,Millions
...,...,...,...,...,...,...,...,...,...,...
2183,Greece,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.193293e+05,P,Provisional value,Millions
765,Norway,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.536325e+05,A,Normal value,Millions
2357,Korea,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2.327219e+06,E,Estimated value,Millions
2279,Slovak Republic,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,1.912215e+05,A,Normal value,Millions


In [42]:
ireland_df = gdp_1970_defl_simple[gdp_1970_defl_simple["Reference area"] == "Ireland"]

In [43]:
ire_df_copy = ireland_df.copy()

In [44]:
ire_df_copy.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
11,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,453894.856,A,Normal value,Millions
2298,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,340720.243,A,Normal value,Millions
2299,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2015,336604.783,A,Normal value,Millions
2300,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2014,270114.548,A,Normal value,Millions


#### 2.1.1.2) Moving all OBS_VALUES (gdp) up a year (I need to move it down a year actually)

In [46]:
ire_df_copy.sort_values(by="TIME_PERIOD", inplace=True, ascending=True)

In [47]:
ire_df_copy['OBS_VALUE'] = ire_df_copy['OBS_VALUE'].shift(+1)

In [48]:
ire_df_copy.head()

Unnamed: 0,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2619,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,,E,Estimated value,Millions
2618,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,40517.027,E,Estimated value,Millions
2617,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,41922.922,E,Estimated value,Millions
2616,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,44643.64,E,Estimated value,Millions
2615,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,46751.457,E,Estimated value,Millions


## 2.2) Social Policy Expenditure 1993-2023

### 2.2.1) Total social policy expenditure as percentage of GDP

In [51]:
expend_on_social_policy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 366 non-null    object 
 1   STRUCTURE_ID              366 non-null    object 
 2   STRUCTURE_NAME            366 non-null    object 
 3   ACTION                    366 non-null    object 
 4   REF_AREA                  366 non-null    object 
 5   Reference area            366 non-null    object 
 6   FREQ                      366 non-null    object 
 7   Frequency of observation  366 non-null    object 
 8   MEASURE                   366 non-null    object 
 9   Measure                   366 non-null    object 
 10  UNIT_MEASURE              366 non-null    object 
 11  Unit of measure           366 non-null    object 
 12  EXPEND_SOURCE             366 non-null    object 
 13  Expenditure source        366 non-null    object 
 14  SPENDING_T

#### 2.2.1.1) Removing columns to get an overview of the data

In [53]:
exp_social_df = expend_on_social_policy_df.drop(columns=['PROGRAMME_TYPE', 'SPENDING_TYPE', 'EXPEND_SOURCE', 'MEASURE', 'Price base', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'CURRENCY', 'Currency', 'Decimals',  'Time period', 'Observation value', 'PRICE_BASE', 'REF_AREA', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [54]:
exp_social_df.head()


Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2017,11.803,A,Normal value,Units
1,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2019,12.463,A,Normal value,Units
2,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,9.333,A,Normal value,Units
3,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,10.164,A,Normal value,Units
4,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,12.074,A,Normal value,Units


In [55]:
all_years_social_exp = exp_social_df["TIME_PERIOD"].unique()
all_years_social_exp.sort()
print(all_years_social_exp)

[1993 1995 1997 1999 2001 2003 2005 2007 2009 2011 2013 2015 2017 2019]


In [56]:
df_obs_status = exp_social_df[exp_social_df["TIME_PERIOD"] == 2017]
df_obs_status.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, 0 to 358
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      36 non-null     object 
 1   Measure             36 non-null     object 
 2   Unit of measure     36 non-null     object 
 3   Expenditure source  36 non-null     object 
 4   Spending type       36 non-null     object 
 5   Programme type      36 non-null     object 
 6   TIME_PERIOD         36 non-null     int64  
 7   OBS_VALUE           36 non-null     float64
 8   OBS_STATUS          36 non-null     object 
 9   Observation status  36 non-null     object 
 10  Unit multiplier     36 non-null     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.4+ KB


In [57]:
exp_social_df_DK = exp_social_df[exp_social_df["Reference area"] == "Denmark"]

In [58]:
exp_social_df_DK.head()

Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
224,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units
228,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2013,26.787,A,Normal value,Units


In [59]:
exp_social_df_DK.sort_values(by = "TIME_PERIOD", ascending=True, inplace=False)

Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
232,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1993,23.61,A,Normal value,Units
233,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1995,22.994,A,Normal value,Units
234,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1997,21.604,A,Normal value,Units
235,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1999,21.638,A,Normal value,Units
236,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2001,20.879,A,Normal value,Units
237,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2003,22.152,A,Normal value,Units
224,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units


## 2.3) Happiness

### 2.3.1) Wellbeing (hand picked features) 2004-2024

In [62]:
current_wellbeing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5466 entries, 0 to 5465
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   STRUCTURE           5466 non-null   object 
 1   STRUCTURE_ID        5466 non-null   object 
 2   STRUCTURE_NAME      5466 non-null   object 
 3   ACTION              5466 non-null   object 
 4   REF_AREA            5466 non-null   object 
 5   Reference area      5466 non-null   object 
 6   MEASURE             5466 non-null   object 
 7   Measure             5466 non-null   object 
 8   UNIT_MEASURE        5466 non-null   object 
 9   Unit of measure     5466 non-null   object 
 10  AGE                 5466 non-null   object 
 11  Age                 5466 non-null   object 
 12  SEX                 5466 non-null   object 
 13  Sex                 5466 non-null   object 
 14  EDUCATION_LEV       5466 non-null   object 
 15  Education level     5466 non-null   object 
 16  DOMAIN

#### 2.3.1.1) Removing columns to get an overview of the data

In [64]:
wellbeing_df = current_wellbeing_df.drop(columns=['EDUCATION_LEV', 'MEASURE', 'DOMAIN', 'Domain', 'Education level', 'Sex', 'SEX', 'AGE', 'Age', 'OBS_STATUS', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'Decimals',  'Time period', 'Observation value', 'REF_AREA', 'STRUCTURE_NAME', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [65]:
wellbeing_df.head()

Unnamed: 0,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
0,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2017,44.2,Normal value,Units
1,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2018,46.9,Normal value,Units
2,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2019,47.1,Normal value,Units
3,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2020,49.7,Normal value,Units
4,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2021,49.8,Normal value,Units


In [66]:
unit_of_measure_values = wellbeing_df["Measure"].unique()

print(unit_of_measure_values)

['Perceived health as positive' 'Satisfaction with personal relationships'
 'Life satisfaction' 'Satisfaction with time use' 'Lack of social support'
 'Not feeling safe at night' 'Feeling safe at night'
 'Perceived health as negative' 'Social support'
 'Self-reported depression' 'Homicides'
 'Deaths from suicide, alcohol, drugs']


##### PERCEIVED HEALTH

In [68]:
percivd_health_pos_df = wellbeing_df[wellbeing_df["Measure"] == "Perceived health as positive"]

In [69]:
percivd_health_pos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590 entries, 0 to 5374
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      590 non-null    object 
 1   Measure             590 non-null    object 
 2   Unit of measure     590 non-null    object 
 3   TIME_PERIOD         590 non-null    int64  
 4   OBS_VALUE           590 non-null    float64
 5   Observation status  590 non-null    object 
 6   Unit multiplier     590 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 36.9+ KB


##### SATISFACTION WITH PERSONAL RELATIONSHIPS (TO FEW)

In [71]:
stsfaction_rltnship_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with personal relationships"]

In [72]:
stsfaction_rltnship_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, 31 to 5383
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      93 non-null     object 
 1   Measure             93 non-null     object 
 2   Unit of measure     93 non-null     object 
 3   TIME_PERIOD         93 non-null     int64  
 4   OBS_VALUE           93 non-null     float64
 5   Observation status  93 non-null     object 
 6   Unit multiplier     93 non-null     object 
dtypes: float64(1), int64(1), object(5)
memory usage: 5.8+ KB


##### LIFE SATISFACTION (TO FEW - 189)

In [74]:
life_stsfaction_df = wellbeing_df[wellbeing_df["Measure"] == "Life satisfaction"]

In [75]:
life_stsfaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 33 to 5380
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      189 non-null    object 
 1   Measure             189 non-null    object 
 2   Unit of measure     189 non-null    object 
 3   TIME_PERIOD         189 non-null    int64  
 4   OBS_VALUE           189 non-null    float64
 5   Observation status  189 non-null    object 
 6   Unit multiplier     189 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 11.8+ KB


##### SATISFACTION WITH TOME USE (TO FEW)

In [77]:
stsfaction_w_time_use_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with time use"]

In [78]:
stsfaction_w_time_use_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95 entries, 35 to 5385
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      95 non-null     object 
 1   Measure             95 non-null     object 
 2   Unit of measure     95 non-null     object 
 3   TIME_PERIOD         95 non-null     int64  
 4   OBS_VALUE           95 non-null     float64
 5   Observation status  95 non-null     object 
 6   Unit multiplier     95 non-null     object 
dtypes: float64(1), int64(1), object(5)
memory usage: 5.9+ KB


##### LACK OF SOCIAL SUPPORT (TO FEW AND NEG)

In [80]:
lck_of_social_supprt_df = wellbeing_df[wellbeing_df["Measure"] == "Lack of Social support"]

In [81]:
lck_of_social_supprt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      0 non-null      object 
 1   Measure             0 non-null      object 
 2   Unit of measure     0 non-null      object 
 3   TIME_PERIOD         0 non-null      int64  
 4   OBS_VALUE           0 non-null      float64
 5   Observation status  0 non-null      object 
 6   Unit multiplier     0 non-null      object 
dtypes: float64(1), int64(1), object(5)
memory usage: 0.0+ bytes


##### HOMICIDES

In [83]:
homicides_df = wellbeing_df[wellbeing_df["Measure"] == "Homicides"]

In [84]:
homicides_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 3838 to 5461
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      642 non-null    object 
 1   Measure             642 non-null    object 
 2   Unit of measure     642 non-null    object 
 3   TIME_PERIOD         642 non-null    int64  
 4   OBS_VALUE           642 non-null    float64
 5   Observation status  642 non-null    object 
 6   Unit multiplier     642 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 40.1+ KB


##### NOT FEEL SAFE (NEG)

In [86]:
not_feel_safe_df = wellbeing_df[wellbeing_df["Measure"] == "Not feeling safe at night"]

In [87]:
not_feel_safe_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 53 to 5369
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      623 non-null    object 
 1   Measure             623 non-null    object 
 2   Unit of measure     623 non-null    object 
 3   TIME_PERIOD         623 non-null    int64  
 4   OBS_VALUE           623 non-null    float64
 5   Observation status  623 non-null    object 
 6   Unit multiplier     623 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 38.9+ KB


##### FEEL SAFE

In [89]:
feel_safe_df = wellbeing_df[wellbeing_df["Measure"] == "Feeling safe at night"]

In [90]:
feel_safe_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 68 to 5324
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      675 non-null    object 
 1   Measure             675 non-null    object 
 2   Unit of measure     675 non-null    object 
 3   TIME_PERIOD         675 non-null    int64  
 4   OBS_VALUE           675 non-null    float64
 5   Observation status  675 non-null    object 
 6   Unit multiplier     675 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 42.2+ KB


In [91]:
feel_safe_df.head()

Unnamed: 0,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
68,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2016,65.331679,Normal value,Units
69,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2017,69.671352,Normal value,Units
70,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2018,69.671352,Normal value,Units
71,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2019,69.671352,Normal value,Units
72,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2020,78.012499,Normal value,Units


##### HEALTH NEGATIVE (NEG)

In [93]:
perceived_health_neg = wellbeing_df[wellbeing_df["Measure"] == "Perceived health as negative"]

In [94]:
perceived_health_neg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 586 entries, 114 to 5378
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      586 non-null    object 
 1   Measure             586 non-null    object 
 2   Unit of measure     586 non-null    object 
 3   TIME_PERIOD         586 non-null    int64  
 4   OBS_VALUE           586 non-null    float64
 5   Observation status  586 non-null    object 
 6   Unit multiplier     586 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 36.6+ KB


##### SOCIAL SUPPORT

In [96]:
social_support_df = wellbeing_df[wellbeing_df["Measure"] == "Social support"]

In [97]:
social_support_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 673 entries, 186 to 5381
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      673 non-null    object 
 1   Measure             673 non-null    object 
 2   Unit of measure     673 non-null    object 
 3   TIME_PERIOD         673 non-null    int64  
 4   OBS_VALUE           673 non-null    float64
 5   Observation status  673 non-null    object 
 6   Unit multiplier     673 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 42.1+ KB


##### DEPRESSION (TO FEW)

In [99]:
self_rep_depression_df = wellbeing_df[wellbeing_df["Measure"] == "Self-reported depression"]

In [100]:
self_rep_depression_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 251 to 5382
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      49 non-null     object 
 1   Measure             49 non-null     object 
 2   Unit of measure     49 non-null     object 
 3   TIME_PERIOD         49 non-null     int64  
 4   OBS_VALUE           49 non-null     float64
 5   Observation status  49 non-null     object 
 6   Unit multiplier     49 non-null     object 
dtypes: float64(1), int64(1), object(5)
memory usage: 3.1+ KB


##### DEATHS OF SUICIDE, ALCOHOL and DRUGS

In [102]:
deaths_suicide_alc_drug_df = wellbeing_df[wellbeing_df["Measure"] == "Deaths from suicide, alcohol, drugs"]

In [103]:
deaths_suicide_alc_drug_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 630 entries, 3866 to 5465
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      630 non-null    object 
 1   Measure             630 non-null    object 
 2   Unit of measure     630 non-null    object 
 3   TIME_PERIOD         630 non-null    int64  
 4   OBS_VALUE           630 non-null    float64
 5   Observation status  630 non-null    object 
 6   Unit multiplier     630 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 39.4+ KB


In [104]:
all_unique = deaths_suicide_alc_drug_df["Reference area"].unique()

In [105]:
all_unique

array(['Ireland', 'Estonia', 'Spain', 'Slovenia', 'Australia', 'Hungary',
       'Czechia', 'Sweden', 'United States', 'Costa Rica', 'Norway',
       'France', 'South Africa', 'Brazil', 'Netherlands', 'Canada',
       'Chile', 'Poland', 'Iceland', 'New Zealand', 'Greece', 'Denmark',
       'Switzerland', 'Latvia', 'Luxembourg', 'Austria', 'Japan',
       'Türkiye', 'Israel', 'Mexico', 'Slovak Republic', 'Korea',
       'Lithuania', 'Belgium', 'Portugal', 'Germany', 'Colombia', 'Italy',
       'Finland', 'United Kingdom'], dtype=object)

In [106]:
print(len(all_unique))

40


## 2.4) Taxation data

### 2.4.1) European Comission total taxation of GDP

#### 2.4.1.1) Renaming column with countries to Reference area

In [110]:
total_taxes_df.rename(columns = {"Unnamed: 0": "Reference area"}, inplace=True)

In [111]:
total_taxes_df.head()

Unnamed: 0,Reference area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


#### 2.4.1.2) Reshaping (melting) the table, so that year is one column and country and year together identify a row  

In [113]:
total_tax_melted_df = pd.melt(total_taxes_df, id_vars=["Reference area"], var_name = "Year", value_name = "Taxation")
total_tax_melted_df.head()

Unnamed: 0,Reference area,Year,Taxation
0,EU-27,2010,37.922746
1,EA-19,2010,38.101209
2,Belgium,2010,43.580535
3,Bulgaria,2010,25.398276
4,Czechia,2010,32.854823


In [114]:
total_tax_melted_df.isna().sum()

Reference area     64
Year                0
Taxation          100
dtype: int64

In [193]:
df_2023 = total_tax_melted_df[total_tax_melted_df["Year"]== "2021"]
df_2023.head()

Unnamed: 0,Reference area,Year,Taxation
407,EU-27,2021,40.395208
408,EA-19,2021,40.786176
409,Belgium,2021,43.166396
410,Bulgaria,2021,30.785349
411,Czechia,2021,35.890107


# 3) Collecting data frames

## 3.1) All data frames listed

In [251]:

list_of_dfs.clear()

list_of_dfs.append(social_exp_eu_usa_aus)
list_of_dfs.append(gdp_1970_defl_simple)
list_of_dfs.append(feel_safe_df)
list_of_dfs.append(percivd_health_pos_df)

world_happiness_2023.rename(columns={"year":"TIME_PERIOD"}, inplace=True)

list_of_dfs.append(world_happiness_2023)


In [253]:
for df in list_of_dfs:
    df.rename(columns={"TIME_PERIOD": "Year"}, inplace=True)

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"TIME_PERIOD": "Year"}, inplace=True)


In [255]:
world_happiness_2023.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [272]:
world_happiness_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   Year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


### 3.2) Deleting all rows before 2005

In [347]:
social_exp_2005 = social_exp_eu_usa_aus[(social_exp_eu_usa_aus["Year"]>= 2005) & (social_exp_eu_usa_aus["Unit of measure"] == "Precentage of general government expenditure")]
gdp_defl_2005 = gdp_1970_defl_simple[gdp_1970_defl_simple["Year"]>= 2005]
feel_safe_2005 = feel_safe_df[feel_safe_df["Year"]>= 2005]
percivd_health_2005 = percivd_health_pos_df[percivd_health_pos_df["Year"]>= 2005]
world_happiness_2005 = world_happiness_2023[world_happiness_2023["Year"]>= 2005]

In [349]:
all_unit_of_measure = social_exp_2005["Unit of measure"].unique()
print(all_unit_of_measure)

[]


In [335]:
social_exp_2005.info()

Unnamed: 0,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,Year,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2014,42.619,A,Normal value,Units
1,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2015,40.403,A,Normal value,Units
2,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2016,43.11,A,Normal value,Units
3,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2017,41.605,A,Normal value,Units
4,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2018,40.342,A,Normal value,Units


In [282]:
gdp_defl_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1171 entries, 0 to 2623
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Reference area        1171 non-null   object 
 1   Institutional sector  1171 non-null   object 
 2   Transaction           1171 non-null   object 
 3   Unit of measure       1171 non-null   object 
 4   Price base            1171 non-null   object 
 5   Year                  1171 non-null   int64  
 6   OBS_VALUE             1171 non-null   float64
 7   OBS_STATUS            1171 non-null   object 
 8   Observation status    1171 non-null   object 
 9   Unit multiplier       1171 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 100.6+ KB


In [284]:
feel_safe_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 68 to 5324
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      675 non-null    object 
 1   Measure             675 non-null    object 
 2   Unit of measure     675 non-null    object 
 3   Year                675 non-null    int64  
 4   OBS_VALUE           675 non-null    float64
 5   Observation status  675 non-null    object 
 6   Unit multiplier     675 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 42.2+ KB


In [286]:
percivd_health_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 570 entries, 0 to 5374
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Reference area      570 non-null    object 
 1   Measure             570 non-null    object 
 2   Unit of measure     570 non-null    object 
 3   Year                570 non-null    int64  
 4   OBS_VALUE           570 non-null    float64
 5   Observation status  570 non-null    object 
 6   Unit multiplier     570 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 35.6+ KB


In [288]:
world_happiness_2005.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   Year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


In [290]:
world_happiness_2005.describe()

Unnamed: 0,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479226,9.389766,0.810679,63.294583,0.747858,9.6e-05,0.745195,0.652143,0.271501
std,4.718736,1.125529,1.153387,0.120952,6.901104,0.14015,0.161083,0.185837,0.105922,0.086875
min,2005.0,1.281271,5.526723,0.228217,6.72,0.257534,-0.337527,0.035198,0.178886,0.082737
25%,2010.0,4.64675,8.499764,0.746609,59.119999,0.656528,-0.112116,0.688139,0.571684,0.20766
50%,2014.0,5.432437,9.498955,0.835535,65.050003,0.769821,-0.022671,0.799654,0.663063,0.260671
75%,2018.0,6.30946,10.373216,0.904792,68.5,0.859382,0.09207,0.868827,0.737936,0.322894
max,2022.0,8.018934,11.663788,0.987343,74.474998,0.985178,0.702708,0.983276,0.883586,0.70459


In [292]:
world_happiness_2005.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


### 3.3) Checking for Observation status and deleting all non-normal status

#### 3.3.1) Social expenditure

In [310]:
obs_stat_social = social_exp_2005["Observation status"].unique()
print(obs_stat_social)

['Normal value']


#### 3.3.2) GDP

In [333]:
obs2 = gdp_defl_2005["Observation status"].unique()
print(obs2)

obs2_counts = gdp_defl_2005["Observation status"].value_counts()
print("Counts of each unique value:\n", obs2_counts)

['Normal value' 'Provisional value' 'Estimated value' 'Definition differs'
 'Time series break']
Counts of each unique value:
 Observation status
Normal value          1087
Estimated value         44
Provisional value       37
Definition differs       2
Time series break        1
Name: count, dtype: int64


#### 3.3.3) Feel safe

In [314]:
obs3 = feel_safe_2005["Observation status"].unique()
print(obs3)

['Normal value']


#### 3.3.4) Perceived health

In [316]:
obs4 = percivd_health_2005["Observation status"].unique()
print(obs4)

['Normal value' 'Definition differs' 'Time series break']


### 3.4) Removing null and na values 

#### 3.4.1) Social expenditure

#### 3.4.2) GDP

#### 3.4.3) Feel safe

#### 3.4.4) Perceived health

#### 3.5.4) World happiness

In [318]:
world_happiness_2005.isna().sum()

Country name                          0
Year                                  0
Life Ladder                           0
Log GDP per capita                   20
Social support                       13
Healthy life expectancy at birth     54
Freedom to make life choices         33
Generosity                           73
Perceptions of corruption           116
Positive affect                      24
Negative affect                      16
dtype: int64

In [None]:
df.drop('col2', axis=1, inplace=True)