In [1]:
import pandas as pd
import numpy as np
import pycountry

# LINKS

#### World Happines Report:
- https://worldhappiness.report/data/

#### Eurostat:

- UNEMPLOYMENT: https://ec.europa.eu/eurostat/databrowser/view/tps00203/default/table?lang=en&category=t_labour.t_employ.t_lfsi.t_une

#### OECD data explorer:
- https://data-explorer.oecd.org/

- SOCIAL EXPENDITURE (EU, USA, AUSTRALIA): https://data-explorer.oecd.org/vis?tm=Social%20expenditure&pg=0&snb=139&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_SOCX_AGG%40DF_SOCX_AGG&df[ag]=OECD.ELS.SPD&df[vs]=1.0&dq=AUS%2BAUT%2BBEL%2BCZE%2BDNK%2BEST%2BFIN%2BDEU%2BFRA%2BGRC%2BHUN%2BISL%2BIRL%2BISR%2BITA%2BLVA%2BLTU%2BLUX%2BNLD%2BNZL%2BNOR%2BPOL%2BPRT%2BSVK%2BSVN%2BESP%2BSWE%2BCHE%2BTUR%2BGBR%2BUSA.A..PT_OTE_S13%2BPT_B1GQ.ES10._T._T.&pd=1989%2C2022&to[TIME_PERIOD]=false&ly[cl]=TIME_PERIOD&ly[rw]=REF_AREA%2CCOMBINED_UNIT_MEASURE

- UNEMPLOYMENT: https://data-explorer.oecd.org/vis?fs[0]=Topic%2C1%7CEmployment%23JOB%23%7CUnemployment%20indicators%23JOB_UNEMP%23&pg=0&fc=Topic&bp=true&snb=15&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_LFS%40DF_IALFS_UNE_Q&df[ag]=OECD.SDD.TPS&df[vs]=1.0&dq=ZAF%2BRUS%2BIDN%2BHRV%2BBGR%2BBRA%2BGBR%2BUSA%2BTUR%2BCHE%2BSWE%2BESP%2BSVN%2BSVK%2BPRT%2BPOL%2BNOR%2BNZL%2BNLD%2BMEX%2BLUX%2BLTU%2BLVA%2BKOR%2BJPN%2BITA%2BISR%2BIRL%2BISL%2BHUN%2BGRC%2BDEU%2BFRA%2BFIN%2BEST%2BDNK%2BCRI%2BCOL%2BCZE%2BCHL%2BBEL%2BCAN%2BAUT%2BAUS.UNE.._Z.Y._T.Y_GE15..A&pd=2004%2C2024&to[TIME_PERIOD]=false&ly[cl]=TIME_PERIOD&ly[rw]=REF_AREA

#### European Comission: 

- https://taxation-customs.ec.europa.eu/taxation/economic-analyses/taxation-trends-eu/data-taxation-trends_en
- https://webgate.ec.europa.eu/taxation_customs/redisstat/databrowser/explore/all/DATA_ON_TAX?lang=en&display=card&sort=category

# 1) Data ingestion

## 1.1) GDP data

### 1.1.1) (OECD) GDP 1970-2023 deflated

In [7]:
gdp_1970_deflated_df = pd.read_csv("oecd_gdp_1970_2023_deflated.csv")

In [8]:
gdp_1970_deflated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624 entries, 0 to 2623
Data columns (total 44 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   STRUCTURE                                       2624 non-null   object 
 1   STRUCTURE_ID                                    2624 non-null   object 
 2   STRUCTURE_NAME                                  2624 non-null   object 
 3   ACTION                                          2624 non-null   object 
 4   FREQ                                            2624 non-null   object 
 5   Frequency of observation                        2624 non-null   object 
 6   REF_AREA                                        2624 non-null   object 
 7   Reference area                                  2624 non-null   object 
 8   SECTOR                                          2624 non-null   object 
 9   Institutional sector                     

In [9]:
gdp_years = gdp_1970_deflated_df["TIME_PERIOD"].unique()
gdp_years.sort()
gdp_years

array([1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023],
      dtype=int64)

## 1.2) Social Policy Expenditure Data

### 1.2.1) Total social policy expenditure as percentage of GDP

In [12]:
expend_on_social_policy_df = pd.read_csv("expenditure_on_social_policy.csv")

In [13]:
expend_on_social_policy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 366 non-null    object 
 1   STRUCTURE_ID              366 non-null    object 
 2   STRUCTURE_NAME            366 non-null    object 
 3   ACTION                    366 non-null    object 
 4   REF_AREA                  366 non-null    object 
 5   Reference area            366 non-null    object 
 6   FREQ                      366 non-null    object 
 7   Frequency of observation  366 non-null    object 
 8   MEASURE                   366 non-null    object 
 9   Measure                   366 non-null    object 
 10  UNIT_MEASURE              366 non-null    object 
 11  Unit of measure           366 non-null    object 
 12  EXPEND_SOURCE             366 non-null    object 
 13  Expenditure source        366 non-null    object 
 14  SPENDING_T

In [14]:
expend_on_social_policy_df.isnull().sum()

STRUCTURE                     0
STRUCTURE_ID                  0
STRUCTURE_NAME                0
ACTION                        0
REF_AREA                      0
Reference area                0
FREQ                          0
Frequency of observation      0
MEASURE                       0
Measure                       0
UNIT_MEASURE                  0
Unit of measure               0
EXPEND_SOURCE                 0
Expenditure source            0
SPENDING_TYPE                 0
Spending type                 0
PROGRAMME_TYPE                0
Programme type                0
PRICE_BASE                    0
Price base                    0
TIME_PERIOD                   0
Time period                 366
OBS_VALUE                     0
Observation value           366
OBS_STATUS                    0
Observation status            0
UNIT_MULT                     0
Unit multiplier               0
DECIMALS                      0
Decimals                      0
CURRENCY                      0
Currency

### 1.2.2) Social expenditure eu, usa and australia

Link: https://data-explorer.oecd.org/vis?tm=Social%20expenditure&pg=0&snb=139&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_SOCX_AGG%40DF_SOCX_AGG&df[ag]=OECD.ELS.SPD&df[vs]=1.0&dq=AUS%2BAUT%2BBEL%2BCZE%2BDNK%2BEST%2BFIN%2BDEU%2BFRA%2BGRC%2BHUN%2BISL%2BIRL%2BISR%2BITA%2BLVA%2BLTU%2BLUX%2BNLD%2BNZL%2BNOR%2BPOL%2BPRT%2BSVK%2BSVN%2BESP%2BSWE%2BCHE%2BTUR%2BGBR%2BUSA.A..PT_OTE_S13%2BPT_B1GQ.ES10._T._T.&pd=1989%2C2022&to[TIME_PERIOD]=false&ly[cl]=TIME_PERIOD&ly[rw]=REF_AREA%2CCOMBINED_UNIT_MEASURE

In [16]:
social_policy_eu_usa_australia_df = pd.read_csv("social_expenditure_eu_usa_australia.csv")

In [17]:
social_policy_years = social_policy_eu_usa_australia_df["TIME_PERIOD"].unique()
social_policy_years.sort()
social_policy_years

array([1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       2022], dtype=int64)

In [18]:
social_policy_eu_usa_australia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1821 entries, 0 to 1820
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 1821 non-null   object 
 1   STRUCTURE_ID              1821 non-null   object 
 2   STRUCTURE_NAME            1821 non-null   object 
 3   ACTION                    1821 non-null   object 
 4   REF_AREA                  1821 non-null   object 
 5   Reference area            1821 non-null   object 
 6   FREQ                      1821 non-null   object 
 7   Frequency of observation  1821 non-null   object 
 8   MEASURE                   1821 non-null   object 
 9   Measure                   1821 non-null   object 
 10  UNIT_MEASURE              1821 non-null   object 
 11  Unit of measure           1821 non-null   object 
 12  EXPEND_SOURCE             1821 non-null   object 
 13  Expenditure source        1821 non-null   object 
 14  SPENDING

In [19]:
social_exp_eu_usa_aus = social_policy_eu_usa_australia_df.drop(columns=['PROGRAMME_TYPE', 'SPENDING_TYPE', 'EXPEND_SOURCE', 'MEASURE', 'Price base', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'CURRENCY', 'Currency', 'Decimals',  'Time period', 'Observation value', 'PRICE_BASE', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

## 1.3) Happiness

### 1.3.1) Wellbeing (hand picked features) 2004-2024

In [22]:
current_wellbeing_df = pd.read_csv("current_wellbeing.csv")

In [23]:
current_wellbeing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5466 entries, 0 to 5465
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   STRUCTURE           5466 non-null   object 
 1   STRUCTURE_ID        5466 non-null   object 
 2   STRUCTURE_NAME      5466 non-null   object 
 3   ACTION              5466 non-null   object 
 4   REF_AREA            5466 non-null   object 
 5   Reference area      5466 non-null   object 
 6   MEASURE             5466 non-null   object 
 7   Measure             5466 non-null   object 
 8   UNIT_MEASURE        5466 non-null   object 
 9   Unit of measure     5466 non-null   object 
 10  AGE                 5466 non-null   object 
 11  Age                 5466 non-null   object 
 12  SEX                 5466 non-null   object 
 13  Sex                 5466 non-null   object 
 14  EDUCATION_LEV       5466 non-null   object 
 15  Education level     5466 non-null   object 
 16  DOMAIN

## 1.3.2) World Happiness Report 2023

In [25]:
world_happiness_2023 = pd.read_excel("data_happiness_2023.xls")

In [26]:
world_happiness_2023.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [27]:
world_happiness_2023.describe()

Unnamed: 0,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479226,9.389766,0.810679,63.294583,0.747858,9.6e-05,0.745195,0.652143,0.271501
std,4.718736,1.125529,1.153387,0.120952,6.901104,0.14015,0.161083,0.185837,0.105922,0.086875
min,2005.0,1.281271,5.526723,0.228217,6.72,0.257534,-0.337527,0.035198,0.178886,0.082737
25%,2010.0,4.64675,8.499764,0.746609,59.119999,0.656528,-0.112116,0.688139,0.571684,0.20766
50%,2014.0,5.432437,9.498955,0.835535,65.050003,0.769821,-0.022671,0.799654,0.663063,0.260671
75%,2018.0,6.30946,10.373216,0.904792,68.5,0.859382,0.09207,0.868827,0.737936,0.322894
max,2022.0,8.018934,11.663788,0.987343,74.474998,0.985178,0.702708,0.983276,0.883586,0.70459


In [28]:
world_happiness_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


In [29]:
all_years_df = world_happiness_2023["year"].unique()
all_years_df.sort()
print(all_years_df)

[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022]


## 1.4) Taxation

### 1.4.1) European Commission: Taxation as percentage of GDP

LINK: https://taxation-customs.ec.europa.eu/taxation/economic-analyses/taxation-trends-eu/data-taxation-trends_en

In [33]:
total_taxes_df = pd.read_excel("tax-main-aggregates.xlsx", header=2)

In [34]:
total_taxes_df.head()

Unnamed: 0.1,Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


## 1.5) Unemployment

### 1.5.1) OECD: Unemployment by year

In [37]:
unempl_df = pd.read_csv("unemployment_by_year.csv")
unit_of_measure = unempl_df["Unit of measure"].value_counts()
print(unit_of_measure)

Unit of measure
Persons    825
Name: count, dtype: int64


### 1.5.2) Eurostat unemployment

In [39]:
unempl_eurostat_df = pd.read_csv("unemployment_eurostat.tsv", sep=r"[\t,\\]", engine = "python")

In [40]:
unempl_eurostat_df.replace(r'^\s*:\s*$', np.nan, regex=True, inplace=True)

PC_ACT = Percentage of population in the labour force

In [42]:
unempl_eurostat_df.iloc[1:40]

Unnamed: 0,freq,age,unit,sex,geo,TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,17.4,15.4,13.2,
2,A,Y15-74,PC_ACT,T,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6,5.5,
3,A,Y15-74,PC_ACT,T,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2,4.3,
4,A,Y15-74,PC_ACT,T,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1,4.1,
5,A,Y15-74,PC_ACT,T,CY,11.9,15.9,16.1,15.0,13.0,11.1,8.4,7.1,7.6,7.5,6.8,5.8 b,
6,A,Y15-74,PC_ACT,T,CZ,7.0,7.0,6.1,5.1,4.0,2.9,2.2,2.0,2.6,2.8,2.2,2.6,
7,A,Y15-74,PC_ACT,T,DE,5.1,5.0,4.7,4.4,3.9,3.6,3.2,3.0,3.7 b,3.7,3.2,3.1,
8,A,Y15-74,PC_ACT,T,DK,7.8,7.4,6.9,6.3,6.0 b,5.8 b,5.1,5.0,5.6,5.1,4.5,5.1 b,
9,A,Y15-74,PC_ACT,T,EA20,11.5,12.2,11.7,11.0,10.1,9.1,8.2,7.6,8.0,7.8,6.8,6.6,
10,A,Y15-74,PC_ACT,T,EE,9.9,8.6,7.3,6.4,6.8,5.8,5.4,4.5,6.9,6.2,5.6,6.4,


### Shifting all the values under the year-columns one step to the right and deleting TIME_PERIOD

In [44]:
start_col = 'TIME_PERIOD'

# Find the index of the starting column
start_index = unempl_eurostat_df.columns.get_loc(start_col)

# Create a new DataFrame initialized with NaN
df_shifted = unempl_eurostat_df.copy()

# Shift values to the right from the specified column
df_shifted.iloc[:, start_index + 1:] = df_shifted.iloc[:, start_index:-1]


df_shifted.head()

1      13.2
2       5.5
3       4.3
4       4.1
       ... 
106     296
107     440
108      38
109     162
110    3274
Name: 2022 , Length: 111, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_shifted.iloc[:, start_index + 1:] = df_shifted.iloc[:, start_index:-1]


Unnamed: 0,freq,age,unit,sex,geo,TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,Y15-74,PC_ACT,T,AT,5.2,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8,5.1
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,,17.4,15.4,13.2
2,A,Y15-74,PC_ACT,T,BE,7.6,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6,5.5
3,A,Y15-74,PC_ACT,T,BG,13.3,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2,4.3
4,A,Y15-74,PC_ACT,T,CH,4.5,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1,4.1


In [45]:
df_shifted.drop(columns="TIME_PERIOD", inplace=True)
df_shifted.head()

Unnamed: 0,freq,age,unit,sex,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,Y15-74,PC_ACT,T,AT,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8,5.1
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,17.4,15.4,13.2
2,A,Y15-74,PC_ACT,T,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6,5.5
3,A,Y15-74,PC_ACT,T,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2,4.3
4,A,Y15-74,PC_ACT,T,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1,4.1


### Deleting all other units of measure than PC_ACT

In [47]:
unit = df_shifted["unit"].value_counts()
print(unit)

unit
PC_ACT     37
PC_POP     37
THS_PER    37
Name: count, dtype: int64


In [48]:
unempl_one_unit_df = df_shifted[~df_shifted["unit"].isin(["PC_POP", "THS_PER"])]

In [49]:
unit = unempl_one_unit_df["unit"].value_counts()
print(unit)

unit
PC_ACT    37
Name: count, dtype: int64


### Deleting freq, age, sex

In [51]:
unempl_one_unit_df.drop(columns=["freq", "age", "sex", "unit"], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unempl_one_unit_df.drop(columns=["freq", "age", "sex", "unit"], inplace= True)


In [52]:
unempl_one_unit_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 0 to 36
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   geo     37 non-null     object
 1   2012    36 non-null     object
 2   2013    36 non-null     object
 3   2014    36 non-null     object
 4   2015    36 non-null     object
 5   2016    36 non-null     object
 6   2017    36 non-null     object
 7   2018    36 non-null     object
 8   2019    36 non-null     object
 9   2020    36 non-null     object
 10  2021    35 non-null     object
 11  2022    35 non-null     object
 12  2023    35 non-null     object
dtypes: object(13)
memory usage: 4.0+ KB


In [53]:
unempl_one_unit_df.isna().sum()

geo      0
2012     1
2013     1
2014     1
2015     1
2016     1
2017     1
2018     1
2019     1
2020     1
2021     2
2022     2
2023     2
dtype: int64

### Transforming data frame from wide to long so years are under a column and there values in another column

In [55]:
unempl_euro_melted_df = pd.melt(unempl_one_unit_df, id_vars=["geo"], var_name = "TIME_PERIOD", value_name = "Unemployment as percentage of labour force")
unempl_euro_melted_df.head()

Unnamed: 0,geo,TIME_PERIOD,Unemployment as percentage of labour force
0,AT,2012,5.2
1,BA,2012,
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5


In [56]:
unempl_euro_melted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         444 non-null    object
 1   TIME_PERIOD                                 444 non-null    object
 2   Unemployment as percentage of labour force  429 non-null    object
dtypes: object(3)
memory usage: 10.5+ KB


# 2) Data exploration and feature engineering 

## 2.1) GDP data

### 2.1.1) (OECD) GDP 1970-2023 deflated

#### 2.1.1.1) Removing columns to get an overview of the data

In [61]:
gdp_1970_defl_simple = gdp_1970_deflated_df.drop(columns=['STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'COUNTERPART_SECTOR', 'Counterpart institutional sector', 'CURRENCY', 'Currency', 'Decimals', 'Confidentiality status', 'CONF_STATUS', 'INSTR_ASSET', 'Financial instruments and non-financial assets', 'TABLE_IDENTIFIER', 'Table identifier', 'Time period', 'REF_YEAR_PRICE', 'Observation value', 'Price reference year', 'Economic activity', 'ACTIVITY', 'PRICE_BASE', 'TRANSFORMATION', 'Transformation', 'TRANSACTION', 'SECTOR', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'EXPENDITURE', 'Expenditure', 'UNIT_MEASURE'])

In [62]:
gdp_1970_defl_simple.head()

Unnamed: 0,REF_AREA,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
1,CHE,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2017,559252.338,A,Normal value,Millions
2,CHE,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,551733.319,A,Normal value,Millions
3,COL,Colombia,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2018,662368.976,A,Normal value,Millions
4,CHE,Switzerland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,270463.206,E,Estimated value,Millions


In [63]:
gdp_1970_defl_simple.shape

(2624, 11)

In [64]:
gdp_1970_defl_simple.iloc[2435:2445]

Unnamed: 0,REF_AREA,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2435,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1975,1207254.975,E,Estimated value,Millions
2436,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,1233027.254,E,Estimated value,Millions
2437,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,1168744.649,E,Estimated value,Millions
2438,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,1091001.392,E,Estimated value,Millions
2439,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,1052171.165,E,Estimated value,Millions
2440,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1033383.143,E,Estimated value,Millions
2441,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2421454.72,A,Normal value,Millions
2442,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2022,2399363.963,A,Normal value,Millions
2443,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2021,2307393.619,A,Normal value,Millions
2444,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,2130355.931,A,Normal value,Millions


In [65]:
gdp_1970_defl_simple.isna().sum()

REF_AREA                0
Reference area          0
Institutional sector    0
Transaction             0
Unit of measure         0
Price base              0
TIME_PERIOD             0
OBS_VALUE               0
OBS_STATUS              0
Observation status      0
Unit multiplier         0
dtype: int64

In [66]:
gdp_1970_defl_simple.sort_values(by="TIME_PERIOD", inplace=False)

Unnamed: 0,REF_AREA,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
121,PRT,Portugal,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.024354e+05,E,Estimated value,Millions
786,FRA,France,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,9.921342e+05,A,Normal value,Millions
2440,ITA,Italy,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,1.033383e+06,E,Estimated value,Millions
1814,USA,United States,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,5.173693e+06,A,Normal value,Millions
1049,ISL,Iceland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,3.460969e+03,E,Estimated value,Millions
...,...,...,...,...,...,...,...,...,...,...,...
2183,GRC,Greece,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.193293e+05,P,Provisional value,Millions
765,NOR,Norway,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,3.536325e+05,A,Normal value,Millions
2357,KOR,Korea,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,2.327219e+06,E,Estimated value,Millions
2279,SVK,Slovak Republic,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2023,1.912215e+05,A,Normal value,Millions


In [67]:
ireland_df = gdp_1970_defl_simple[gdp_1970_defl_simple["Reference area"] == "Ireland"]

In [68]:
ire_df_copy = ireland_df.copy()

In [69]:
ire_df_copy.head()

Unnamed: 0,REF_AREA,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2009,234829.565,A,Normal value,Millions
11,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2020,453894.856,A,Normal value,Millions
2298,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2016,340720.243,A,Normal value,Millions
2299,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2015,336604.783,A,Normal value,Millions
2300,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),2014,270114.548,A,Normal value,Millions


#### 2.1.1.2) Moving all OBS_VALUES (gdp) up a year (I need to move it down a year actually)

In [71]:
ire_df_copy.sort_values(by="TIME_PERIOD", inplace=True, ascending=True)

In [72]:
ire_df_copy['OBS_VALUE'] = ire_df_copy['OBS_VALUE'].shift(+1)

In [73]:
ire_df_copy.head()

Unnamed: 0,REF_AREA,Reference area,Institutional sector,Transaction,Unit of measure,Price base,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
2619,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1970,,E,Estimated value,Millions
2618,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1971,40517.027,E,Estimated value,Millions
2617,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1972,41922.922,E,Estimated value,Millions
2616,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1973,44643.64,E,Estimated value,Millions
2615,IRL,Ireland,Total economy,Gross domestic product,"US dollars, PPP converted",Chain linked volume (rebased),1974,46751.457,E,Estimated value,Millions


## 2.2) Social Policy Expenditure 1993-2023

### 2.2.1) Total social policy expenditure as percentage of GDP

In [76]:
expend_on_social_policy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   STRUCTURE                 366 non-null    object 
 1   STRUCTURE_ID              366 non-null    object 
 2   STRUCTURE_NAME            366 non-null    object 
 3   ACTION                    366 non-null    object 
 4   REF_AREA                  366 non-null    object 
 5   Reference area            366 non-null    object 
 6   FREQ                      366 non-null    object 
 7   Frequency of observation  366 non-null    object 
 8   MEASURE                   366 non-null    object 
 9   Measure                   366 non-null    object 
 10  UNIT_MEASURE              366 non-null    object 
 11  Unit of measure           366 non-null    object 
 12  EXPEND_SOURCE             366 non-null    object 
 13  Expenditure source        366 non-null    object 
 14  SPENDING_T

#### 2.2.1.1) Removing columns to get an overview of the data

In [78]:
exp_social_df = expend_on_social_policy_df.drop(columns=['PROGRAMME_TYPE', 'SPENDING_TYPE', 'EXPEND_SOURCE', 'MEASURE', 'Price base', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'CURRENCY', 'Currency', 'Decimals',  'Time period', 'Observation value', 'PRICE_BASE', 'STRUCTURE_NAME', 'Frequency of observation', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [79]:
exp_social_df.head()


Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,TUR,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2017,11.803,A,Normal value,Units
1,TUR,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2019,12.463,A,Normal value,Units
2,TUR,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,9.333,A,Normal value,Units
3,TUR,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,10.164,A,Normal value,Units
4,TUR,Türkiye,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,12.074,A,Normal value,Units


In [80]:
all_years_social_exp = exp_social_df["TIME_PERIOD"].unique()
all_years_social_exp.sort()
print(all_years_social_exp)

[1993 1995 1997 1999 2001 2003 2005 2007 2009 2011 2013 2015 2017 2019]


In [81]:
df_obs_status = exp_social_df[exp_social_df["TIME_PERIOD"] == 2017]
df_obs_status.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, 0 to 358
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            36 non-null     object 
 1   Reference area      36 non-null     object 
 2   Measure             36 non-null     object 
 3   Unit of measure     36 non-null     object 
 4   Expenditure source  36 non-null     object 
 5   Spending type       36 non-null     object 
 6   Programme type      36 non-null     object 
 7   TIME_PERIOD         36 non-null     int64  
 8   OBS_VALUE           36 non-null     float64
 9   OBS_STATUS          36 non-null     object 
 10  Observation status  36 non-null     object 
 11  Unit multiplier     36 non-null     object 
dtypes: float64(1), int64(1), object(10)
memory usage: 3.7+ KB


In [82]:
exp_social_df_DK = exp_social_df[exp_social_df["Reference area"] == "Denmark"]

In [83]:
exp_social_df_DK.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
224,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units
228,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2013,26.787,A,Normal value,Units


In [84]:
exp_social_df_DK.sort_values(by = "TIME_PERIOD", ascending=True, inplace=False)

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,TIME_PERIOD,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
232,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1993,23.61,A,Normal value,Units
233,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1995,22.994,A,Normal value,Units
234,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1997,21.604,A,Normal value,Units
235,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,1999,21.638,A,Normal value,Units
236,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2001,20.879,A,Normal value,Units
237,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2003,22.152,A,Normal value,Units
224,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2005,21.821,A,Normal value,Units
225,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2007,22.839,A,Normal value,Units
226,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2009,26.397,A,Normal value,Units
227,DNK,Denmark,Social expenditure,Percentage of GDP,Net total,Total,Total,2011,26.442,A,Normal value,Units


## 2.3) Happiness

### 2.3.1) Wellbeing (hand picked features) 2004-2024

LINK: https://data-explorer.oecd.org/vis?fs[0]=Topic%2C1%7CSociety%23SOC%23%7CWell-being%20and%20beyond%20GDP%23SOC_WEL%23&pg=0&fc=Topic&bp=true&snb=8&vw=tb&df[ds]=dsDisseminateFinalDMZ&df[id]=DSD_HSL%40DF_HSL_CWB&df[ag]=OECD.WISE.WDP&df[vs]=1.0&dq=ZAF%2BBRA%2BUSA%2BGBR%2BTUR%2BCHE%2BSWE%2BESP%2BSVN%2BSVK%2BPRT%2BPOL%2BNOR%2BNZL%2BNLD%2BMEX%2BLUX%2BLTU%2BLVA%2BKOR%2BJPN%2BISR%2BITA%2BIRL%2BISL%2BHUN%2BGRC%2BDEU%2BFIN%2BFRA%2BEST%2BDNK%2BCZE%2BCRI%2BCOL%2BCAN%2BCHL%2BBEL%2BAUT%2BAUS.4_4%2B11_1%2B10_2%2B10_2_DEP%2B10_1%2B7_3%2B7_1_DEP%2B7_1%2B5_4%2B5_3%2B5_2_DEP%2B5_2.._T._T._T.&pd=2004%2C2024&to[TIME_PERIOD]=false&ly[cl]=DOMAIN%2CCOMBINED_MEASURE%2CCOMBINED_UNIT_MEASURE%2CTIME_PERIOD&ly[rw]=REF_AREA

In [88]:
current_wellbeing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5466 entries, 0 to 5465
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   STRUCTURE           5466 non-null   object 
 1   STRUCTURE_ID        5466 non-null   object 
 2   STRUCTURE_NAME      5466 non-null   object 
 3   ACTION              5466 non-null   object 
 4   REF_AREA            5466 non-null   object 
 5   Reference area      5466 non-null   object 
 6   MEASURE             5466 non-null   object 
 7   Measure             5466 non-null   object 
 8   UNIT_MEASURE        5466 non-null   object 
 9   Unit of measure     5466 non-null   object 
 10  AGE                 5466 non-null   object 
 11  Age                 5466 non-null   object 
 12  SEX                 5466 non-null   object 
 13  Sex                 5466 non-null   object 
 14  EDUCATION_LEV       5466 non-null   object 
 15  Education level     5466 non-null   object 
 16  DOMAIN

#### 2.3.1.1) Removing columns to get an overview of the data

In [90]:
wellbeing_df = current_wellbeing_df.drop(columns=['EDUCATION_LEV', 'MEASURE', 'DOMAIN', 'Domain', 'Education level', 'Sex', 'SEX', 'AGE', 'Age', 'OBS_STATUS', 'BASE_PER', 'Base period', 'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'Decimals',  'Time period', 'Observation value', 'STRUCTURE_NAME', 'UNIT_MULT', 'DECIMALS', 'UNIT_MEASURE'])

In [91]:
wellbeing_df.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
0,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2017,44.2,Normal value,Units
1,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2018,46.9,Normal value,Units
2,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2019,47.1,Normal value,Units
3,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2020,49.7,Normal value,Units
4,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2021,49.8,Normal value,Units


In [92]:
unit_of_measure_values = wellbeing_df["Measure"].unique()

print(unit_of_measure_values)

['Perceived health as positive' 'Satisfaction with personal relationships'
 'Life satisfaction' 'Satisfaction with time use' 'Lack of social support'
 'Not feeling safe at night' 'Feeling safe at night'
 'Perceived health as negative' 'Social support'
 'Self-reported depression' 'Homicides'
 'Deaths from suicide, alcohol, drugs']


##### PERCEIVED HEALTH

In [94]:
percivd_health_pos_df = wellbeing_df[wellbeing_df["Measure"] == "Perceived health as positive"]

In [95]:
percivd_health_pos_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 590 entries, 0 to 5374
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            590 non-null    object 
 1   Reference area      590 non-null    object 
 2   Measure             590 non-null    object 
 3   Unit of measure     590 non-null    object 
 4   TIME_PERIOD         590 non-null    int64  
 5   OBS_VALUE           590 non-null    float64
 6   Observation status  590 non-null    object 
 7   Unit multiplier     590 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 41.5+ KB


##### SATISFACTION WITH PERSONAL RELATIONSHIPS (TO FEW)

In [97]:
stsfaction_rltnship_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with personal relationships"]

In [98]:
stsfaction_rltnship_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, 31 to 5383
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            93 non-null     object 
 1   Reference area      93 non-null     object 
 2   Measure             93 non-null     object 
 3   Unit of measure     93 non-null     object 
 4   TIME_PERIOD         93 non-null     int64  
 5   OBS_VALUE           93 non-null     float64
 6   Observation status  93 non-null     object 
 7   Unit multiplier     93 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 6.5+ KB


##### LIFE SATISFACTION (TO FEW - 189)

In [100]:
life_stsfaction_df = wellbeing_df[wellbeing_df["Measure"] == "Life satisfaction"]

In [101]:
life_stsfaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 33 to 5380
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            189 non-null    object 
 1   Reference area      189 non-null    object 
 2   Measure             189 non-null    object 
 3   Unit of measure     189 non-null    object 
 4   TIME_PERIOD         189 non-null    int64  
 5   OBS_VALUE           189 non-null    float64
 6   Observation status  189 non-null    object 
 7   Unit multiplier     189 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 13.3+ KB


##### SATISFACTION WITH TOME USE (TO FEW)

In [103]:
stsfaction_w_time_use_df = wellbeing_df[wellbeing_df["Measure"] == "Satisfaction with time use"]

In [104]:
stsfaction_w_time_use_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95 entries, 35 to 5385
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            95 non-null     object 
 1   Reference area      95 non-null     object 
 2   Measure             95 non-null     object 
 3   Unit of measure     95 non-null     object 
 4   TIME_PERIOD         95 non-null     int64  
 5   OBS_VALUE           95 non-null     float64
 6   Observation status  95 non-null     object 
 7   Unit multiplier     95 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 6.7+ KB


##### LACK OF SOCIAL SUPPORT (TO FEW AND NEG)

In [106]:
lck_of_social_supprt_df = wellbeing_df[wellbeing_df["Measure"] == "Lack of Social support"]

In [107]:
lck_of_social_supprt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            0 non-null      object 
 1   Reference area      0 non-null      object 
 2   Measure             0 non-null      object 
 3   Unit of measure     0 non-null      object 
 4   TIME_PERIOD         0 non-null      int64  
 5   OBS_VALUE           0 non-null      float64
 6   Observation status  0 non-null      object 
 7   Unit multiplier     0 non-null      object 
dtypes: float64(1), int64(1), object(6)
memory usage: 0.0+ bytes


##### HOMICIDES

In [109]:
homicides_df = wellbeing_df[wellbeing_df["Measure"] == "Homicides"]

In [110]:
homicides_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 642 entries, 3838 to 5461
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            642 non-null    object 
 1   Reference area      642 non-null    object 
 2   Measure             642 non-null    object 
 3   Unit of measure     642 non-null    object 
 4   TIME_PERIOD         642 non-null    int64  
 5   OBS_VALUE           642 non-null    float64
 6   Observation status  642 non-null    object 
 7   Unit multiplier     642 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 45.1+ KB


##### NOT FEEL SAFE (NEG)

In [112]:
not_feel_safe_df = wellbeing_df[wellbeing_df["Measure"] == "Not feeling safe at night"]

In [113]:
not_feel_safe_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 53 to 5369
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            623 non-null    object 
 1   Reference area      623 non-null    object 
 2   Measure             623 non-null    object 
 3   Unit of measure     623 non-null    object 
 4   TIME_PERIOD         623 non-null    int64  
 5   OBS_VALUE           623 non-null    float64
 6   Observation status  623 non-null    object 
 7   Unit multiplier     623 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 43.8+ KB


##### FEEL SAFE

In [115]:
feel_safe_df = wellbeing_df[wellbeing_df["Measure"] == "Feeling safe at night"]

In [116]:
feel_safe_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 68 to 5324
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            675 non-null    object 
 1   Reference area      675 non-null    object 
 2   Measure             675 non-null    object 
 3   Unit of measure     675 non-null    object 
 4   TIME_PERIOD         675 non-null    int64  
 5   OBS_VALUE           675 non-null    float64
 6   Observation status  675 non-null    object 
 7   Unit multiplier     675 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 47.5+ KB


In [117]:
feel_safe_df.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,TIME_PERIOD,OBS_VALUE,Observation status,Unit multiplier
68,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2016,65.331679,Normal value,Units
69,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2017,69.671352,Normal value,Units
70,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2018,69.671352,Normal value,Units
71,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2019,69.671352,Normal value,Units
72,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2020,78.012499,Normal value,Units


##### HEALTH NEGATIVE (NEG)

In [119]:
perceived_health_neg = wellbeing_df[wellbeing_df["Measure"] == "Perceived health as negative"]

In [120]:
perceived_health_neg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 586 entries, 114 to 5378
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            586 non-null    object 
 1   Reference area      586 non-null    object 
 2   Measure             586 non-null    object 
 3   Unit of measure     586 non-null    object 
 4   TIME_PERIOD         586 non-null    int64  
 5   OBS_VALUE           586 non-null    float64
 6   Observation status  586 non-null    object 
 7   Unit multiplier     586 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 41.2+ KB


##### SOCIAL SUPPORT

In [122]:
social_support_df = wellbeing_df[wellbeing_df["Measure"] == "Social support"]

In [123]:
social_support_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 673 entries, 186 to 5381
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            673 non-null    object 
 1   Reference area      673 non-null    object 
 2   Measure             673 non-null    object 
 3   Unit of measure     673 non-null    object 
 4   TIME_PERIOD         673 non-null    int64  
 5   OBS_VALUE           673 non-null    float64
 6   Observation status  673 non-null    object 
 7   Unit multiplier     673 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 47.3+ KB


##### DEPRESSION (TO FEW)

In [125]:
self_rep_depression_df = wellbeing_df[wellbeing_df["Measure"] == "Self-reported depression"]

In [126]:
self_rep_depression_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 251 to 5382
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            49 non-null     object 
 1   Reference area      49 non-null     object 
 2   Measure             49 non-null     object 
 3   Unit of measure     49 non-null     object 
 4   TIME_PERIOD         49 non-null     int64  
 5   OBS_VALUE           49 non-null     float64
 6   Observation status  49 non-null     object 
 7   Unit multiplier     49 non-null     object 
dtypes: float64(1), int64(1), object(6)
memory usage: 3.4+ KB


##### DEATHS OF SUICIDE, ALCOHOL and DRUGS

In [128]:
deaths_suicide_alc_drug_df = wellbeing_df[wellbeing_df["Measure"] == "Deaths from suicide, alcohol, drugs"]

In [129]:
deaths_suicide_alc_drug_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 630 entries, 3866 to 5465
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            630 non-null    object 
 1   Reference area      630 non-null    object 
 2   Measure             630 non-null    object 
 3   Unit of measure     630 non-null    object 
 4   TIME_PERIOD         630 non-null    int64  
 5   OBS_VALUE           630 non-null    float64
 6   Observation status  630 non-null    object 
 7   Unit multiplier     630 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 44.3+ KB


In [130]:
all_unique = deaths_suicide_alc_drug_df["Reference area"].unique()

In [131]:
all_unique

array(['Ireland', 'Estonia', 'Spain', 'Slovenia', 'Australia', 'Hungary',
       'Czechia', 'Sweden', 'United States', 'Costa Rica', 'Norway',
       'France', 'South Africa', 'Brazil', 'Netherlands', 'Canada',
       'Chile', 'Poland', 'Iceland', 'New Zealand', 'Greece', 'Denmark',
       'Switzerland', 'Latvia', 'Luxembourg', 'Austria', 'Japan',
       'Türkiye', 'Israel', 'Mexico', 'Slovak Republic', 'Korea',
       'Lithuania', 'Belgium', 'Portugal', 'Germany', 'Colombia', 'Italy',
       'Finland', 'United Kingdom'], dtype=object)

In [132]:
print(len(all_unique))

40


## 2.4) Taxation data

### 2.4.1) European Comission total taxation of GDP

#### 2.4.1.1) Renaming column with countries to Reference area

In [136]:
total_taxes_df.rename(columns = {"Unnamed: 0": "Reference area"}, inplace=True)

In [137]:
total_taxes_df.head()

Unnamed: 0,Reference area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


#### 2.4.1.2) Reshaping (melting) the table, so that year is one column and country and year together identify a row  

In [139]:
total_tax_melted_df = pd.melt(total_taxes_df, id_vars=["Reference area"], var_name = "Year", value_name = "Taxation")
total_tax_melted_df.head()

Unnamed: 0,Reference area,Year,Taxation
0,EU-27,2010,37.922746
1,EA-19,2010,38.101209
2,Belgium,2010,43.580535
3,Bulgaria,2010,25.398276
4,Czechia,2010,32.854823


In [140]:
total_tax_melted_df.isna().sum()

Reference area     64
Year                0
Taxation          100
dtype: int64

In [141]:
df_2023 = total_tax_melted_df[total_tax_melted_df["Year"]== "2021"]
df_2023.head()

Unnamed: 0,Reference area,Year,Taxation
407,EU-27,2021,40.395208
408,EA-19,2021,40.786176
409,Belgium,2021,43.166396
410,Bulgaria,2021,30.785349
411,Czechia,2021,35.890107


# 3) Collecting data frames

## 3.1) All data frames listed

In [144]:

list_of_dfs= []

list_of_dfs.append(social_exp_eu_usa_aus)
list_of_dfs.append(gdp_1970_defl_simple)
list_of_dfs.append(feel_safe_df)
list_of_dfs.append(percivd_health_pos_df)

world_happiness_2023.rename(columns={"year":"TIME_PERIOD"}, inplace=True)

list_of_dfs.append(world_happiness_2023)


In [145]:
for df in list_of_dfs:
    df.rename(columns={"TIME_PERIOD": "Year"}, inplace=True)

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"TIME_PERIOD": "Year"}, inplace=True)


In [146]:
world_happiness_2023.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


In [147]:
world_happiness_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   Year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


## 3.2) Deleting all rows before 2005

In [149]:
social_exp_2005 = social_exp_eu_usa_aus[(social_exp_eu_usa_aus["Year"]>= 2005) & (social_exp_eu_usa_aus["Unit of measure"] == "Percentage of general government expenditure")]
gdp_defl_2005 = gdp_1970_defl_simple[gdp_1970_defl_simple["Year"]>= 2005]
feel_safe_2005 = feel_safe_df[feel_safe_df["Year"]>= 2005]
percivd_health_2005 = percivd_health_pos_df[percivd_health_pos_df["Year"]>= 2005]
world_happiness_2005 = world_happiness_2023[world_happiness_2023["Year"]>= 2005]

In [150]:
all_unit_of_measure = social_exp_2005["Unit of measure"].unique()
print(all_unit_of_measure)

['Percentage of general government expenditure']


In [151]:
social_exp_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 467 entries, 0 to 1814
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            467 non-null    object 
 1   Reference area      467 non-null    object 
 2   Measure             467 non-null    object 
 3   Unit of measure     467 non-null    object 
 4   Expenditure source  467 non-null    object 
 5   Spending type       467 non-null    object 
 6   Programme type      467 non-null    object 
 7   Year                467 non-null    int64  
 8   OBS_VALUE           467 non-null    float64
 9   OBS_STATUS          467 non-null    object 
 10  Observation status  467 non-null    object 
 11  Unit multiplier     467 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 47.4+ KB


In [152]:
gdp_defl_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1171 entries, 0 to 2623
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   REF_AREA              1171 non-null   object 
 1   Reference area        1171 non-null   object 
 2   Institutional sector  1171 non-null   object 
 3   Transaction           1171 non-null   object 
 4   Unit of measure       1171 non-null   object 
 5   Price base            1171 non-null   object 
 6   Year                  1171 non-null   int64  
 7   OBS_VALUE             1171 non-null   float64
 8   OBS_STATUS            1171 non-null   object 
 9   Observation status    1171 non-null   object 
 10  Unit multiplier       1171 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 109.8+ KB


In [153]:
feel_safe_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 68 to 5324
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            675 non-null    object 
 1   Reference area      675 non-null    object 
 2   Measure             675 non-null    object 
 3   Unit of measure     675 non-null    object 
 4   Year                675 non-null    int64  
 5   OBS_VALUE           675 non-null    float64
 6   Observation status  675 non-null    object 
 7   Unit multiplier     675 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 47.5+ KB


In [154]:
percivd_health_2005.info()

<class 'pandas.core.frame.DataFrame'>
Index: 570 entries, 0 to 5374
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            570 non-null    object 
 1   Reference area      570 non-null    object 
 2   Measure             570 non-null    object 
 3   Unit of measure     570 non-null    object 
 4   Year                570 non-null    int64  
 5   OBS_VALUE           570 non-null    float64
 6   Observation status  570 non-null    object 
 7   Unit multiplier     570 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 40.1+ KB


In [155]:
world_happiness_2005.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   Year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


In [156]:
world_happiness_2005.describe()

Unnamed: 0,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
count,2199.0,2199.0,2179.0,2186.0,2145.0,2166.0,2126.0,2083.0,2175.0,2183.0
mean,2014.161437,5.479226,9.389766,0.810679,63.294583,0.747858,9.6e-05,0.745195,0.652143,0.271501
std,4.718736,1.125529,1.153387,0.120952,6.901104,0.14015,0.161083,0.185837,0.105922,0.086875
min,2005.0,1.281271,5.526723,0.228217,6.72,0.257534,-0.337527,0.035198,0.178886,0.082737
25%,2010.0,4.64675,8.499764,0.746609,59.119999,0.656528,-0.112116,0.688139,0.571684,0.20766
50%,2014.0,5.432437,9.498955,0.835535,65.050003,0.769821,-0.022671,0.799654,0.663063,0.260671
75%,2018.0,6.30946,10.373216,0.904792,68.5,0.859382,0.09207,0.868827,0.737936,0.322894
max,2022.0,8.018934,11.663788,0.987343,74.474998,0.985178,0.702708,0.983276,0.883586,0.70459


In [157]:
world_happiness_2005.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297,0.258195
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421,0.237092
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907,0.275324
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835,0.267175
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513,0.267919


## 3.3) Checking for Observation status and deleting all non-normal status

#### 3.3.1) Social expenditure

In [160]:
obs_stat_social = social_exp_2005["Observation status"].unique()
print(obs_stat_social)

['Normal value']


#### 3.3.2) GDP

In [162]:
obs2 = gdp_defl_2005["Observation status"].unique()
print(obs2)

obs2_counts = gdp_defl_2005["Observation status"].value_counts()
print("Counts of each unique value:\n", obs2_counts)

['Normal value' 'Provisional value' 'Estimated value' 'Definition differs'
 'Time series break']
Counts of each unique value:
 Observation status
Normal value          1087
Estimated value         44
Provisional value       37
Definition differs       2
Time series break        1
Name: count, dtype: int64


In [163]:
gdp_no_etimated = gdp_defl_2005[gdp_defl_2005["Observation status"]!= "Estimated value"]
gdp_no_etimated1 = gdp_no_etimated[gdp_no_etimated["Observation status"]!= "Provisional value"]
gdp_no_etimated2 = gdp_no_etimated1[gdp_no_etimated1["Observation status"]!= "Definition differs"]
gdp_no_etimated3 = gdp_no_etimated2[gdp_no_etimated2["Observation status"]!= "Time series break"]

In [164]:
obs3_counts = gdp_no_etimated3["Observation status"].value_counts()
print(obs3_counts)

Observation status
Normal value    1087
Name: count, dtype: int64


#### 3.3.3) Feel safe

In [166]:
obs3 = feel_safe_2005["Observation status"].unique()
print(obs3)

['Normal value']


#### 3.3.4) Perceived health

In [168]:
obs4 = percivd_health_2005["Observation status"].unique()
print(obs4)
obs4_counts = percivd_health_2005["Observation status"].value_counts()
print(obs4_counts)

['Normal value' 'Definition differs' 'Time series break']
Observation status
Normal value          497
Definition differs     62
Time series break      11
Name: count, dtype: int64


In [169]:
percvd_health_clean1 = percivd_health_2005[percivd_health_2005["Observation status"]!= "Definition differs"]
percvd_health_clean2 = percvd_health_clean1[percvd_health_clean1["Observation status"]!= "Time series break"]
obs_counts4 = percvd_health_clean2 ["Observation status"].value_counts()
print(obs_counts4)

Observation status
Normal value    497
Name: count, dtype: int64


## 3.4) Removing null and na values and duplicates

#### 3.4.1) Social expenditure

In [172]:
social_exp_2005.isna().sum()

REF_AREA              0
Reference area        0
Measure               0
Unit of measure       0
Expenditure source    0
Spending type         0
Programme type        0
Year                  0
OBS_VALUE             0
OBS_STATUS            0
Observation status    0
Unit multiplier       0
dtype: int64

In [173]:
social_exp_clean_df = social_exp_2005.drop_duplicates()
social_exp_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 467 entries, 0 to 1814
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            467 non-null    object 
 1   Reference area      467 non-null    object 
 2   Measure             467 non-null    object 
 3   Unit of measure     467 non-null    object 
 4   Expenditure source  467 non-null    object 
 5   Spending type       467 non-null    object 
 6   Programme type      467 non-null    object 
 7   Year                467 non-null    int64  
 8   OBS_VALUE           467 non-null    float64
 9   OBS_STATUS          467 non-null    object 
 10  Observation status  467 non-null    object 
 11  Unit multiplier     467 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 47.4+ KB


#### 3.4.2) GDP

In [175]:
gdp_no_etimated3.isna().sum()

REF_AREA                0
Reference area          0
Institutional sector    0
Transaction             0
Unit of measure         0
Price base              0
Year                    0
OBS_VALUE               0
OBS_STATUS              0
Observation status      0
Unit multiplier         0
dtype: int64

In [176]:
gdp_clean_df = gdp_no_etimated3.drop_duplicates()
gdp_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1087 entries, 0 to 2623
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   REF_AREA              1087 non-null   object 
 1   Reference area        1087 non-null   object 
 2   Institutional sector  1087 non-null   object 
 3   Transaction           1087 non-null   object 
 4   Unit of measure       1087 non-null   object 
 5   Price base            1087 non-null   object 
 6   Year                  1087 non-null   int64  
 7   OBS_VALUE             1087 non-null   float64
 8   OBS_STATUS            1087 non-null   object 
 9   Observation status    1087 non-null   object 
 10  Unit multiplier       1087 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 101.9+ KB


#### 3.4.3) Feel safe

In [178]:
feel_safe_2005.isna().sum()

REF_AREA              0
Reference area        0
Measure               0
Unit of measure       0
Year                  0
OBS_VALUE             0
Observation status    0
Unit multiplier       0
dtype: int64

In [179]:
feel_safe_clean_df = feel_safe_2005.drop_duplicates()
feel_safe_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 675 entries, 68 to 5324
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            675 non-null    object 
 1   Reference area      675 non-null    object 
 2   Measure             675 non-null    object 
 3   Unit of measure     675 non-null    object 
 4   Year                675 non-null    int64  
 5   OBS_VALUE           675 non-null    float64
 6   Observation status  675 non-null    object 
 7   Unit multiplier     675 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 47.5+ KB


#### 3.4.4) Perceived health

In [181]:
percvd_health_clean2.isna().sum()

REF_AREA              0
Reference area        0
Measure               0
Unit of measure       0
Year                  0
OBS_VALUE             0
Observation status    0
Unit multiplier       0
dtype: int64

In [182]:
percvd_health_clean_df = percvd_health_clean2.drop_duplicates()
percvd_health_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 497 entries, 0 to 5374
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   REF_AREA            497 non-null    object 
 1   Reference area      497 non-null    object 
 2   Measure             497 non-null    object 
 3   Unit of measure     497 non-null    object 
 4   Year                497 non-null    int64  
 5   OBS_VALUE           497 non-null    float64
 6   Observation status  497 non-null    object 
 7   Unit multiplier     497 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 34.9+ KB


#### 3.4.5) World happiness

In [184]:
world_happiness_2005.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2199 non-null   object 
 1   Year                              2199 non-null   int64  
 2   Life Ladder                       2199 non-null   float64
 3   Log GDP per capita                2179 non-null   float64
 4   Social support                    2186 non-null   float64
 5   Healthy life expectancy at birth  2145 non-null   float64
 6   Freedom to make life choices      2166 non-null   float64
 7   Generosity                        2126 non-null   float64
 8   Perceptions of corruption         2083 non-null   float64
 9   Positive affect                   2175 non-null   float64
 10  Negative affect                   2183 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 189.1+ KB


In [185]:
world_happiness_2005.isna().sum()

Country name                          0
Year                                  0
Life Ladder                           0
Log GDP per capita                   20
Social support                       13
Healthy life expectancy at birth     54
Freedom to make life choices         33
Generosity                           73
Perceptions of corruption           116
Positive affect                      24
Negative affect                      16
dtype: int64

In [186]:
world_happiness_2005.drop(columns = ["Negative affect"], inplace=True) 

In [187]:
world_happiness_2005.isna().sum()

Country name                          0
Year                                  0
Life Ladder                           0
Log GDP per capita                   20
Social support                       13
Healthy life expectancy at birth     54
Freedom to make life choices         33
Generosity                           73
Perceptions of corruption           116
Positive affect                      24
dtype: int64

In [188]:
world_happiness_no_na = world_happiness_2005.dropna()
world_happiness_no_na.isna().sum()

Country name                        0
Year                                0
Life Ladder                         0
Log GDP per capita                  0
Social support                      0
Healthy life expectancy at birth    0
Freedom to make life choices        0
Generosity                          0
Perceptions of corruption           0
Positive affect                     0
dtype: int64

In [189]:
world_happiness_no_na.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1959 entries, 0 to 2198
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1959 non-null   object 
 1   Year                              1959 non-null   int64  
 2   Life Ladder                       1959 non-null   float64
 3   Log GDP per capita                1959 non-null   float64
 4   Social support                    1959 non-null   float64
 5   Healthy life expectancy at birth  1959 non-null   float64
 6   Freedom to make life choices      1959 non-null   float64
 7   Generosity                        1959 non-null   float64
 8   Perceptions of corruption         1959 non-null   float64
 9   Positive affect                   1959 non-null   float64
dtypes: float64(8), int64(1), object(1)
memory usage: 168.4+ KB


In [190]:
world_happiness_clean_df = world_happiness_no_na.drop_duplicates()

In [191]:
world_happiness_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1959 entries, 0 to 2198
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      1959 non-null   object 
 1   Year                              1959 non-null   int64  
 2   Life Ladder                       1959 non-null   float64
 3   Log GDP per capita                1959 non-null   float64
 4   Social support                    1959 non-null   float64
 5   Healthy life expectancy at birth  1959 non-null   float64
 6   Freedom to make life choices      1959 non-null   float64
 7   Generosity                        1959 non-null   float64
 8   Perceptions of corruption         1959 non-null   float64
 9   Positive affect                   1959 non-null   float64
dtypes: float64(8), int64(1), object(1)
memory usage: 168.4+ KB


#### 3.4.6) Unemployment

In [193]:
unempl_euro_melted_df.isna().sum()

geo                                            0
TIME_PERIOD                                    0
Unemployment as percentage of labour force    15
dtype: int64

In [194]:
unempl_no_na = unempl_euro_melted_df.dropna()
unempl_no_na.isna().sum()

geo                                           0
TIME_PERIOD                                   0
Unemployment as percentage of labour force    0
dtype: int64

In [195]:
unempl_clean_df = unempl_euro_melted_df.drop_duplicates()
unempl_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         444 non-null    object
 1   TIME_PERIOD                                 444 non-null    object
 2   Unemployment as percentage of labour force  429 non-null    object
dtypes: object(3)
memory usage: 10.5+ KB


## 3.6) Removing all unecessary columns and aligning column names before merging

### 3.6.1) Social expenditure

In [198]:
social_exp_clean_df.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Expenditure source,Spending type,Programme type,Year,OBS_VALUE,OBS_STATUS,Observation status,Unit multiplier
0,HUN,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2014,42.619,A,Normal value,Units
1,HUN,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2015,40.403,A,Normal value,Units
2,HUN,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2016,43.11,A,Normal value,Units
3,HUN,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2017,41.605,A,Normal value,Units
4,HUN,Hungary,Social expenditure,Percentage of general government expenditure,Public,Total,Total,2018,40.342,A,Normal value,Units


In [199]:
social_exp_clean_df.rename(columns={'OBS_VALUE': 'Social expenditure as % of general gorvernment expenditure', "Year": "TIME_PERIOD"}, inplace=True)

In [200]:
social_exp_clean_df.drop(columns = ["Measure", "Unit of measure", "Expenditure source", "Spending type", "Programme type", "Observation status", "Unit multiplier", "OBS_STATUS"], inplace=True)

In [201]:
social_exp_clean_df.head()

Unnamed: 0,REF_AREA,Reference area,TIME_PERIOD,Social expenditure as % of general gorvernment expenditure
0,HUN,Hungary,2014,42.619
1,HUN,Hungary,2015,40.403
2,HUN,Hungary,2016,43.11
3,HUN,Hungary,2017,41.605
4,HUN,Hungary,2018,40.342


### 3.6.2) Feel safe

In [203]:
feel_safe_clean_df.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Year,OBS_VALUE,Observation status,Unit multiplier
68,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2016,65.331679,Normal value,Units
69,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2017,69.671352,Normal value,Units
70,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2018,69.671352,Normal value,Units
71,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2019,69.671352,Normal value,Units
72,SVK,Slovak Republic,Feeling safe at night,Percentage of population aged 15 years or over,2020,78.012499,Normal value,Units


In [204]:
feel_safe_clean_df.rename(columns={'OBS_VALUE': 'Feeling safe at night as % percentage of population 15 years or older', "Year": "TIME_PERIOD"}, inplace=True)
feel_safe_clean_df.drop(columns = ["Measure", "Unit of measure", "Observation status", "Unit multiplier"], inplace=True)
feel_safe_clean_df.head()

Unnamed: 0,REF_AREA,Reference area,TIME_PERIOD,Feeling safe at night as % percentage of population 15 years or older
68,SVK,Slovak Republic,2016,65.331679
69,SVK,Slovak Republic,2017,69.671352
70,SVK,Slovak Republic,2018,69.671352
71,SVK,Slovak Republic,2019,69.671352
72,SVK,Slovak Republic,2020,78.012499


### 3.6.3) Percieved health

In [206]:
percvd_health_clean_df.head()

Unnamed: 0,REF_AREA,Reference area,Measure,Unit of measure,Year,OBS_VALUE,Observation status,Unit multiplier
0,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2017,44.2,Normal value,Units
1,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2018,46.9,Normal value,Units
2,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2019,47.1,Normal value,Units
3,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2020,49.7,Normal value,Units
4,LVA,Latvia,Perceived health as positive,Percentage of population aged 16 years or over,2021,49.8,Normal value,Units


In [207]:
world_happiness_clean_df.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513


In [208]:
unempl_clean_df.head()

Unnamed: 0,geo,TIME_PERIOD,Unemployment as percentage of labour force
0,AT,2012,5.2
1,BA,2012,
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5


## 3.7) Merging data frames

In [210]:
social_exp_clean_df
feel_safe_clean_df
percvd_health_clean_df
world_happiness_clean_df
unempl_clean_df

Unnamed: 0,geo,TIME_PERIOD,Unemployment as percentage of labour force
0,AT,2012,5.2
1,BA,2012,
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5
...,...,...,...
439,RS,2023,9.5
440,SE,2023,7.7
441,SI,2023,3.7
442,SK,2023,5.8


In [211]:
unempl_clean_df.dropna(inplace=True)

In [212]:
unempl_clean_df.isna().sum()

geo                                           0
TIME_PERIOD                                   0
Unemployment as percentage of labour force    0
dtype: int64

In [213]:
unempl_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 429 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         429 non-null    object
 1   TIME_PERIOD                                 429 non-null    object
 2   Unemployment as percentage of labour force  429 non-null    object
dtypes: object(3)
memory usage: 13.4+ KB


In [214]:
unempl_clean_df.head()

Unnamed: 0,geo,TIME_PERIOD,Unemployment as percentage of labour force
0,AT,2012,5.2
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5
5,CY,2012,11.9


In [215]:
#unempl_clean_df.sort_values(by="TIME_PERIOD", inplace=True)
#unempl_clean_df.head()

In [216]:
country_mapping = {country.alpha_2: country.name for country in pycountry.countries}

# Replace abbreviations with full country names
unempl_clean_df['geo'] = unempl_clean_df['geo'].replace(country_mapping)
unempl_clean_df.head()

Unnamed: 0,geo,TIME_PERIOD,Unemployment as percentage of labour force
0,Austria,2012,5.2
2,Belgium,2012,7.6
3,Bulgaria,2012,13.3
4,Switzerland,2012,4.5
5,Cyprus,2012,11.9


In [217]:
geo_unique = unempl_clean_df["geo"].value_counts()
print(geo_unique)

geo
Austria                   12
Poland                    12
Lithuania                 12
Luxembourg                12
Latvia                    12
Malta                     12
Netherlands               12
Norway                    12
Portugal                  12
Belgium                   12
Romania                   12
Serbia                    12
Sweden                    12
Slovenia                  12
Slovakia                  12
Türkiye                   12
Italy                     12
Iceland                   12
Ireland                   12
Estonia                   12
Bulgaria                  12
Switzerland               12
Cyprus                    12
Czechia                   12
Germany                   12
Denmark                   12
EA20                      12
EL                        12
Hungary                   12
Spain                     12
EU27_2020                 12
Finland                   12
France                    12
Croatia                   12
North Mace

In [218]:
unempl_clean_df.isna().sum()

geo                                           0
TIME_PERIOD                                   0
Unemployment as percentage of labour force    0
dtype: int64

In [219]:
world_happiness_clean_df.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513


In [220]:
world_happiness_clean_df.rename(columns = {"Year": "TIME_PERIOD", "Country name": "Reference area"}, inplace=True)
world_happiness_clean_df.head()

Unnamed: 0,Reference area,TIME_PERIOD,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
0,Afghanistan,2008,3.72359,7.350416,0.450662,50.5,0.718114,0.167652,0.881686,0.414297
1,Afghanistan,2009,4.401778,7.508646,0.552308,50.799999,0.678896,0.190809,0.850035,0.481421
2,Afghanistan,2010,4.758381,7.6139,0.539075,51.099998,0.600127,0.121316,0.706766,0.516907
3,Afghanistan,2011,3.831719,7.581259,0.521104,51.400002,0.495901,0.163571,0.731109,0.479835
4,Afghanistan,2012,3.782938,7.660506,0.520637,51.700001,0.530935,0.237588,0.77562,0.613513


In [221]:
df_hungary = world_happiness_clean_df[world_happiness_clean_df["Reference area"]=="Hungary"]
df_hungary.iloc[1:15]

Unnamed: 0,Reference area,TIME_PERIOD,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
807,Hungary,2009,4.8946,10.092547,0.900874,65.639999,0.464373,-0.128532,0.914701,0.5752
808,Hungary,2010,4.725132,10.105513,0.895694,65.800003,0.513835,-0.148766,0.983276,0.574386
809,Hungary,2011,4.917603,10.126842,0.893662,65.959999,0.6311,-0.092765,0.939908,0.586092
810,Hungary,2012,4.683358,10.119426,0.906114,66.120003,0.569232,-0.139944,0.930297,0.581949
811,Hungary,2013,4.914467,10.140044,0.877318,66.279999,0.673728,-0.116701,0.911533,0.646606
812,Hungary,2014,5.180563,10.184189,0.844735,66.440002,0.494475,-0.153601,0.855361,0.577917
813,Hungary,2015,5.344383,10.22297,0.858734,66.599998,0.557721,-0.201611,0.90753,0.649912
814,Hungary,2016,5.448902,10.247692,0.899512,66.75,0.553952,-0.190679,0.924186,0.589962
815,Hungary,2017,6.065039,10.292184,0.876748,66.900002,0.661166,-0.143131,0.886361,0.643605
816,Hungary,2018,5.935771,10.345686,0.940591,67.050003,0.692627,-0.246665,0.911277,0.594677


In [222]:
social_exp_clean_df.drop(columns="REF_AREA", inplace=True)


In [223]:
social_exp_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 467 entries, 0 to 1814
Data columns (total 3 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Reference area                                              467 non-null    object 
 1   TIME_PERIOD                                                 467 non-null    int64  
 2   Social expenditure as % of general gorvernment expenditure  467 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ KB


### 3.7.1) First test: social_expenditure and feeling safe at night

In [225]:
merged_df1 = pd.merge(social_exp_clean_df, feel_safe_clean_df, on =['Reference area', 'TIME_PERIOD'], how='inner') 
merged_df1.head()

Unnamed: 0,Reference area,TIME_PERIOD,Social expenditure as % of general gorvernment expenditure,REF_AREA,Feeling safe at night as % percentage of population 15 years or older
0,Hungary,2014,42.619,HUN,54.841674
1,Hungary,2015,40.403,HUN,54.841674
2,Hungary,2016,43.11,HUN,54.841674
3,Hungary,2017,41.605,HUN,68.892288
4,Hungary,2018,40.342,HUN,68.892288


In [226]:
merged_df1.isna().sum()


Reference area                                                           0
TIME_PERIOD                                                              0
Social expenditure as % of general gorvernment expenditure               0
REF_AREA                                                                 0
Feeling safe at night as % percentage of population 15 years or older    0
dtype: int64

ref_area = merged_df1["TIME_PERIOD"].value_counts()
print(ref_area)

### 3.7.2) Happiness and social expenditure

In [229]:
merged_df2 = pd.merge(social_exp_clean_df, world_happiness_clean_df, on =['Reference area', 'TIME_PERIOD'], how='inner') 
merged_df2.head()

Unnamed: 0,Reference area,TIME_PERIOD,Social expenditure as % of general gorvernment expenditure,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
0,Hungary,2014,42.619,5.180563,10.184189,0.844735,66.440002,0.494475,-0.153601,0.855361,0.577917
1,Hungary,2015,40.403,5.344383,10.22297,0.858734,66.599998,0.557721,-0.201611,0.90753,0.649912
2,Hungary,2016,43.11,5.448902,10.247692,0.899512,66.75,0.553952,-0.190679,0.924186,0.589962
3,Hungary,2017,41.605,6.065039,10.292184,0.876748,66.900002,0.661166,-0.143131,0.886361,0.643605
4,Hungary,2018,40.342,5.935771,10.345686,0.940591,67.050003,0.692627,-0.246665,0.911277,0.594677


In [230]:
merged_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 11 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Reference area                                              352 non-null    object 
 1   TIME_PERIOD                                                 352 non-null    int64  
 2   Social expenditure as % of general gorvernment expenditure  352 non-null    float64
 3   Life Ladder                                                 352 non-null    float64
 4   Log GDP per capita                                          352 non-null    float64
 5   Social support                                              352 non-null    float64
 6   Healthy life expectancy at birth                            352 non-null    float64
 7   Freedom to make life choices                                352 non-null    float64
 8   

In [231]:
merged_df2.sort_values(by="TIME_PERIOD", inplace=True)
merged_df2.iloc[:20]

Unnamed: 0,Reference area,TIME_PERIOD,Social expenditure as % of general gorvernment expenditure,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
260,Austria,2006,51.732,7.122211,10.836452,0.93635,69.5,0.941382,0.298436,0.490111,0.746153
247,Ireland,2006,47.199,7.144247,10.982106,0.967041,69.620003,0.943275,0.236722,0.472849,0.814785
169,France,2006,53.698,6.5827,10.65403,0.943929,70.800003,0.789121,0.122206,0.69927,0.694246
284,Latvia,2006,33.13,4.709502,10.042187,0.884499,63.099998,0.640807,-0.2336,0.937049,0.590247
76,Lithuania,2006,40.054,5.954443,10.042293,0.93044,63.5,0.567255,-0.298515,0.966879,0.56656
141,Portugal,2006,47.833,5.405246,10.358528,0.90529,68.339996,0.882068,-0.182437,0.880059,0.646715
303,New Zealand,2006,51.605,7.305014,10.54081,0.946047,69.720001,0.93208,0.30632,0.22422,0.824703
181,Finland,2006,49.462,7.672449,10.745317,0.964563,68.720001,0.96858,-0.008802,0.13243,0.682819
30,Norway,2006,47.287,7.415682,11.048457,0.958511,69.400002,0.959533,0.102744,0.39715,0.766967
153,Switzerland,2006,47.832,7.473253,11.069195,0.951352,71.160004,0.918958,0.284541,0.407931,0.742094


In [232]:
time_periods = merged_df2["TIME_PERIOD"].value_counts()
print(time_periods)

TIME_PERIOD
2012    29
2015    29
2016    29
2017    29
2014    28
2018    28
2013    27
2019    27
2011    26
2010    24
2008    21
2009    19
2007    18
2006    12
2020     5
2021     1
Name: count, dtype: int64


In [233]:
### 3.7.3) Unemployment added to Happiness and Social Expenditure data frame

In [405]:
unempl_clean_df.rename(columns={"geo":"Reference area"}, inplace=True)
unempl_clean_df.head()

Unnamed: 0,Reference area,TIME_PERIOD,Unemployment as percentage of labour force
0,Austria,2012,5.2
2,Belgium,2012,7.6
3,Bulgaria,2012,13.3
4,Switzerland,2012,4.5
5,Cyprus,2012,11.9


In [407]:
merged_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 352 entries, 260 to 104
Data columns (total 11 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Reference area                                              352 non-null    object 
 1   TIME_PERIOD                                                 352 non-null    int64  
 2   Social expenditure as % of general gorvernment expenditure  352 non-null    float64
 3   Life Ladder                                                 352 non-null    float64
 4   Log GDP per capita                                          352 non-null    float64
 5   Social support                                              352 non-null    float64
 6   Healthy life expectancy at birth                            352 non-null    float64
 7   Freedom to make life choices                                352 non-null    float64
 8   Gen

In [409]:
unempl_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 429 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Reference area                              429 non-null    object
 1   TIME_PERIOD                                 429 non-null    object
 2   Unemployment as percentage of labour force  429 non-null    object
dtypes: object(3)
memory usage: 13.4+ KB


In [411]:
unempl_percent = unempl_clean_df["Unemployment as percentage of labour force"].value_counts()
print(unempl_percent)

Unemployment as percentage of labour force
6.8      11
6.1       8
5.6       7
5.0       7
4.8       7
         ..
21.8      1
11.8      1
19.5      1
7.2 b     1
9.4       1
Name: count, Length: 194, dtype: int64


In [549]:
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "9.9 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "6.7 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "6.0 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "7.2 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "5.8 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "10.6 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "5.2 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "6.6 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "3.3 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "3.7 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "5.5 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "14.9 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "7.9 d"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "6.2 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "11.2 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "12.0 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "12.0 d"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "13.0 d"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "7.3 d"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "5.1 b"]
unempl_clean_df = unempl_clean_df[unempl_clean_df['Unemployment as percentage of labour force'] != "12.2 d"]


In [551]:
unempl_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 407 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Reference area                              407 non-null    object
 1   TIME_PERIOD                                 407 non-null    int64 
 2   Unemployment as percentage of labour force  407 non-null    object
dtypes: int64(1), object(2)
memory usage: 12.7+ KB


In [555]:
unempl_clean_df['TIME_PERIOD'] = pd.to_numeric(unempl_clean_df['TIME_PERIOD'], errors='raise').astype('int64')

# Convert to float64 (if values contain decimals)
unempl_clean_df['Unemployment as percentage of labour force'] = pd.to_numeric(unempl_clean_df['Unemployment as percentage of labour force'], errors='raise').astype('float64')
unempl_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 407 entries, 0 to 443
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Reference area                              407 non-null    object 
 1   TIME_PERIOD                                 407 non-null    int64  
 2   Unemployment as percentage of labour force  407 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 12.7+ KB


In [557]:
merged_df3 = pd.merge(unempl_clean_df, merged_df2, on =['Reference area', 'TIME_PERIOD'], how='inner') 
merged_df3.head()


Unnamed: 0,Reference area,TIME_PERIOD,Unemployment as percentage of labour force,Social expenditure as % of general gorvernment expenditure,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect
0,Austria,2012,5.2,53.914,7.400689,10.883644,0.945142,70.099998,0.919704,0.113814,0.770586,0.71215
1,Belgium,2012,7.6,49.511,6.935122,10.783341,0.927117,69.519997,0.855267,-0.054311,0.757573,0.718187
2,Switzerland,2012,4.5,47.434,7.776209,11.107383,0.946864,71.519997,0.945428,0.132033,0.323241,0.792705
3,Czechia,2012,7.0,44.45,6.334149,10.4249,0.912427,67.940002,0.739809,-0.157698,0.9568,0.634971
4,Germany,2012,5.1,54.901,6.702362,10.817224,0.926407,70.040001,0.90444,0.066589,0.679237,0.699049


In [559]:
merged_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 12 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Reference area                                              173 non-null    object 
 1   TIME_PERIOD                                                 173 non-null    int64  
 2   Unemployment as percentage of labour force                  173 non-null    float64
 3   Social expenditure as % of general gorvernment expenditure  173 non-null    float64
 4   Life Ladder                                                 173 non-null    float64
 5   Log GDP per capita                                          173 non-null    float64
 6   Social support                                              173 non-null    float64
 7   Healthy life expectancy at birth                            173 non-null    float64
 8   