# Extracting data on taxation (main aggregated taxes) in European countries from 2010 to 2022

# Links

Data: https://taxation-customs.ec.europa.eu/taxation/economic-analyses/taxation-trends-eu/data-taxation-trends_en

Explanation of the unit of measure: https://taxation-customs.ec.europa.eu/document/download/1d57efba-ea2b-4e2e-8e33-b5b0c24da640_en?filename=Reference%20metadata%20on%20methodology%20and%20quality%20for%20Data_on_Taxation_final2024.pdf


# Imports

In [5]:
import pandas as pd
import numpy as np

# 1) Extracting data from xlsx file

In [7]:
# Load the workbook; header=2 tells pandas to take the column labels from the
# sheet row at 0-based position 2 and to ignore the rows above it.
df = pd.read_excel(
    io="All_data_files/tax-main-aggregates.xlsx",
    header=2,
)

# 2) Initial exploration of data (for cleaning and transformation)

In [9]:
# Preview the first rows to see how the sheet was parsed (column labels, aggregate rows such as EU-27 / EA-19).
df.head()

Unnamed: 0.1,Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


In [10]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns only.
df.describe()

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Difference 2012-2022 (pp),Ranking 2022,Revenue 2022\n(million EUR)
count,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,27.0,31.0
mean,34.971549,35.297372,35.840976,36.275764,36.585698,36.330365,37.05243,36.657492,36.850943,36.860766,36.845322,37.160131,37.038403,1.196774,14.0,597198.3
std,5.622321,5.556011,5.859375,5.704859,5.682088,5.767238,6.166751,5.787401,5.672944,5.69714,5.789676,5.812674,5.647022,3.012694,7.937254,1474485.0
min,25.398276,25.477641,26.078859,26.905064,27.476607,23.177846,23.651552,22.52835,22.307106,21.929389,19.778489,20.736071,20.92036,-7.4,1.0,4995.7
25%,31.135295,31.572889,31.648698,31.701377,32.203787,32.77489,33.213012,33.313545,33.469553,34.257439,34.149338,34.32837,34.625991,-3.552714e-15,7.5,25935.75
50%,35.524502,35.46472,36.447049,36.253839,37.036155,36.944934,38.03418,37.285551,37.44526,37.73576,37.69931,37.91457,37.526233,1.2,14.0,97487.3
75%,39.331721,39.862323,40.459811,39.951735,40.022773,39.896009,40.960007,40.086113,40.386303,40.191647,40.179829,41.118597,41.495061,2.65,20.5,242617.2
max,45.01615,45.045726,45.756075,46.265653,48.904413,46.375069,50.285574,46.356457,46.256388,47.054823,47.416352,47.579249,46.152086,6.2,27.0,6387768.0


In [11]:
# Column dtypes and non-null counts; used to spot columns that contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  33 non-null     object 
 1   2010                        31 non-null     float64
 2   2011                        31 non-null     float64
 3   2012                        31 non-null     float64
 4   2013                        31 non-null     float64
 5   2014                        31 non-null     float64
 6   2015                        31 non-null     float64
 7   2016                        31 non-null     float64
 8   2017                        31 non-null     float64
 9   2018                        31 non-null     float64
 10  2019                        31 non-null     float64
 11  2020                        31 non-null     float64
 12  2021                        31 non-null     float64
 13  2022                        31 non-nu

In [12]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0                      4
2010                            6
2011                            6
2012                            6
2013                            6
2014                            6
2015                            6
2016                            6
2017                            6
2018                            6
2019                            6
2020                            6
2021                            6
2022                            6
Difference 2012-2022 (pp)       6
Ranking 2022                   10
Revenue 2022\n(million EUR)     6
dtype: int64

In [13]:
# Number of rows that exactly repeat an earlier row.
# NOTE(review): presumably these are the fully empty rows at the bottom of the sheet — confirm.
df.duplicated().sum()

3

# 3) Transforming and cleaning data

### 3.1) Renaming columns

- 'Unnamed: 0' to 'Country'
- 'Difference 2012-2022 (pp)' to 'Tax difference 2012-2022'

In [17]:
# Give the label column and the percentage-point difference column descriptive names.
column_renames = {
    "Unnamed: 0": "Country",
    'Difference 2012-2022 (pp)': 'Tax difference 2012-2022',
}
df = df.rename(columns=column_renames)

In [18]:
# Confirm that the new column names are in place.
df.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Tax difference 2012-2022,Ranking 2022,Revenue 2022\n(million EUR)
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9,,6387768.3
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2,,5482502.8
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0,2.0,239725.1
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0,23.0,26722.7
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8,17.0,97487.3


### 3.2) Removing 'Ranking 2022' and 'Revenue 2022\n(million EUR)'

In [20]:
# Remove the two 2022-only summary columns (ranking and revenue in million EUR).
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps the cell re-runnable: a second execution no longer
# raises KeyError because the columns have already been dropped.
df = df.drop(
    columns=["Revenue 2022\n(million EUR)", "Ranking 2022"],
    errors="ignore",
)
df.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,Tax difference 2012-2022
0,EU-27,37.922746,38.345385,39.301449,39.78725,39.865975,39.730539,39.843225,39.930718,40.054073,39.944737,40.002493,40.395208,40.161307,0.9
1,EA-19,38.101209,38.58623,39.615106,40.116221,40.179571,40.061478,40.104618,40.241508,40.427915,40.296537,40.357165,40.786176,40.841225,1.2
2,Belgium,43.580535,44.369598,45.319126,46.033293,45.683596,44.99157,44.213578,44.747951,44.824441,43.467108,43.411264,43.166396,43.268219,-2.0
3,Bulgaria,25.398276,25.477641,26.078859,28.132118,28.399268,28.863177,29.184384,29.840882,29.683558,30.353742,30.498298,30.785349,31.145084,5.0
4,Czechia,32.854823,33.975879,34.497486,34.897577,34.136013,34.288463,35.075928,35.402552,35.972707,35.907027,35.923476,35.890107,35.287515,0.8


### 3.2) Transforming the data frame's year values from wide to long

In [26]:
# Reshape from one column per year to one row per (country, year) pair.
# The two identifier columns are repeated on every row; all remaining
# columns (the years) become the 'Year' / value pair.
id_columns = ["Country", "Tax difference 2012-2022"]
df = df.melt(
    id_vars=id_columns,
    var_name="Year",
    value_name="Main aggregated taxation as % of GDP",
)
df.head()

Unnamed: 0,Country,Tax difference 2012-2022,Year,Main aggregated taxation as % of GDP
0,EU-27,0.9,2010,37.922746
1,EA-19,1.2,2010,38.101209
2,Belgium,-2.0,2010,43.580535
3,Bulgaria,5.0,2010,25.398276
4,Czechia,0.8,2010,32.854823


In [32]:

# Put the columns in reading order: who, when, the measure, then the 2012-2022 change.
column_order = [
    'Country',
    'Year',
    'Main aggregated taxation as % of GDP',
    "Tax difference 2012-2022",
]
df = df[column_order]
df.head()

Unnamed: 0,Country,Year,Main aggregated taxation as % of GDP,Tax difference 2012-2022
0,EU-27,2010,37.922746,0.9
1,EA-19,2010,38.101209,1.2
2,Belgium,2010,43.580535,-2.0
3,Bulgaria,2010,25.398276,5.0
4,Czechia,2010,32.854823,0.8


### 3.3) Removing rows with no relevant information 

In [35]:
# Tally how often each label occurs in 'Country' to spot entries that are not countries.
countries = df.loc[:, "Country"].value_counts()
print(countries)

Country
EU-27                                                                                 13
Luxembourg                                                                            13
Source: European Commission, DG Taxation and Customs Union, based on Eurostat data    13
Norway                                                                                13
Iceland                                                                               13
Sweden                                                                                13
Finland                                                                               13
Slovakia                                                                              13
Slovenia                                                                              13
Romania                                                                               13
Portugal                                                                              13
Poland       

In [37]:
# Labels in 'Country' that are aggregates (EU-27, EA-19) or sheet footnotes, not countries.
values_to_remove = [
    "EA-19",
    "Source: European Commission, DG Taxation and Customs Union, based on Eurostat data",
    "EU-27",
    "Data extracted February 2024",
]
# Keep every row whose label is not listed above. Rows with a missing 'Country'
# are not matched by isin() and therefore remain in the frame at this step.
# .copy() makes the result an independent frame, so later in-place operations
# or column assignments on df do not raise SettingWithCopyWarning.
df = df[~df['Country'].isin(values_to_remove)].copy()
countries1 = df['Country'].value_counts()
print(countries1)

Country
Belgium        13
Luxembourg     13
Iceland        13
Sweden         13
Finland        13
Slovakia       13
Slovenia       13
Romania        13
Portugal       13
Poland         13
Austria        13
Netherlands    13
Malta          13
Hungary        13
Lithuania      13
Bulgaria       13
Latvia         13
Cyprus         13
Italy          13
Croatia        13
France         13
Spain          13
Greece         13
Ireland        13
Estonia        13
Germany        13
Denmark        13
Czechia        13
Norway         13
Name: count, dtype: int64


In [39]:
# Re-inspect row count and non-null counts after the aggregate and footnote rows were filtered out.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 429 entries, 2 to 478
Data columns (total 4 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               377 non-null    object 
 1   Year                                  429 non-null    object 
 2   Main aggregated taxation as % of GDP  377 non-null    float64
 3   Tax difference 2012-2022              377 non-null    float64
dtypes: float64(2), object(2)
memory usage: 16.8+ KB


### 3.4) Checking for and removing null/na/missing values

In [42]:
# Count the missing values that are still present in each column.
df.isna().sum()

Country                                 52
Year                                     0
Main aggregated taxation as % of GDP    52
Tax difference 2012-2022                52
dtype: int64

In [44]:
# Drop every row that has a missing value in any column.
# The result is rebound instead of using inplace=True: df was produced by
# boolean filtering, and mutating such a frame in place can raise
# SettingWithCopyWarning. Rebinding returns the same rows without that risk.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 377 entries, 2 to 474
Data columns (total 4 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               377 non-null    object 
 1   Year                                  377 non-null    object 
 2   Main aggregated taxation as % of GDP  377 non-null    float64
 3   Tax difference 2012-2022              377 non-null    float64
dtypes: float64(2), object(2)
memory usage: 14.7+ KB


### 3.5) Converting the 'Year' values from object dtype to a numeric (integer) dtype

In [47]:
# Number of rows per year; every year should appear once per remaining country.
years = df.loc[:, "Year"].value_counts()
print(years)

Year
2010    29
2011    29
2012    29
2013    29
2014    29
2015    29
2016    29
2017    29
2018    29
2019    29
2020    29
2021    29
2022    29
Name: count, dtype: int64


In [49]:
# Preview the cleaned rows; 'Year' is still stored with object dtype at this point.
df.head()

Unnamed: 0,Country,Year,Main aggregated taxation as % of GDP,Tax difference 2012-2022
2,Belgium,2010,43.580535,-2.0
3,Bulgaria,2010,25.398276,5.0
4,Czechia,2010,32.854823,0.8
5,Denmark,2010,45.01615,-3.9
6,Germany,2010,37.30666,2.4


In [51]:
# Convert the 'Year' labels (object dtype after the melt) to int64.
# pd.to_numeric keeps its default errors="raise": with errors="coerce" an
# unparseable label would silently become NaN and the following
# astype("int64") would fail anyway, only with a less helpful message.
# assign() returns a new frame with 'Year' replaced in its original position,
# so no column is set on a frame that came from filtering (which can raise
# SettingWithCopyWarning).
df = df.assign(Year=pd.to_numeric(df["Year"]).astype("int64"))
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 377 entries, 2 to 474
Data columns (total 4 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               377 non-null    object 
 1   Year                                  377 non-null    int64  
 2   Main aggregated taxation as % of GDP  377 non-null    float64
 3   Tax difference 2012-2022              377 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 14.7+ KB


In [53]:
# Final look at the cleaned, long-format frame before it is written to disk.
df.head()

Unnamed: 0,Country,Year,Main aggregated taxation as % of GDP,Tax difference 2012-2022
2,Belgium,2010,43.580535,-2.0
3,Bulgaria,2010,25.398276,5.0
4,Czechia,2010,32.854823,0.8
5,Denmark,2010,45.01615,-3.9
6,Germany,2010,37.30666,2.4


# 4) Storing data frame in file

In [56]:
# Persist the cleaned frame under the key 'taxation_df' in an HDF5 file;
# the context manager closes the store when the block exits.
with pd.HDFStore('dataframes.h5') as hdf_store:
    hdf_store.put('taxation_df', df)