# Data about unemployment (as % of labour force)

# Link

https://ec.europa.eu/eurostat/databrowser/view/tps00203/default/table?lang=en&category=t_labour.t_employ.t_lfsi.t_une

# Imports

In [5]:
import pandas as pd
import numpy as np
import pycountry

# 1) Extracting data from tsv file

In [7]:
df = pd.read_csv("All_data_files/unemployment_eurostat.tsv", sep=r"[\t,\\]", engine = "python")

# 2) Initial exploration of data (for cleaning and transformation)

In [9]:
df.head()

Unnamed: 0,freq,age,unit,sex,geo,TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,Y15-74,PC_ACT,T,AT,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8,5.1,
1,A,Y15-74,PC_ACT,T,BA,:,:,:,:,:,:,:,:,:,17.4,15.4,13.2,
2,A,Y15-74,PC_ACT,T,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6,5.5,
3,A,Y15-74,PC_ACT,T,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2,4.3,
4,A,Y15-74,PC_ACT,T,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1,4.1,


In [10]:
df.describe()

Unnamed: 0,2023
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   freq         111 non-null    object 
 1   age          111 non-null    object 
 2   unit         111 non-null    object 
 3   sex          111 non-null    object 
 4   geo          111 non-null    object 
 5   TIME_PERIOD  111 non-null    object 
 6   2012         111 non-null    object 
 7   2013         111 non-null    object 
 8   2014         111 non-null    object 
 9   2015         111 non-null    object 
 10  2016         111 non-null    object 
 11  2017         111 non-null    object 
 12  2018         111 non-null    object 
 13  2019         111 non-null    object 
 14  2020         111 non-null    object 
 15  2021         111 non-null    object 
 16  2022         111 non-null    object 
 17  2023         0 non-null      float64
dtypes: float64(1), object(17)
memory usage: 15.7+ KB


In [12]:
df.isna().sum()

freq             0
age              0
unit             0
sex              0
geo              0
TIME_PERIOD      0
2012             0
2013             0
2014             0
2015             0
2016             0
2017             0
2018             0
2019             0
2020             0
2021             0
2022             0
2023           111
dtype: int64

In [13]:
df.duplicated().sum()

0

In [14]:
unit = df["unit"].value_counts()
print(unit)

unit
PC_ACT     37
PC_POP     37
THS_PER    37
Name: count, dtype: int64


* PC_ACT = Percentage of labour force
* PC_POP = Percentage of population
* THS_PER = Thousand persons

See link: http://localhost:8888/notebooks/BusinessIntelligencedat4%2FExam%2FExtracting_Unemployment_Eurostat.ipynb.ipynb

# 3) Transforming and cleaning data

### 3.1) Transforming ':' to NaN

In [18]:
df.replace(r'^\s*:\s*$', np.nan, regex=True, inplace=True)
df.head()

Unnamed: 0,freq,age,unit,sex,geo,TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,Y15-74,PC_ACT,T,AT,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8,5.1,
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,17.4,15.4,13.2,
2,A,Y15-74,PC_ACT,T,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6,5.5,
3,A,Y15-74,PC_ACT,T,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2,4.3,
4,A,Y15-74,PC_ACT,T,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1,4.1,


### 3.2) Moving the values on column to the right so the valyes matches the correct year and then removing "TIME_PERIOD" column

In [20]:
# Removing the 2023 column given that I don't need that column (I only have data up to 2022 in the main data frame: happiness_df) 
# and given that it's of a different type for some reason
df.drop(columns="2023", inplace=True)

In [21]:

start_index = df.columns.get_loc("TIME_PERIOD")

df.iloc[:, start_index + 1:] = df.iloc[:, start_index:-1]

df.head()

Unnamed: 0,freq,age,unit,sex,geo,TIME_PERIOD,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,Y15-74,PC_ACT,T,AT,5.2,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,,17.4,15.4
2,A,Y15-74,PC_ACT,T,BE,7.6,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6
3,A,Y15-74,PC_ACT,T,BG,13.3,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2
4,A,Y15-74,PC_ACT,T,CH,4.5,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1


In [22]:
df.drop(columns="TIME_PERIOD", inplace=True)
df.head()

Unnamed: 0,freq,age,unit,sex,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,A,Y15-74,PC_ACT,T,AT,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8
1,A,Y15-74,PC_ACT,T,BA,,,,,,,,,,17.4,15.4
2,A,Y15-74,PC_ACT,T,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6
3,A,Y15-74,PC_ACT,T,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2
4,A,Y15-74,PC_ACT,T,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1


### 3.3) Creating two data frames: 1) With PC_POP and 2) With PC_ACT 

I don't want thousand persons because it's a bad unit of measure for comparison of countries given that countries have very different populations


In [25]:
act_df = df[df["unit"] == "PC_ACT"]
unit1 = act_df["unit"].value_counts()
print(unit1)

unit
PC_ACT    37
Name: count, dtype: int64


In [26]:
df = df[~df["unit"].isin(["PC_ACT", "THS_PER"])]

unit = df["unit"].value_counts()
print(unit)

unit
PC_POP    37
Name: count, dtype: int64


In [27]:
df.head()

Unnamed: 0,freq,age,unit,sex,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
37,A,Y15-74,PC_POP,T,AT,3.5,3.8,4.0,4.1,4.4,4.0,3.6,3.3,4.1,4.2,3.3
38,A,Y15-74,PC_POP,T,BA,,,,,,,,,,9.0,7.9
39,A,Y15-74,PC_POP,T,BE,4.6,5.1,5.2,5.2,4.7,4.3 b,3.6,3.3,3.4,3.8,3.4
40,A,Y15-74,PC_POP,T,BG,7.8,8.3,7.4,6.1,5.1,4.4,3.8,3.3 b,3.7,3.2,2.6
41,A,Y15-74,PC_POP,T,CH,3.3,3.5,3.6,3.6,3.7,3.7,3.6,3.4,3.7,3.8,3.1


### 3.4) Deleting irrelevant columns: 'freq', 'age', 'unit' and 'sex' columns

In [29]:
df.drop(columns=["freq", "age", "unit", "sex"], inplace= True)
df.head()

Unnamed: 0,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
37,AT,3.5,3.8,4.0,4.1,4.4,4.0,3.6,3.3,4.1,4.2,3.3
38,BA,,,,,,,,,,9.0,7.9
39,BE,4.6,5.1,5.2,5.2,4.7,4.3 b,3.6,3.3,3.4,3.8,3.4
40,BG,7.8,8.3,7.4,6.1,5.1,4.4,3.8,3.3 b,3.7,3.2,2.6
41,CH,3.3,3.5,3.6,3.6,3.7,3.7,3.6,3.4,3.7,3.8,3.1


In [30]:
act_df.drop(columns=["freq", "age", "unit", "sex"], inplace= True)
act_df.head()

Unnamed: 0,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AT,5.2,5.7,6.0,6.1,6.5,5.9,5.2,4.8,6.0,6.2,4.8
1,BA,,,,,,,,,,17.4,15.4
2,BE,7.6,8.6,8.7,8.7,7.9,7.2 b,6.0,5.5,5.8,6.3,5.6
3,BG,13.3,13.9,12.4,10.1,8.6,7.2,6.2,5.2 b,6.1,5.2,4.2
4,CH,4.5,4.8,4.9,4.8,5.0,4.8,4.7,4.4,4.8,5.1,4.1


### 3.5) Transforming the data frames' year values from wide to long


In [32]:
df = pd.melt(df, id_vars=["geo"], var_name = "Year", value_name = "Unemployment as percentage of labour force")
df.head()

Unnamed: 0,geo,Year,Unemployment as percentage of labour force
0,AT,2012,3.5
1,BA,2012,
2,BE,2012,4.6
3,BG,2012,7.8
4,CH,2012,3.3


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         407 non-null    object
 1   Year                                        407 non-null    object
 2   Unemployment as percentage of labour force  394 non-null    object
dtypes: object(3)
memory usage: 9.7+ KB


In [34]:
act_df = pd.melt(act_df, id_vars=["geo"], var_name = "Year", value_name = "Unemployment as percentage of labour force")
act_df.head()

Unnamed: 0,geo,Year,Unemployment as percentage of labour force
0,AT,2012,5.2
1,BA,2012,
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5


In [35]:
act_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         407 non-null    object
 1   Year                                        407 non-null    object
 2   Unemployment as percentage of labour force  394 non-null    object
dtypes: object(3)
memory usage: 9.7+ KB


### 3.6) Removing null/na/missing values

This because I prefer as precise data as possible

In [38]:
df.isna().sum()

geo                                            0
Year                                           0
Unemployment as percentage of labour force    13
dtype: int64

In [39]:
df.dropna(inplace=True)
df.isna().sum()

geo                                           0
Year                                          0
Unemployment as percentage of labour force    0
dtype: int64

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   geo                                         394 non-null    object
 1   Year                                        394 non-null    object
 2   Unemployment as percentage of labour force  394 non-null    object
dtypes: object(3)
memory usage: 12.3+ KB


In [41]:
act_df.isna().sum()

geo                                            0
Year                                           0
Unemployment as percentage of labour force    13
dtype: int64

In [42]:
act_df.dropna(inplace=True)
act_df.isna().sum()

geo                                           0
Year                                          0
Unemployment as percentage of labour force    0
dtype: int64

### 3.7) Checking for duplicates

In [44]:
df.duplicated().sum()

0

In [45]:
act_df.duplicated().sum()

0

### 3.8) Renaming columns

In [47]:
df.rename(columns={"geo": "Country", "Unemployment as percentage of labour force": "Unemployment as % of labour force"}, inplace= True)
df.head()

Unnamed: 0,Country,Year,Unemployment as % of labour force
0,AT,2012,3.5
2,BE,2012,4.6
3,BG,2012,7.8
4,CH,2012,3.3
5,CY,2012,8.0


In [48]:
act_df.rename(columns={"geo": "Country", "Unemployment as percentage of labour force": "Unemployment as % of labour force"}, inplace= True)
act_df.head()

Unnamed: 0,Country,Year,Unemployment as % of labour force
0,AT,2012,5.2
2,BE,2012,7.6
3,BG,2012,13.3
4,CH,2012,4.5
5,CY,2012,11.9


### 3.9) Transforming values in 'Country' from abbreviations to full country names 

So that the data can be merged with the other data

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Country                            394 non-null    object
 1   Year                               394 non-null    object
 2   Unemployment as % of labour force  394 non-null    object
dtypes: object(3)
memory usage: 12.3+ KB


In [52]:
# Creating a country mapper
country_mapping = {country.alpha_2: country.name for country in pycountry.countries}

# Replace abbreviations with full country names
df['Country'] = df['Country'].replace(country_mapping)
df.head()

Unnamed: 0,Country,Year,Unemployment as % of labour force
0,Austria,2012,3.5
2,Belgium,2012,4.6
3,Bulgaria,2012,7.8
4,Switzerland,2012,3.3
5,Cyprus,2012,8.0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Country                            394 non-null    object
 1   Year                               394 non-null    object
 2   Unemployment as % of labour force  394 non-null    object
dtypes: object(3)
memory usage: 12.3+ KB


In [54]:
df.isna().sum()

Country                              0
Year                                 0
Unemployment as % of labour force    0
dtype: int64

In [55]:
# Creating a country mapper
country_mapping2 = {country.alpha_2: country.name for country in pycountry.countries}

# Replacing abbreviations with full country names
act_df['Country'] = act_df['Country'].replace(country_mapping2)
act_df.head()

Unnamed: 0,Country,Year,Unemployment as % of labour force
0,Austria,2012,5.2
2,Belgium,2012,7.6
3,Bulgaria,2012,13.3
4,Switzerland,2012,4.5
5,Cyprus,2012,11.9


In [56]:
act_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Country                            394 non-null    object
 1   Year                               394 non-null    object
 2   Unemployment as % of labour force  394 non-null    object
dtypes: object(3)
memory usage: 12.3+ KB


### 3.10) Transforming "numeric" object values to numeric values 

In [58]:
df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('int64')
act_df['Year'] = pd.to_numeric(act_df['Year'], errors='coerce').astype('int64')

# Convert to float64 (if values contain decimals)
df['Unemployment as % of labour force'] = pd.to_numeric(df['Unemployment as % of labour force'], errors='coerce').astype('float64')
act_df['Unemployment as % of labour force'] = pd.to_numeric(act_df['Unemployment as % of labour force'], errors='coerce').astype('float64')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country                            394 non-null    object 
 1   Year                               394 non-null    int64  
 2   Unemployment as % of labour force  376 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 12.3+ KB


In [60]:
act_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 394 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country                            394 non-null    object 
 1   Year                               394 non-null    int64  
 2   Unemployment as % of labour force  376 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 12.3+ KB


### 3.11) Removing new missing values 

error="coerce" in the above creates missing values if it can't convert the values from object to numeric

In [63]:
df.dropna(inplace=True)
act_df.dropna(inplace=True)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 376 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country                            376 non-null    object 
 1   Year                               376 non-null    int64  
 2   Unemployment as % of labour force  376 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.8+ KB


In [65]:
act_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 376 entries, 0 to 406
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Country                            376 non-null    object 
 1   Year                               376 non-null    int64  
 2   Unemployment as % of labour force  376 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.8+ KB


# 4) Storing data frame in file

In [67]:
with pd.HDFStore('dataframes.h5') as store:
    store['unempl_pop_df'] = df

In [68]:
with pd.HDFStore('dataframes.h5') as store:
    store['unempl_act_df'] = act_df