Description of eurostat module: https://pypi.org/project/eurostat/<br>
Eurostat database: https://ec.europa.eu/eurostat/databrowser/explore/all/

In [0]:
%pip install eurostat

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import eurostat
import re
from pyspark.sql.functions import substring, length, regexp_replace, when, col, sum, max
from pyspark.sql.types import IntegerType, FloatType

In [0]:
def get_data_spark_df(code):
    """Download a Eurostat dataset and return it as a Spark DataFrame.

    The first tuple returned by ``eurostat.get_data`` is treated as the header
    and every following tuple as a data row. Anything after a backslash in a
    header label is dropped, so a label of the form ``dim\\other`` becomes the
    column name ``dim``.

    Args:
        code: Eurostat dataset code, e.g. ``'demo_pjan'``.

    Returns:
        Spark DataFrame with one column per header label.

    Raises:
        ValueError: if the API returned nothing for ``code``.
    """
    data = eurostat.get_data(code)
    if not data:
        # Without this guard an empty/None response surfaces as an opaque
        # TypeError or IndexError on data[0] below.
        raise ValueError(f"Eurostat returned no data for dataset code {code!r}")

    header = [label.split('\\')[0] for label in data[0]]
    rows = data[1:]

    return spark.createDataFrame(rows, header)

def get_mapping_spark_df(code, param):
    """Return the dictionary of one dataset parameter as a Spark DataFrame.

    Args:
        code: Eurostat dataset code.
        param: name of the parameter whose dictionary is requested.

    Returns:
        Spark DataFrame with the columns ``value`` and ``description``, built
        from the entries returned by ``eurostat.get_dic(code, param)``.
    """
    return spark.createDataFrame(
        eurostat.get_dic(code, param),
        ['value', 'description'],
    )

In [0]:
# Two-letter country codes used to keep only EU member states when filtering
# the `geo` dimension. Greece is listed as 'EL' (not 'GR') because that is the
# code that appears in the Eurostat geo values.
eu_iso2_codes = [
    'AT', 'BE', 'BG', 'HR', 'CY', 'CZ', 'DK', 'EE', 'FI',
    'FR', 'DE', 'EL', 'HU', 'IE', 'IT', 'LV', 'LT', 'LU',
    'MT', 'NL', 'PL', 'PT', 'RO', 'SK', 'SI', 'ES', 'SE',
]

In [0]:
# Explore a dataset: list each of its parameters and the values that parameter accepts.
# `code` stays set in the kernel after this cell runs.
code = 'hlth_silc_02'
parameters = eurostat.get_pars(code)

for parameter in parameters:
    print(F'{parameter} - list of values')
    # values available for this parameter within the selected dataset
    parameters_values = eurostat.get_par_values(code, parameter)
    print(parameters_values)

freq - list of values
['A']
unit - list of values
['PC']
isced11 - list of values
['TOTAL', 'ED0-2', 'ED3_4', 'ED5-8']
age - list of values
['Y16-24', 'Y16-29', 'Y16-44', 'Y16-64', 'Y_GE16', 'Y25-29', 'Y25-34', 'Y25-64', 'Y35-44', 'Y45-49', 'Y45-54', 'Y45-64', 'Y55-64', 'Y65-74', 'Y_GE65', 'Y75-84', 'Y_GE75', 'Y_GE85']
sex - list of values
['T', 'M', 'F']
levels - list of values
['VGOOD', 'GOOD', 'VG_G', 'FAIR', 'BAD', 'VBAD', 'B_VB']
geo - list of values
['EU', 'EU27_2020', 'EU28', 'EU27_2007', 'EA', 'EA20', 'EA19', 'EA18', 'BE', 'BG', 'CZ', 'DK', 'DE', 'EE', 'IE', 'EL', 'ES', 'FR', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'NL', 'AT', 'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'IS', 'NO', 'CH', 'UK', 'ME', 'MK', 'AL', 'RS', 'TR', 'XK']


In [0]:
# Fetch the dictionary of the dataset's dimensions (requested with frmt='df') and display it.
# NOTE(review): `code` is not assigned in this cell, so the result depends on which
# cell set `code` last in the kernel — confirm the intended dataset before relying on it.
dictionary = eurostat.get_dic(code, frmt='df')
display(dictionary)

dim,name,descr
freq,Time frequency,This code list contains the periodicity that refers to the frequency.
unit,Unit of measure,
icha11_hf,Classification of health care financing schemes - SHA 2011,"The System of health accounts, abbreviated as SHA, provides an economic framework for health accounting in the European Union (EU) Member States, using accounting rules methodologically compatible with the System of national accounts. National Health Accounts have been developed using the System of Health Accounts (SHA) defined by OECD. The SHA has been developed since 2000. A new version of SHA has been released in 2011. SHA refers to a functional approach based on selected health care activities that can be captured by transactions. Transactions are valued activities that take place between different actors or organisations. The transactions recorded in the SHA accounting framework relate to health care goods and services provided and consumed to improve the health status of individuals and of the population as a whole. The SHA is organised around a tri-axial system for the recording of health expenditure, by means of the International Classification for Health Accounts (ICHA), defining: • health care by function (ICHA-HC), • health care service provider industries(ICHA-HP) and • health care financing scheme (ICHA-HF). Classification of health care financing schemes classifies the types of financing arrangements through which people obtain health services; health care financing schemes include direct payments by households for services and goods and third-party financing arrangements."
geo,Geopolitical entity (reporting),This code list defines the reporting geopolitical entities.


#### Time off due to health issues

In [0]:
# Load dataset 'hlth_silc_15' into a Spark DataFrame and preview it.
code = 'hlth_silc_15'
data = get_data_spark_df(code)

# cap the preview at 1000 rows
display(data.limit(1000))

freq,unit,wstatus,age,sex,reason,geo,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
A,PC,EMP,Y16-24,F,FEAR,AL,,,,,,,,,,1.1,1.3,0.9,0.6,0.4,,,
A,PC,EMP,Y16-24,F,FEAR,AT,0.3,0.0,0.9,0.0,0.0,1.2,0.0,1.0,1.4,0.1,0.1,0.1,0.1,0.4,0.3,0.1,0.2
A,PC,EMP,Y16-24,F,FEAR,BE,0.0,0.5,0.8,1.6,1.1,0.3,0.6,2.9,0.3,0.2,0.3,0.3,0.1,0.0,0.1,0.3,0.3
A,PC,EMP,Y16-24,F,FEAR,BG,0.6,1.6,0.0,1.1,0.0,0.0,1.4,0.0,0.0,0.2,0.1,0.2,0.0,0.1,0.1,0.2,0.2
A,PC,EMP,Y16-24,F,FEAR,CH,0.5,0.2,1.4,0.0,0.3,0.2,0.2,0.5,0.3,0.5,0.4,0.1,0.3,0.2,0.3,0.2,
A,PC,EMP,Y16-24,F,FEAR,CY,2.4,1.0,1.1,0.0,1.8,0.0,0.0,0.0,0.0,0.0,0.1,0.4,0.1,0.2,0.0,0.1,0.3
A,PC,EMP,Y16-24,F,FEAR,CZ,4.5,0.6,2.1,0.0,0.0,1.8,0.0,0.0,0.0,0.3,0.2,0.3,0.2,0.2,0.3,0.1,0.1
A,PC,EMP,Y16-24,F,FEAR,DE,0.3,0.0,0.3,0.3,0.5,0.3,0.5,0.0,0.2,0.0,0.1,0.1,0.2,0.0,0.1,0.1,0.1
A,PC,EMP,Y16-24,F,FEAR,DK,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.8,0.9,0.6,0.6,0.8,1.0,1.3
A,PC,EMP,Y16-24,F,FEAR,EA,0.8,0.8,0.9,0.8,0.7,0.7,0.6,0.3,0.4,0.2,0.3,0.3,0.3,0.2,,,


#### Population Stats

In [0]:
# Population on 1 January by age and sex -> table workspace.eurostat.population_split
code = 'demo_pjan'

# Pull the raw dataset from the Eurostat API
df = get_data_spark_df(code)

# Columns kept as identifiers and the year columns turned into rows
population_id_columns = ['Age','Sex','Country_Code']
population_years = ['2014','2019','2023']

# Build the long-format population table:
#  - geo is renamed to Country_Code
#  - Age is the age code with 'Y' removed, cast to int; try_cast gives NULL for
#    codes that are not numeric after that, and those rows are filtered out
#    NOTE(review): confirm no age bucket needed downstream is lost by this filter
#  - only rows for sex M/F and countries listed in eu_iso2_codes are kept
#  - the selected years are unpivoted into (Year, Population) rows, keeping
#    only rows with a population above zero
df_transformed = (
    df
    .withColumnsRenamed({'geo':'Country_Code'})
    .withColumn('Age', regexp_replace('age','Y','').try_cast('int'))
    .filter(
        (df.sex.isin(['M','F']))
        & (col('Country_Code').isin(eu_iso2_codes))
        & (col('Age').isNotNull())
    )
    .select(*population_id_columns, *population_years)
    .unpivot(population_id_columns, population_years, 'Year', 'Population')
    .filter(col('Population') > 0)
)

# Persist as a Unity Catalog table, replacing any previous content
unity_catalog_table_name = 'workspace.eurostat.population_split'
(
    df_transformed.write
    .option('mergeSchema', 'true')
    .mode('overwrite')
    .saveAsTable(unity_catalog_table_name)
)


#### Healthcare Expenditure

In [0]:
# Health care expenditure by financing scheme -> table workspace.eurostat.healthcare_expenditure
code = 'hlth_sha11_hf'

# Pull the dataset and the dictionaries for the scheme and unit codes from the API
data = get_data_spark_df(code)
map_health = get_mapping_spark_df(code,'icha11_hf')
map_unit = get_mapping_spark_df(code,'unit')

# Classification of financing scheme descriptions into public / private / mixed
public = [
    'Government schemes and compulsory contributory health care financing schemes',
    'Government schemes',
    'Social health insurance schemes',
    'Compulsory contributory health insurance schemes and CMSA',
]
private = [
    'Household out-of-pocket payment',
    'Out-of-pocket excluding cost-sharing',
    'Voluntary health insurance schemes',
    'Voluntary health care payment schemes',
    'Compulsory private insurance scheme',
    'Enterprise financing schemes',
]
mixed = [
    'Cost sharing with third-party payers',
    'Compulsory medical savings accounts (CMSA)',
    'NPISH financing schemes',
    'Rest of the world financing schemes (non-resident)',
]

# Year columns that are turned into rows
expenditure_years = ['2015','2019','2023']

# Label derived from the scheme description; anything not listed above is 'Unknown'
financing_type = (
    when(col('Financing_schema').isin(public), 'Public')
    .when(col('Financing_schema').isin(private), 'Private')
    .when(col('Financing_schema').isin(mixed), 'Mixed')
    .otherwise('Unknown')
)

# Build the final frame:
#  - drop geo codes starting with 'EU' or 'EA' and the 'TOT_HF' scheme row
#  - replace scheme and unit codes by their descriptions (left joins)
#  - unpivot the years into rows, then pivot the unit descriptions into columns
df_transformed = (
    data
    .withColumnRenamed('geo','Country_Code')
    .withColumnRenamed('icha11_hf','SHA_11')
    .filter(
        (~substring(col('Country_Code'),1,2).isin(['EU','EA']))
        & (col('SHA_11') != 'TOT_HF')
    )
    .join(map_health, col('SHA_11') == map_health.value, 'left')
    .join(map_unit, col('unit') == map_unit.value, 'left')
    .select(
        'SHA_11', 'Country_Code', *expenditure_years,
        map_health.description.alias('Financing_schema'),
        map_unit.description.alias('unit'),
    )
    .withColumn('Financing_type', financing_type)
    .unpivot(
        ['Country_Code','Financing_schema','unit','SHA_11','Financing_type'],
        expenditure_years, 'Year', 'Value',
    )
    .groupBy('Country_Code','Year','SHA_11','Financing_type','Financing_schema')
    .pivot('unit')
    .max('Value')
)

# The pivoted column names come from unit descriptions; replace every character
# outside [0-9a-zA-Z_] with an underscore
safe_column_names = [re.sub(r'[^0-9a-zA-Z_]','_',name) for name in df_transformed.columns]
df_final = df_transformed.toDF(*safe_column_names)

# Persist as a Unity Catalog table, replacing any previous content
unity_catalog_table_name = 'workspace.eurostat.healthcare_expenditure'
(
    df_final.write
    .option('mergeSchema', 'true')
    .mode('overwrite')
    .saveAsTable(unity_catalog_table_name)
)


#### Perceived Health

In [0]:
# Self-perceived health by sex, age and educational attainment level
# -> table workspace.eurostat.own_health_assesement
code = 'hlth_silc_02'

# Pull the dataset and the dictionary of the `levels` codes from the API,
# and read the population table written by the population cell
data = get_data_spark_df(code)
map_levels = get_mapping_spark_df(code,'levels')
population = spark.read.table('workspace.eurostat.population_split')

# Year columns that are turned into rows
health_years = ['2014','2019','2023']

# Build the long-format percentage table:
#  - keep countries in eu_iso2_codes, the five single health levels, isced11 TOTAL,
#    sex M/F and the listed age groups
#  - replace the level code by its description (left join)
#  - unpivot the years into (Year, Percentage) rows
df_transformed = (
    data
    .withColumnsRenamed({'geo':'Country_Code', 'age':'Age_Group'})
    .filter(
        (col('Country_Code').isin(eu_iso2_codes))
        & (col('levels').isin(['VGOOD', 'GOOD', 'FAIR', 'BAD', 'VBAD']))
        & (col('isced11') == 'TOTAL')
        & (col('sex').isin(['M','F']))
        & (col('Age_Group').isin(['Y16-24', 'Y25-34', 'Y35-44', 'Y45-54', 'Y55-64', 'Y_GE65']))
    )
    .join(map_levels, data.levels == map_levels.value, 'left')
    .select(
        'Age_Group','Country_Code','sex', *health_years,
        map_levels.description.alias('Health_Assesement'),
    )
    .unpivot(
        ['Age_Group','Country_Code','Sex','Health_Assesement'],
        health_years, 'Year', 'Percentage',
    )
)

# Age group label for a single-year population age; ages below 16 get '0',
# which matches none of the age groups kept above
population_age_group = (
    when(col('p.Age') >= 65, 'Y_GE65')
    .when(col('p.Age') >= 55, 'Y55-64')
    .when(col('p.Age') >= 45, 'Y45-54')
    .when(col('p.Age') >= 35, 'Y35-44')
    .when(col('p.Age') >= 25, 'Y25-34')
    .when(col('p.Age') >= 16, 'Y16-24')
    .otherwise('0')
)

# A population row belongs to a percentage row when country, sex, year and age group agree
same_group = (
    (col('d.Country_Code') == col('p.Country_Code'))
    & (col('d.Sex')==col('p.Sex'))
    & (col('d.Year')==col('p.Year'))
    & (col('d.Age_Group') == population_age_group)
)

# Join the actual population numbers, sum them per percentage row and convert
# the percentage into a number of people (cast to int)
df_final = (
    df_transformed.alias('d')
    .join(population.alias('p'), same_group, 'left')
    .select('d.*' , 'Population')
    .groupBy(df_transformed.columns)
    .agg(sum('Population').alias('Population'))
    .withColumn('Number_of_People', ((col('Percentage')/100)*col('Population')).cast('int'))
    .drop('Population')
)

# Persist as a Unity Catalog table, replacing any previous content
unity_catalog_table_name = 'workspace.eurostat.own_health_assesement'
(
    df_final.write
    .option('mergeSchema', 'true')
    .mode('overwrite')
    .saveAsTable(unity_catalog_table_name)
)
