# Data Preprocessing

Preprocessing consists of several steps:
- Loading csv-files with pandas
- separate testcases and deaths
- Remove entries with NaN/Empty Cells
- Remove irrelevant rows and columns
- Split year-month column into year and month column
- Adjust Geo-Isocode (Use 2 Letter ISO)
- Save cleaned csv-files

# Import Dependencies

In [15]:
import pandas as pd

# Load raw data
covid_testcases_and_deaths_in_eu_2020_to_2023.csv contains data on death/test cases of each eu-countries from 2020 to 2023.

prc_ppp_in_eu_2012_to_2023.csv contains data on local-adjusted gdp of each eu-countries from 2012 to 2023

In [16]:
filepath_to_unprocessed_data = '../data/raw_data/'
covid_filename = 'covid_testcases_and_deaths_in_eu_2020_to_2023.csv'
pps_filename = 'prc_ppp_in_eu_2012_to_2023.csv'

covid_data = pd.read_csv(filepath_to_unprocessed_data + covid_filename)
pps_data = pd.read_csv(filepath_to_unprocessed_data + pps_filename)

# Preprocess covid death/test-cases file

split year_week column into year and week column

select only relevant columns

convert 3 letter iso to 2 letter iso country code.

In [17]:
covid_data[['year', 'week']] = covid_data['year_week'].str.split('-', n=1, expand=True)
covid_data['year'] = pd.to_numeric(covid_data['year'])
covid_data['week'] = pd.to_numeric(covid_data['week'])

covid_data = covid_data[['country_code', 'year', 'week', 'rate_14_day', 'indicator']]

country_iso_code_dict = {'AUT':'AT','BEL':'BE','BGR':'BG','HRV':'HR','CYP':'CY','DNK':'DK','EST':'EE','FIN':'FI','FRA':'FR',
                         'DEU':'DE','GRC':'GR','HUN':'HU','IRL':'IE','ITA':'IT','LVA':'LV','LTU':'LT','LUX':'LU','MLT':'MT',
                         'NLD':'NL','POL':'PL','PRT':'PT','ROU':'RO','SVK':'SK','SVN':'SI','ESP':'ES','SWE':'SE','GBR':'GB'}
covid_data = covid_data[covid_data['country_code'].isin(country_iso_code_dict.keys())] # remove unknown countries etc
covid_data['country_code'] = covid_data['country_code'].map(country_iso_code_dict)

# Split covid death/test-cases file

split covid data into death and case data

remove nan values

In [18]:
covid_deaths_data = covid_data[covid_data['indicator'] == 'deaths'].copy()
covid_deaths_data.drop(columns=['indicator'], inplace=True)
covid_deaths_data.dropna(inplace=True)
print(covid_deaths_data)

covid_cases_data = covid_data[covid_data['indicator'] == 'cases'].copy()
covid_cases_data.drop(columns=['indicator'], inplace=True)
covid_cases_data.dropna(inplace=True)
print(covid_cases_data)


      country_code  year  week  rate_14_day
213             AT  2020    10     0.000000
214             AT  2020    11     0.222744
215             AT  2020    12     3.563899
216             AT  2020    13    14.589713
217             AT  2020    14    25.281412
...            ...   ...   ...          ...
12643           SE  2023    43    13.298475
12644           SE  2023    44    17.986427
12645           SE  2023    45    25.640226
12646           SE  2023    46    31.284903
12647           SE  2023    47    15.403270

[4873 rows x 4 columns]
      country_code  year  week  rate_14_day
9               AT  2020    10     1.414423
10              AT  2020    11    11.014677
11              AT  2020    12    43.657768
12              AT  2020    13    90.701241
13              AT  2020    14    91.458569
...            ...   ...   ...          ...
12439           SE  2023    43    31.361441
12440           SE  2023    44    45.176547
12441           SE  2023    45    59.824005
12442  

# Preprocess ppp/pps file

select only relevant columns

rename column names to match covid dataset column names

In [19]:
pps_data = pps_data[['geo', 'TIME_PERIOD', 'OBS_VALUE']]
pps_data.rename(columns={'geo': 'country_code', 'TIME_PERIOD': 'year', 'OBS_VALUE':'pps'}, inplace=True)

print(pps_data)

    country_code  year  pps
0             AL  2012   30
1             AL  2013   29
2             AL  2014   30
3             AL  2015   30
4             AL  2016   30
..           ...   ...  ...
496           US  2018  140
497           US  2019  134
498           US  2020  135
499           US  2021  135
500           US  2022  134

[501 rows x 3 columns]


# Merge files

join datasets by year and country

In [20]:
combined_cases_data = pd.merge(covid_cases_data, pps_data, on=['country_code', 'year'], how='inner')
combined_death_data = pd.merge(covid_deaths_data, pps_data, on=['country_code', 'year'], how='inner')
print(combined_death_data)
print(combined_cases_data)

     country_code  year  week  rate_14_day  pps
0              AT  2020    10     0.000000  125
1              AT  2020    11     0.222744  125
2              AT  2020    12     3.563899  125
3              AT  2020    13    14.589713  125
4              AT  2020    14    25.281412  125
...           ...   ...   ...          ...  ...
4665           SE  2023    43    13.298475  114
4666           SE  2023    44    17.986427  114
4667           SE  2023    45    25.640226  114
4668           SE  2023    46    31.284903  114
4669           SE  2023    47    15.403270  114

[4670 rows x 5 columns]
     country_code  year  week  rate_14_day  pps
0              AT  2020    10     1.414423  125
1              AT  2020    11    11.014677  125
2              AT  2020    12    43.657768  125
3              AT  2020    13    90.701241  125
4              AT  2020    14    91.458569  125
...           ...   ...   ...          ...  ...
4788           SE  2023    43    31.361441  114
4789           

# Save files

In [21]:
filepath_to_processed_data = '../data/processed_data/'

pps_data.to_csv(filepath_to_processed_data + "processed_pps_in_eu_2012_to_2023.csv", index=False)
covid_deaths_data.to_csv(filepath_to_processed_data + "processed_covid_deaths_in_eu_2020_to_2023.csv", index=False)
covid_cases_data.to_csv(filepath_to_processed_data + "processed_covid_cases_in_eu_2020_to_2023.csv", index=False)
combined_cases_data.to_csv(filepath_to_processed_data + "processed_combined_covid_cases_and_pps_in_eu_2020_to_2023.csv", index=False)
combined_death_data.to_csv(filepath_to_processed_data + "processed_combined_covid_deaths_and_pps_in_eu_2020_to_2023.csv", index=False)