# Data preprocessing

In [1]:
import pandas as pd

## Load Oxford data

In [2]:
# Download the latest OxCGRT policy-tracker snapshot.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, avoiding the mixed-type DtypeWarning on this wide CSV.
data_oxford = pd.read_csv(
    'https://github.com/OxCGRT/covid-policy-tracker/blob/master/data/OxCGRT_latest.csv?raw=true',
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Show every column name in the Oxford data so the available fields can be inspected.
print(data_oxford.columns)

Index(['CountryName', 'CountryCode', 'RegionName', 'RegionCode',
       'Jurisdiction', 'Date', 'C1_School closing', 'C1_Flag',
       'C2_Workplace closing', 'C2_Flag', 'C3_Cancel public events', 'C3_Flag',
       'C4_Restrictions on gatherings', 'C4_Flag', 'C5_Close public transport',
       'C5_Flag', 'C6_Stay at home requirements', 'C6_Flag',
       'C7_Restrictions on internal movement', 'C7_Flag',
       'C8_International travel controls', 'E1_Income support', 'E1_Flag',
       'E2_Debt/contract relief', 'E3_Fiscal measures',
       'E4_International support', 'H1_Public information campaigns',
       'H1_Flag', 'H2_Testing policy', 'H3_Contact tracing',
       'H4_Emergency investment in healthcare', 'H5_Investment in vaccines',
       'H6_Facial Coverings', 'H6_Flag', 'H7_Vaccination policy', 'H7_Flag',
       'H8_Protection of elderly people', 'H8_Flag', 'M1_Wildcard',
       'V1_Vaccine Prioritisation (summary)',
       'V2A_Vaccine Availability (summary)',
       'V2B_Vaccin

### Choose a country from the following list

In [4]:
# Distinct country names, in order of first appearance in the file.
print(pd.unique(data_oxford['CountryName']))

['Aruba' 'Afghanistan' 'Angola' 'Albania' 'Andorra' 'United Arab Emirates'
 'Argentina' 'Australia' 'Austria' 'Azerbaijan' 'Burundi' 'Belgium'
 'Benin' 'Burkina Faso' 'Bangladesh' 'Bulgaria' 'Bahrain' 'Bahamas'
 'Bosnia and Herzegovina' 'Belarus' 'Belize' 'Bermuda' 'Bolivia' 'Brazil'
 'Barbados' 'Brunei' 'Bhutan' 'Botswana' 'Central African Republic'
 'Canada' 'Switzerland' 'Chile' 'China' "Cote d'Ivoire" 'Cameroon'
 'Democratic Republic of Congo' 'Congo' 'Colombia' 'Comoros' 'Cape Verde'
 'Costa Rica' 'Cuba' 'Cyprus' 'Czech Republic' 'Germany' 'Djibouti'
 'Dominica' 'Denmark' 'Dominican Republic' 'Algeria' 'Ecuador' 'Egypt'
 'Eritrea' 'Spain' 'Estonia' 'Ethiopia' 'Finland' 'Fiji' 'France'
 'Faeroe Islands' 'Gabon' 'United Kingdom' 'Georgia' 'Ghana' 'Guinea'
 'Gambia' 'Greece' 'Greenland' 'Guatemala' 'Guam' 'Guyana' 'Hong Kong'
 'Honduras' 'Croatia' 'Haiti' 'Hungary' 'Indonesia' 'India' 'Ireland'
 'Iran' 'Iraq' 'Iceland' 'Israel' 'Italy' 'Jamaica' 'Jordan' 'Japan'
 'Kazakhstan' 'Kenya'

In [5]:
# Country to analyse; compared against the `CountryName` column of the Oxford data.
# Note: the reproduction-data cell further down only handles 'Japan' and 'Netherlands'.
country_of_interest = 'Japan'

In [6]:
# Keep only the national-level rows of the selected country. The Oxford file also
# holds sub-national rows (see the RegionName / Jurisdiction columns); matching on
# CountryName alone would yield several rows per date for countries with regional data.
df_country = data_oxford[
    (data_oxford['CountryName'] == country_of_interest)
    & (data_oxford['Jurisdiction'] == 'NAT_TOTAL')
]

### Feature selection

For now, we focus only on the index columns.

In [7]:
# Restrict the frame to the date plus the four index columns.
columns_to_select = [
    'Date',
    'StringencyIndex',
    'GovernmentResponseIndex',
    'ContainmentHealthIndex',
    'EconomicSupportIndex',
]
df_country = df_country.loc[:, columns_to_select]

In [8]:
# Parse the YYYYMMDD dates and order the rows chronologically.
# .assign builds a new frame instead of writing into a frame that was derived by
# slicing, which avoids pandas' SettingWithCopyWarning.
df_country = (
    df_country
    .assign(Date=pd.to_datetime(df_country['Date'], format='%Y%m%d'))
    .sort_values(by=['Date'])
)

## Load reproduction data

In [9]:
# Load the reproduction number (Rt) series for the selected country into a frame
# with the columns 'Date' and 'Rt'. Each source uses its own date format.
if country_of_interest == 'Japan':
    # Local CSV; header=0 replaces the file's own header row with the names below.
    df_rt_country = pd.read_csv('effective_reproduction_number_japan.csv', names=['Date', 'Rt'], header=0)
    rt_date_format = '%Y/%m/%d'
elif country_of_interest == 'Netherlands':
    df_rt_country = (
        pd.read_json('https://data.rivm.nl/covid-19/COVID-19_reproductiegetal.json', orient='records')
        .rename(columns={"Rt_avg": "Rt"})
        .loc[:, ['Date', 'Rt']]
    )
    rt_date_format = '%Y-%m-%d'
else:
    # Fail here rather than printing: otherwise df_rt_country stays undefined (or
    # stale from a previous run) and later cells break or silently use wrong data.
    raise ValueError(f'No reproduction data found for {country_of_interest}')

# Shared post-processing: parse the dates and sort chronologically.
df_rt_country = (
    df_rt_country
    .assign(Date=pd.to_datetime(df_rt_country['Date'], format=rt_date_format))
    .sort_values(by=['Date'])
)

In [10]:
# Preview the first five rows of the reproduction-number data.
print(df_rt_country.iloc[:5])

    

        Date    Rt
0 2020-03-01  1.31
1 2020-03-02  1.18
2 2020-03-03  1.29
3 2020-03-04  1.25
4 2020-03-05  1.17


### Merge Oxford and reproduction data

In [11]:
# Join the index columns with Rt on the date (default inner join, so only dates
# present in both frames remain), then keep the result in chronological order.
df_country = pd.merge(df_country, df_rt_country, on='Date').sort_values(by='Date')

### Add time index

In [12]:
# Consecutive integer position (0, 1, 2, ...) of each row in the date-sorted frame.
# A range can be assigned directly; no intermediate list is needed.
df_country['time_index'] = range(len(df_country))

In [13]:
# Preview the first five rows of the merged data.
print(df_country.iloc[:5])

        Date  StringencyIndex  GovernmentResponseIndex  \
0 2020-03-01            34.26                    30.21   
1 2020-03-02            43.52                    35.42   
2 2020-03-03            43.52                    35.42   
3 2020-03-04            43.52                    35.42   
4 2020-03-05            43.52                    35.42   

   ContainmentHealthIndex  EconomicSupportIndex    Rt  time_index  
0                   34.52                   0.0  1.31           0  
1                   40.48                   0.0  1.18           1  
2                   40.48                   0.0  1.29           2  
3                   40.48                   0.0  1.25           3  
4                   40.48                   0.0  1.17           4  


### Filter by date

In [14]:
# Drop every observation dated after the cut-off (the cut-off day itself is kept).
max_date = pd.Timestamp(year=2021, month=6, day=1)
df_country = df_country.loc[df_country['Date'] <= max_date]

### Export data

In [15]:
# Write the prepared frame to a pickle file in the working directory, named after the selected country.
df_country.to_pickle(f"reproduction_vs_index_{country_of_interest}.pkl")