# Preprocess restriction data

In [1]:
import numpy as np
import pandas as pd

## Get updated dataset

In [2]:
# Load the long-format restrictions table
# (presumably one row per country and restriction type — TODO confirm against the CSV).
restrictions_df = pd.read_csv('data/Kaggle_CountryInfo/restrictions.csv')

## Make some initial transformations towards a columnwise format

In [3]:
# Update type format: prefix every restriction type with 'Date.' so that, once
# pivoted, each type becomes a 'Date.<type>' column.
# Values that already carry the prefix are left alone, so re-running this cell
# does not turn them into 'Date.Date.<type>'.
type_labels = restrictions_df['type'].astype(str)
restrictions_df['type'] = type_labels.where(type_labels.str.startswith('Date.'), 'Date.' + type_labels)

In [4]:
# Assign the six working column names by position.
# NOTE(review): the original comment said "rename last column", which suggests only the
# last header actually differs from the CSV — confirm against the file.
restrictions_df.columns = ['country_region', 'date', 'type', 'limit', 'mandatory', 'notes']

In [5]:
# Distinct countries (the future rows) and distinct restriction-type labels
# (the future date columns), each in order of first appearance.
countries = restrictions_df['country_region'].unique()
dates = restrictions_df['type'].unique()

In [6]:
# Full column list for the wide table: the date columns followed by the extra fields
cols = np.append(dates, ["limit", "mandatory", "notes"])

In [7]:
# Create the empty wide-format dataframe: one row per country, one column per entry in `cols`
df = pd.DataFrame(index=countries, columns=cols)

In [8]:
def agg_notes_without_nas(notes):
    """
    Joins the non-missing entries of a group into one comma-separated string.

    notes: iterable of scalar values (e.g. the Series handed over by groupby(...).agg).

    Returns the remaining entries joined with "," (no space), or np.nan when every
    entry is missing or the group is empty.
    """
    # pd.isna recognises None, pd.NA and any float NaN; an identity test against
    # np.nan only matches that one specific object.
    # str() keeps the join from failing on non-string entries.
    result = [str(note) for note in notes if not pd.isna(note)]
    if result:
        return ",".join(result)
    return np.nan

In [9]:
# Get Dates data: one row per country, one column per restriction type, holding that
# restriction's date.
# 'first' reduces each (country, type) group to a single value; the identity lambda
# used before is not a reducer and breaks as soon as a pair appears more than once.
tmp_dates = restrictions_df.pivot_table(index="country_region", columns="type", values="date", aggfunc="first")

In [10]:
# Copy the pivoted dates into the wide frame.
# Reindex to df's row order first: the pivot is ordered by its own sorted country index
# while df keeps first-appearance order, so a raw positional copy of .values can write
# dates into the wrong country's row (or fail when the row counts differ).
df.loc[:, tmp_dates.columns] = tmp_dates.reindex(df.index).values

In [11]:
# Collapse the remaining fields to one value per country: limits are summed,
# 'mandatory' and 'notes' are comma-joined with missing entries skipped.
per_country = restrictions_df.groupby("country_region")
limits = per_country["limit"].sum()
mandatory = per_country["mandatory"].agg(agg_notes_without_nas)
notes = per_country["notes"].agg(agg_notes_without_nas)

In [12]:
# Store the per-country aggregates in the wide frame.
# Each aggregate is reindexed to df's row order before its values are copied: groupby
# returns the countries sorted, df keeps them in first-appearance order, so copying
# .values as-is can attach a value to the wrong country.
df.loc[:, "limit"] = limits.reindex(df.index).values
df.loc[:, "mandatory"] = mandatory.reindex(df.index).values
df.loc[:, "notes"] = notes.reindex(df.index).values

In [13]:
# eliminate NaNs
# df = df.fillna("")

In [14]:
# make country a column again (reset_index names the new column 'index' until the rename below)
df = df.reset_index()

In [15]:
# rename columns (labels are assigned by position, not by name)
# NOTE(review): the six 'Date.*' labels only match their data if the restriction types
# first appear in the source in exactly this order — confirm, or rename by name instead.
df.columns = ['Country.Region', 'Date.Schools', 'Date.Public Places', 'Date.Gatherings',
       'Date.Stay at Home', 'Date.Lockdown', 'Date.Non-essential',
       'Gatherings.limit', 'mandatory', 'notes']

## Add extra columns

In [16]:
# Placeholder columns required by the column-wise layout; 'Province.State' is filled
# from 'Country.Region' further down.
df['Country.Continent'] = np.nan
df['Province.State'] = np.nan

In [17]:
# Rearrange columns
column_order = [
    'Country.Continent', 'Country.Region', 'Province.State',
    'Date.Schools', 'Date.Public Places', 'Date.Gatherings',
    'Date.Stay at Home', 'Date.Lockdown', 'Date.Non-essential',
    'Gatherings.limit', 'mandatory', 'notes',
]
df = df.loc[:, column_order]

In [18]:
# Copy the Country.Region column to Province.State.
# Plain column assignment replaces the float NaN placeholder outright; writing the
# strings through .loc[:, ...] sets them into the existing float column in place and
# depends on implicit dtype upcasting.
df['Province.State'] = df['Country.Region']

## Update Restrictions dataset with new data

### Load restriction dataset

In [19]:
# Load the previously saved column-wise dataset; the same file is overwritten when the
# merged result is saved at the end of this section.
restrictions_colwise_df = pd.read_csv('data/Kaggle_CountryInfo/restrictions_columnwise_updated.csv')

### Merge with new data

In [20]:
def setdiff_sorted(array1, array2, assume_unique=False):
    """
    Returns the values of array1 that are absent from array2, as a sorted list.
    """
    difference = np.setdiff1d(array1, array2, assume_unique).tolist()
    # With assume_unique=False numpy already hands the result back sorted; with
    # assume_unique=True it does not sort, so the sorting happens here.
    return sorted(difference) if assume_unique else difference

In [21]:
# First we update the previous data: fill the gaps (NaN cells) of the existing rows
# with the freshly computed values; existing values are kept (overwrite=False).
# DataFrame.update matches rows by index label, so both frames are keyed by
# 'Province.State' first. Matching on the default positional index would pair up
# unrelated rows whenever the two tables are ordered differently.
merge_key = 'Province.State'
original_columns = restrictions_colwise_df.columns
keyed_existing = restrictions_colwise_df.set_index(merge_key)
keyed_existing.update(df.drop_duplicates(subset=merge_key).set_index(merge_key), overwrite=False)
# Back to a regular column, in the original column order.
restrictions_colwise_df = keyed_existing.reset_index()[original_columns]

In [22]:
# then we check which Province.State values are not yet part of the existing dataset
incoming_states = df['Province.State'].unique()
existing_states = restrictions_colwise_df['Province.State'].unique()
new_rows = setdiff_sorted(incoming_states, existing_states)

In [24]:
# Add new rows: collect the matching rows for each new Province.State and append them
# with a single concat. DataFrame.append was removed in pandas 2.0, and growing a frame
# one append at a time copies it on every iteration.
# The guard leaves the frame untouched when there is nothing new.
if len(new_rows) > 0:
    added_rows = [df[df['Province.State'] == row] for row in new_rows]
    restrictions_colwise_df = pd.concat([restrictions_colwise_df, *added_rows], ignore_index=True)

In [25]:
# Finally we sort by Province.State (descending)
restrictions_colwise_df = restrictions_colwise_df.sort_values(by='Province.State', ascending=False)

In [27]:
# Save new updated version (overwrites the file loaded earlier in this section;
# index=False keeps the row index out of the CSV)
restrictions_colwise_df.to_csv("data/Kaggle_CountryInfo/restrictions_columnwise_updated.csv", index=False)

### Auxiliary checks (manual inspection)

In [30]:
# Inspect the existing rows whose Country.Region contains 'Cz'.
# na=False counts a missing Country.Region as "no match"; without it the mask contains
# NaN and boolean indexing raises.
restrictions_colwise_df[restrictions_colwise_df['Country.Region'].str.contains('Cz', na=False)]

Unnamed: 0.1,Unnamed: 0,Country.Continent,Country.Region,Province.State,Date.Schools,Date.Public Places,Date.Gatherings,Date.Stay at Home,Date.Lockdown,Date.Non-essential,Gatherings.limit,mandatory,notes
17,17,Europe,Czech Republic,Czech,2020-03-13T00:00:00.000000000,2020-03-13T00:00:00.000000000,,2020-03-17T00:00:00.000000000,2020-03-16,,0,"Yes,Yes,Yes",3/18: everyone is required to cover their nose...


In [72]:
# Inspect the freshly built rows whose Country.Region contains 'Cz'.
# na=False counts a missing Country.Region as "no match"; without it the mask contains
# NaN and boolean indexing raises.
df[df['Country.Region'].str.contains('Cz', na=False)]

Unnamed: 0,Country.Continent,Country.Region,Province.State,Date.Schools,Date.Public Places,Date.Gatherings,Date.Stay at Home,Date.Lockdown,Date.Non-essential,Gatherings.limit,mandatory,notes
17,,Czech,Czech,3/13/2020,3/13/2020,,3/17/2020,,,0,"Yes,Yes,Yes",3/18: everyone is required to cover their nose...


In [26]:
# index = 17

In [32]:
# restrictions_colwise_df.loc[index, :]

In [33]:
# Notes, will need to rename country_region to Province.State!

In [28]:
# restrictions_colwise_df.loc[index, 'Province.State'] = 'Czech'

In [68]:
# df = df.append({'Country.Continent':'Middle East', 
#            'Country.Region':'Completely Made Up', 
#            'Province.State':'', 
#            'Date.Schools':'', 
#            'Date.Public Places':'', 
#            'Date.Gatherings':'', 
#            'Date.Stay at Home':'', 
#            'Date.Lockdown':'2020-3-15', 
#            'Date.Non-essential':'', 
#            'Gatherings.limit':0, 
#            'mandatory':'Yes', 
#            'notes':''}, ignore_index=True)

In [69]:
# df.loc[df["Province.State"] == '', 'Province.State'] = df.loc[df["Province.State"] == '', 'Country.Region']