Download data from OWID and generate dataset files with train-val splits

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import torch
from torch.utils import data as tdt

# Directory the source CSV is read from and generated dataset files are saved to.
DATA_DIR = 'data'

### Download

In [None]:
!curl https://covid.ourworldindata.org/data/owid-covid-data.csv --output data/owid_$(date +%Y-%m-%d).csv

### Config

In [None]:
# Settings consumed by gen_dataset and used to derive the dataset file name.
config = dict(
    FEATURES=['new_cases', 'new_deaths'],
    VAL_RATIO=0.3,
    IP_SEQ_LEN=40,
    OP_SEQ_LEN=20,
    SRC="owid_2020-06-24.csv",
)
# File name pattern: ds_cd_<input len><output len>_<source csv>.pt
fn = "ds_cd_{IP_SEQ_LEN}{OP_SEQ_LEN}_{SRC}.pt".format(**config)

### Read

In [None]:
# Columns to load from the source CSV; 'date' is parsed into datetimes.
cols = [
    'location',
    'date',
    'total_cases',
    'new_cases',
    'total_deaths',
    'new_deaths',
    'population',
]
dates = ['date']
src_path = "/".join((DATA_DIR, config['SRC']))
df = pd.read_csv(src_path, usecols=cols, parse_dates=dates)
# Last expression of the cell: shows one randomly chosen row.
df.sample()

### Prepare dataset

In [None]:
def gen_dataset(cfg, data=None):
    """Build sliding-window (input, target) samples per location and split them.

    For every location (except a fixed skip list) the rows with
    ``total_cases >= 100`` are kept. The ``cfg['FEATURES']`` columns are
    smoothed with a centered 7-row rolling mean and scaled to values per
    1000 inhabitants. Every window of ``IP_SEQ_LEN`` consecutive rows becomes
    an input, and the following ``OP_SEQ_LEN`` rows become its target.

    Args:
        cfg: Mapping with keys ``'IP_SEQ_LEN'``, ``'OP_SEQ_LEN'``,
            ``'VAL_RATIO'`` and ``'FEATURES'``.
        data: DataFrame with ``location``, ``total_cases``, ``population``
            and the feature columns. Defaults to the notebook-level ``df``.

    Returns:
        ``(trn_set, val_set)`` as produced by ``random_split`` on a
        ``TensorDataset`` of float32 inputs and targets.

    NOTE(review): rows are used in the order they appear in ``data``; this
    presumably relies on the CSV being sorted by date per location - confirm.
    NOTE(review): windows overlap (stride 1) and the split is random and
    unseeded, so validation windows share time steps with training windows
    and the split differs between runs.
    """
    if data is None:
        data = df  # notebook-level frame loaded in the "Read" cell

    ip_seq_len = cfg['IP_SEQ_LEN']
    op_seq_len = cfg['OP_SEQ_LEN']
    val_ratio = cfg['VAL_RATIO']
    features = cfg['FEATURES']
    window_len = ip_seq_len + op_seq_len
    skipped = {'World', 'International', 'India'}  # Countries to be skipped

    ip_trn = []
    op_trn = []

    c = 0
    for country in data['location'].unique():
        if country in skipped:
            continue
        country_df = data.loc[data['location'] == country]
        country_df = country_df.loc[country_df['total_cases'] >= 100]
        if len(country_df) < window_len:
            continue

        pop = country_df['population'].iloc[0]
        if pd.isna(pop) or pop <= 0:
            # Dividing by a missing/zero population would turn every sample
            # of this location into NaN/inf and poison the dataset.
            print("Skipping", country, "- population missing or non-positive")
            continue

        c += 1
        print(c, country, len(country_df), pop)
        smoothed = country_df[features].rolling(7, center=True, min_periods=1).mean()
        daily_cases = np.array(smoothed * 1000 / pop, dtype=np.float32)

        for i in range(len(country_df) - window_len + 1):
            split_at = i + ip_seq_len
            ip_trn.append(daily_cases[i:split_at])
            op_trn.append(daily_cases[split_at:split_at + op_seq_len])

    ip_trn = torch.from_numpy(np.array(ip_trn, dtype=np.float32))
    op_trn = torch.from_numpy(np.array(op_trn, dtype=np.float32))
    dataset = tdt.TensorDataset(ip_trn, op_trn)

    val_len = int(val_ratio * len(dataset))
    trn_len = len(dataset) - val_len
    trn_set, val_set = tdt.random_split(dataset, (trn_len, val_len))
    return trn_set, val_set

In [None]:
# Reuse a previously generated dataset file if present; otherwise build and save it.
# NOTE: torch.load unpickles the file, so only load dataset files you generated yourself.
ds_path = DATA_DIR + '/' + fn
try:
    # Keep the try body minimal so only a missing file triggers regeneration.
    ds = torch.load(ds_path)
except FileNotFoundError:
    trn_set, val_set = gen_dataset(config)
    torch.save({'trn': trn_set, 'val': val_set, 'config': config}, ds_path)
    print("Saved dataset to", fn)
else:
    trn_set, val_set, ds_cfg = ds['trn'], ds['val'], ds['config']
    print(fn, "already exists.")
    print(ds_cfg)

# Printed after (not in `finally`) so that an unexpected error above is not
# masked by a NameError on the still-unbound trn_set / val_set.
print("Training data:", len(trn_set), "Validation data:", len(val_set))