In [18]:
import sys

# Install the third-party packages with the pip that belongs to the interpreter
# running this kernel (sys.executable), so the imports in later cells resolve
# against the same environment.
!{sys.executable} -m pip install requests retry-requests openpyxl xlwt pandas

Collecting openpyxl
[?25l  Downloading https://files.pythonhosted.org/packages/95/8c/83563c60489954e5b80f9e2596b93a68e1ac4e4a730deb1aae632066d704/openpyxl-3.0.3.tar.gz (172kB)
[K     |████████████████████████████████| 174kB 1.9MB/s eta 0:00:01
Collecting jdcal (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/f0/da/572cbc0bc582390480bbd7c4e93d14dc46079778ed915b505dc494b37c57/jdcal-1.4.1-py2.py3-none-any.whl
Collecting et_xmlfile (from openpyxl)
  Downloading https://files.pythonhosted.org/packages/22/28/a99c42aea746e18382ad9fb36f64c1c1f04216f41797f2f0fa567da11388/et_xmlfile-1.0.1.tar.gz
Installing collected packages: jdcal, et-xmlfile, openpyxl
  Running setup.py install for et-xmlfile ... [?25ldone
[?25h  Running setup.py install for openpyxl ... [?25ldone
[?25hSuccessfully installed et-xmlfile-1.0.1 jdcal-1.4.1 openpyxl-3.0.3
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
from retry_requests import retry
from requests import Session

def get(url, max_attempts=5, timeout=30, retry_delay=1.0):
    """Fetch ``url`` with HTTP GET and return the response once it has status 200.

    The request goes through a ``retry_requests`` session configured with
    ``retries=10`` and ``backoff_factor=0.2``. On top of that, any response
    whose status is not 200 is requested again, up to ``max_attempts`` times in
    total, instead of looping forever on a permanent failure such as a 404.

    Parameters
    ----------
    url : str
        Address to download.
    max_attempts : int, optional
        Maximum number of GET requests issued before giving up (must be >= 1).
    timeout : float, optional
        Timeout in seconds handed to ``session.get`` so a stalled connection
        cannot block the notebook indefinitely.
    retry_delay : float, optional
        Seconds to wait between two attempts.

    Returns
    -------
    The response object returned by ``session.get`` with ``status_code == 200``.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no attempt produced a 200 response.
    """
    import time  # local import: only needed for the pause between attempts

    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    session = retry(Session(), retries=10, backoff_factor=0.2)
    try:
        for attempt in range(1, max_attempts + 1):
            ret = session.get(url, timeout=timeout)
            if ret.status_code == 200:
                return ret

            print(f'GET {url} returned HTTP {ret.status_code} '
                  f'(attempt {attempt}/{max_attempts})')
            if attempt < max_attempts:
                time.sleep(retry_delay)
    finally:
        # The body is already downloaded (no streaming is requested), so the
        # connection pool can be released before the response is used.
        session.close()

    raise RuntimeError(
        f'Giving up on {url}: last status was HTTP {ret.status_code} '
        f'after {max_attempts} attempts'
    )


In [58]:
from datetime import date
from pathlib import Path
import json
import pandas as pd

# Date on which the notebook is executed (not referenced again in this cell).
today = date.today()



# Directory that receives the generated files; created when it does not exist.
data_output_path = Path('./data')
data_output_path.mkdir(parents=True, exist_ok=True)

# Remove files left over from a previous run. Sub-directories are skipped:
# Path.unlink() only removes files/symlinks and raises when given a directory.
for f in data_output_path.glob('*'):
    if f.is_file() or f.is_symlink():
        f.unlink()

# Base address of the Stichting NICE website; endpoint paths are appended to it.
stichting_nice_url = 'https://www.stichting-nice.nl'

# Attribution terms (Dutch) that are stored alongside every downloaded dataset.
stichting_nice_license = """
    vermeld dat het artikel van www.stichting-nice.nl komt,
    vermeld dat het artikel copyright 1996-2020 van Stichting NICE is,
    vermeld een duidelijke en werkende link naar de juiste pagina op de website van Stichting NICE.
"""

# Endpoint paths that are expected to be available. This must stay a mutable
# list: entries are removed from it as the corresponding dataset is downloaded.
# Note that only 'global' is listed without a trailing slash.
expected_mappings = [
    f'/covid-19/public/{endpoint}'
    for endpoint in (
        'global',
        'new-intake/',
        'intake-count/',
        'intake-cumulative/',
        'ic-count/',
        'died-and-survivors-cumulative/',
        'age-distribution/',
        'age-distribution-died/',
    )
]


def distribution_to_xlsx(data, output_file):
    """Write an age distribution to ``output_file`` as an Excel sheet.

    ``data`` is turned into a two-column frame labelled ``age_group`` and
    ``percentage``; the sheet is written without the row index.
    """
    column_names = ['age_group', 'percentage']
    pd.DataFrame(data=data, columns=column_names).to_excel(output_file, index=False)

    
def global_to_xlsx(data, output_file):
    """Write a flat mapping to ``output_file`` as an Excel sheet.

    Each key of ``data`` becomes one row (the key is the row label); the
    sheet is written without a header row.
    """
    pd.DataFrame.from_dict(data, orient='index').to_excel(output_file, header=False)
    
    
def date_based_data_to_xlsx(data, output_file):
    """Write a per-day series to ``output_file`` with one row per calendar day.

    Parameters
    ----------
    data
        Input for ``pd.DataFrame.from_dict`` that yields a ``date`` column plus
        one or more value columns (presumably the list of records returned by
        the JSON endpoint -- verify against the caller).
    output_file
        Destination passed to ``DataFrame.to_excel``.

    Processing steps:
      * rows are indexed and sorted by date;
      * columns that contain only zeros are dropped;
      * missing days between the first and last date are inserted;
      * for inserted days, columns whose name contains ``cumulative`` (or is
        ``intakeCount``, case-insensitive) repeat the previous value, all
        other columns get 0.
    """
    df = pd.DataFrame.from_dict(data)
    # Keep the dates as datetime64 (time stripped) so the index lines up with
    # the DatetimeIndex produced by pd.date_range when reindexing below.
    df['date'] = pd.to_datetime(df['date']).dt.normalize()
    df = df.set_index('date').sort_index()
    df = df.loc[:, (df != 0).any(axis=0)]  # remove all columns with only 0-values
    df = df.reindex(pd.date_range(df.index.min(), df.index.max()))  # add missing days

    for column in df.columns:
        series = df[column]
        if 'cumulative' in column.lower() or column.lower() == 'intakecount':
            # Running totals stay at their last known value on missing days.
            series = series.ffill()
        df[column] = series.fillna(0)

    # Export plain dates rather than timestamps.
    df.index = df.index.date
    df.to_excel(output_file)

    
def died_and_survivors_to_xlsx(data, output_file):
    """Merge the died and survivor series and write them to ``output_file``.

    Parameters
    ----------
    data
        Sequence whose first element is the list of "died" records and whose
        second element is the list of "survivors" records; every record is
        read through its ``date`` and ``value`` keys.
    output_file
        Destination passed to ``DataFrame.to_excel``.

    The sheet has one row per calendar day between the earliest and latest
    date, with columns ``died`` and ``survivors``. Days without a value repeat
    the previous value; days before the first known value get 0.
    """
    modified = {}

    for died in data[0]:
        modified[died['date']] = {'died': died['value']}

    for survivor in data[1]:
        # A date may appear only in the survivors series.
        modified.setdefault(survivor['date'], {})['survivors'] = survivor['value']

    df = pd.DataFrame.from_dict(modified, orient='index')
    df.index = pd.to_datetime(df.index)

    # Reindexing on a full daily range both inserts missing days and orders
    # the rows chronologically.
    df = df.reindex(pd.date_range(df.index.min(), df.index.max()))
    df = df.ffill().fillna(0)

    # Export plain dates rather than timestamps.
    df.index = df.index.date
    df.to_excel(output_file)
    
    
# Dataset name (last path segment of the endpoint) -> function that converts
# the downloaded JSON payload into an .xlsx file.
parser_mappings = {
    **dict.fromkeys(
        ['age-distribution', 'age-distribution-died'],
        distribution_to_xlsx,
    ),
    **dict.fromkeys(
        ['ic-count', 'intake-count', 'intake-cumulative', 'new-intake'],
        date_based_data_to_xlsx,
    ),
    'died-and-survivors-cumulative': died_and_survivors_to_xlsx,
    'global': global_to_xlsx,
}


# The endpoint paths are discovered by scanning the site's JavaScript for lines
# that mention "url" and taking the first single-quoted token on such a line.
resp = get(f'{stichting_nice_url}/js/covid-19.js')

for line in resp.text.splitlines():
    if 'url' not in line.lower():
        continue

    url = [part.strip() for part in line.split("'")]
    if len(url) <= 2 or url[1] not in expected_mappings:
        print(f'Unknown url: {url}')
        continue

    # Dataset name = last path segment, e.g. '/covid-19/public/ic-count/' -> 'ic-count'.
    name = url[1].strip('/').split('/')[-1]
    print(f'Downloading {url[1]} to {name}.json')
    data_req = get(f'{stichting_nice_url}{url[1]}')
    expected_mappings.remove(url[1])  # whatever remains afterwards was not found

    # Store the payload together with its attribution and origin.
    data = {
        'data': data_req.json(),
        'license': stichting_nice_license,
        'source': data_req.url,
    }
    (data_output_path / f'{name}.json').write_text(
        json.dumps(data, sort_keys=True, indent=4)
    )

    if name in parser_mappings:
        parser_mappings[name](data['data'], data_output_path / f'{name}.xlsx')

# Report every expected endpoint that never showed up in the JavaScript file.
for mapping in expected_mappings:
    print(f'Missing the following dataset: {mapping}')

Downloading /covid-19/public/global to global.json
Downloading /covid-19/public/new-intake/ to new-intake.json
Downloading /covid-19/public/intake-count/ to intake-count.json
Downloading /covid-19/public/intake-cumulative/ to intake-cumulative.json
Downloading /covid-19/public/ic-count/ to ic-count.json
Downloading /covid-19/public/died-and-survivors-cumulative/ to died-and-survivors-cumulative.json
Downloading /covid-19/public/age-distribution/ to age-distribution.json
Downloading /covid-19/public/age-distribution-died/ to age-distribution-died.json
