# Getting data from Public Health England
https://github.com/ScottishCovidResponse/scrc-vis-analytical/tree/master/ClusteringAndImpactAnalysis/data

In [1]:
import os
import sys

# Resolve the parent of the current working directory to an absolute path and
# add it to the import search path, unless it is already listed.
# NOTE(review): presumably this is what makes the local `stream` module
# importable further down — confirm where `stream` actually lives.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from urllib import parse
import json
import requests

import pandas as pd

from stream import generate_streams, test_endpoints, get_token, register, generate_streams_from_urls

## Generate elements for urls.json

In [101]:
# Base address shared by every download URL in this cell.
_PHE_DATA_API = "https://api.coronavirus.data.gov.uk/v2/data"


def _phe_csv_url(area_type, *metrics, area_code=None):
    """Return the CSV download URL for the given area type and metrics.

    Query parameters are emitted in a fixed order: ``areaType``, then
    ``areaCode`` (only when *area_code* is given), then one ``metric``
    parameter per requested metric, then ``format=csv``.
    """
    params = ["areaType=" + area_type]
    if area_code is not None:
        params.append("areaCode=" + area_code)
    params.extend("metric=" + metric for metric in metrics)
    params.append("format=csv")
    return _PHE_DATA_API + "?" + "&".join(params)


# UK-wide ("overview") series: one URL per metric, cumulative then new.
uk_cumAdmissions = _phe_csv_url("overview", "cumAdmissions")
uk_cumCasesBySpecimenDate = _phe_csv_url("overview", "cumCasesBySpecimenDate")
uk_cumDeaths28DaysByDeathDate = _phe_csv_url("overview", "cumDeaths28DaysByDeathDate")
uk_cumPeopleVaccinatedFirstDoseByPublishDate = _phe_csv_url(
    "overview", "cumPeopleVaccinatedFirstDoseByPublishDate")
uk_cumPeopleVaccinatedSecondDoseByPublishDate = _phe_csv_url(
    "overview", "cumPeopleVaccinatedSecondDoseByPublishDate")
uk_cumPeopleVaccinatedThirdInjectionByPublishDate = _phe_csv_url(
    "overview", "cumPeopleVaccinatedThirdInjectionByPublishDate")
uk_newAdmissions = _phe_csv_url("overview", "newAdmissions")
uk_newCasesBySpecimenDate = _phe_csv_url("overview", "newCasesBySpecimenDate")
uk_newDeaths28DaysByDeathDate = _phe_csv_url("overview", "newDeaths28DaysByDeathDate")
uk_newPeopleVaccinatedFirstDoseByPublishDate = _phe_csv_url(
    "overview", "newPeopleVaccinatedFirstDoseByPublishDate")
uk_newPeopleVaccinatedSecondDoseByPublishDate = _phe_csv_url(
    "overview", "newPeopleVaccinatedSecondDoseByPublishDate")
uk_newPeopleVaccinatedThirdInjectionByPublishDate = _phe_csv_url(
    "overview", "newPeopleVaccinatedThirdInjectionByPublishDate")

# Lower-tier local authority (LTLA) series. All four URLs use the same single
# area code; some request two metrics in one download.
_LTLA_AREA_CODE = "S12000036"
ltla_cases = _phe_csv_url(
    "ltla", "cumCasesBySpecimenDate", "newCasesBySpecimenDate",
    area_code=_LTLA_AREA_CODE)
ltla_deaths = _phe_csv_url(
    "ltla", "cumWeeklyNsoDeathsByRegDate", "newWeeklyNsoDeathsByRegDate",
    area_code=_LTLA_AREA_CODE)
ltla_vaccination = _phe_csv_url(
    "ltla",
    "cumVaccinationFirstDoseUptakeByVaccinationDatePercentage",
    "cumVaccinationSecondDoseUptakeByVaccinationDatePercentage",
    area_code=_LTLA_AREA_CODE)
ltla_vaccinationAgeDemographics = _phe_csv_url(
    "ltla", "vaccinationsAgeDemographics", area_code=_LTLA_AREA_CODE)

In [102]:
# Every download URL to turn into a manifest entry, in a fixed order:
# UK-wide series first, then the LTLA ones.
urls = [
    # UK overview, cumulative series
    uk_cumAdmissions,
    uk_cumCasesBySpecimenDate,
    uk_cumDeaths28DaysByDeathDate,
    uk_cumPeopleVaccinatedFirstDoseByPublishDate,
    uk_cumPeopleVaccinatedSecondDoseByPublishDate,
    uk_cumPeopleVaccinatedThirdInjectionByPublishDate,
    # UK overview, new (per-period) series
    uk_newAdmissions,
    uk_newCasesBySpecimenDate,
    uk_newDeaths28DaysByDeathDate,
    uk_newPeopleVaccinatedFirstDoseByPublishDate,
    uk_newPeopleVaccinatedSecondDoseByPublishDate,
    uk_newPeopleVaccinatedThirdInjectionByPublishDate,
    # Lower-tier local authority series
    ltla_cases,
    ltla_deaths,
    ltla_vaccination,
    ltla_vaccinationAgeDemographics,
]

In [104]:
# Skeleton of one urls.json entry. The generation loop overwrites 'url' and
# 'save_to' for each URL; 'name', 'dataType' and 'keywords' are copied as-is.
template = dict(
    name="phe",
    url="https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&metric=cumAdmissions&format=csv",
    save_to="phe/cumAdmissions.csv",
    dataType="cum_timeseries",
    keywords=["phe", "uk", "hospital_admission", "daily"],
)

In [106]:
from pprint import pprint

In [108]:
import json

In [None]:
# Print one urls.json entry per URL. Only 'url' and 'save_to' vary between
# entries; every other field is taken from the template unchanged.
for url in urls:
    query = parse.parse_qs(parse.urlsplit(url).query)
    # A URL may repeat the 'metric' parameter; the last occurrence names the file.
    metric = query["metric"][-1]
    d = {**template, 'url': url, 'save_to': f'phe/{metric}.csv'}
    print(json.dumps(d, indent=4))

## Data streams

In [4]:
# Read the manifest and keep only the entries named 'phe'. Note that this
# rebinds `urls`: from here on it holds manifest entries (dicts), not the
# plain URL strings used earlier in the notebook.
with open('../manifest/urls.json') as manifest_file:
    manifest_entries = json.load(manifest_file)
urls = [entry for entry in manifest_entries if entry['name'] == 'phe']

# Build the streams from those entries, then show the first one and the total.
streams = generate_streams_from_urls(urls)
print(streams[0])
len(streams)

{'urlCode': 'API_PY', 'endpoint': '/data/?product=phe/overview&component=cumAdmissions&format=long', 'dataType': 'cum_timeseries', 'keywords': ['phe', 'uk', 'hospital_admission', 'daily'], 'description': ''}


16

### UK

In [28]:
# Select the streams tagged with the 'uk' keyword and show how many there are.
uk_streams = [stream for stream in streams if 'uk' in stream['keywords']]
len(uk_streams)

12

In [72]:
# NOTE(review): presumably checks that each UK stream's endpoint responds
# before registration — confirm against stream.test_endpoints.
test_endpoints(uk_streams)

In [93]:
# Obtain the auth token used by the register() calls below.
# NOTE(review): prod=True presumably selects the production deployment —
# confirm against stream.get_token.
token = get_token(prod=True)

# Never echo the token itself: cell outputs are saved with the notebook, so a
# displayed token ends up in version control. Only confirm that one was returned.
print('token acquired' if token else 'no token returned')

'<redacted: admin JWT removed from the saved output — revoke and rotate this credential>'

In [94]:
# Pass every UK stream to register() together with the token.
# NOTE(review): prod=True presumably targets the production deployment, so
# re-running this cell may have lasting side effects — confirm before running.
for s in uk_streams:
    register(s, token, prod=True)

### LTLA

In [5]:
# Select the streams tagged with the 'ltla' keyword and show how many there are.
ltla_streams = [stream for stream in streams if 'ltla' in stream['keywords']]
len(ltla_streams)

4

Get all area codes. This could be done via the API, but here we use the downloaded data instead, so that only area codes with a complete set of files are selected.

In [8]:
# Expand each LTLA stream into one stream per area code. Each entry under the
# download directory is treated as an area code (its name is the code), and it
# qualifies only when it holds exactly one CSV per LTLA stream.
n_expected_files = len(ltla_streams)
area_code_ltla_streams = []
for p in Path('../../data/live/phe/ltla/').iterdir():
    csv_count = sum(1 for _ in p.glob('*.csv'))
    if csv_count != n_expected_files:
        continue  # incomplete (or no) downloads for this entry; skip it
    area_code_ltla_streams.extend(
        {
            **s,
            # Point the endpoint at this area's product path.
            'endpoint': s['endpoint'].replace('phe/ltla', 'phe/ltla/' + p.name),
            # Build a new list so the source stream's keywords stay untouched.
            'keywords': s['keywords'] + [p.name],
        }
        for s in ltla_streams
    )
area_code_ltla_streams[0], len(area_code_ltla_streams)

({'urlCode': 'API_PY',
  'endpoint': '/data/?product=phe/ltla/s12000041&component=newCasesBySpecimenDate&format=long',
  'dataType': 'timeseries',
  'keywords': ['phe', 'ltla', 'new_cases', 'daily', 's12000041'],
  'description': ''},
 1344)

In [19]:
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/ltla').json()
# area_codes_dict = {a['areaCode'].lower(): a['areaName'] for a in area_codes}
# print(json.dumps(area_codes_dict, indent=4))

In [114]:
# Only the first 100 per-area streams are passed in, not the full list.
# NOTE(review): presumably a spot check that the endpoints respond — confirm
# against stream.test_endpoints.
test_endpoints(area_code_ltla_streams[:100])

In [95]:
# Pass every per-area LTLA stream to register(), reusing the token obtained earlier.
# NOTE(review): prod=True presumably targets the production deployment, so
# re-running this cell may have lasting side effects — confirm before running.
for s in area_code_ltla_streams:
    register(s, token, prod=True)

---
## Playground

In [69]:
# Age-demographic breakdowns per region: cases, deaths, vaccinations.
age_case_url = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=region'
    '&metric=newCasesBySpecimenDateAgeDemographics'
    '&format=csv'
)
age_death_url = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=region'
    '&metric=newDeaths28DaysByDeathDateAgeDemographics'
    '&format=csv'
)
age_vaccine_url = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=region'
    '&metric=vaccinationsAgeDemographics'
    '&format=csv'
)

# New cases and deaths per region, both by publish date, in one download.
case_death_url = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=region'
    '&metric=newCasesByPublishDate'
    '&metric=newDeaths28DaysByPublishDate'
    '&format=csv'
)

# Vaccination per region: cumulative people vaccinated and uptake percentage,
# first and second dose, in one download.
vaccine_url = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=region'
    '&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate'
    '&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate'
    '&metric=cumVaccinationFirstDoseUptakeByVaccinationDatePercentage'
    '&metric=cumVaccinationSecondDoseUptakeByVaccinationDatePercentage'
    '&format=csv'
)

In [15]:
# Same two case metrics at LTLA level: `url_one` is restricted to a single
# area code, `url_full` leaves the areaCode parameter out.
url_one = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla'
    '&areaCode=S12000036'
    '&metric=cumCasesBySpecimenDate'
    '&metric=newCasesBySpecimenDate'
    '&format=csv'
)
url_full = (
    'https://api.coronavirus.data.gov.uk/v2/data?areaType=ltla'
    '&metric=cumCasesBySpecimenDate'
    '&metric=newCasesBySpecimenDate'
    '&format=csv'
)