# Getting data from Public Health England

In [20]:
# Put the parent directory on sys.path (only if it is not already there) so
# that modules located there can be imported from this notebook -- presumably
# the local `stream` module imported further down; TODO confirm.
import os
import sys
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
# IPython autoreload, mode 2: re-import modified modules before running a cell.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import json
from collections import Counter
from pathlib import Path
from urllib import parse
import json
import requests
import pandas as pd
from pprint import pprint

from stream import test_endpoints, get_token, register, generate_streams_from_urls

In [28]:
def generate_streams(base_streams, codes):
    """Expand every base stream into one stream per area code.

    The result is ordered with base streams outermost and codes innermost.
    Each entry is a shallow copy of its base stream in which:

    * ``endpoint`` has ``&areaCode=<code>`` appended, and
    * ``keywords`` is a new list ending with the lower-cased code.

    The base stream dicts and their ``keywords`` lists are left unmodified.
    """
    return [
        {
            **base,
            'endpoint': base['endpoint'] + '&areaCode=' + code,
            # Concatenation builds a new list, so the base keywords are not shared.
            'keywords': base['keywords'] + [code.lower()],
        }
        for base in base_streams
        for code in codes
    ]

In [59]:
# Token that is passed to every register(...) call below. prod=True is
# requested here -- presumably this selects the production environment;
# confirm against stream.get_token.
token = get_token(prod=True)

## Nation

In [86]:
# Load the base stream definitions for the nation level from the manifest.
with open('../manifest/nation-streams.json') as f:
    base_streams = json.load(f)

# Show the first definition as a sanity check; the trailing expression makes
# the notebook display how many base streams were loaded.
print(base_streams[0])
len(base_streams)

{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=nation&metric=cumAdmissions&metric=newAdmissions&format=json', 'dataType': 'timeseries', 'keywords': ['phe', 'nation', 'hospital_admission', 'daily'], 'description': ''}


6

In [94]:
# Fetch the nation areas from the API and keep only their area codes.
area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/nation').json()
area_codes = [a['areaCode'] for a in area_codes]
print('area_codes length', len(area_codes))
# Exclude E92000001 from this run (the reason is not recorded in this notebook).
# NOTE(review): list.remove raises ValueError if the code is missing from the
# API response.
area_codes.remove('E92000001')
print('area_codes length', len(area_codes))

area_codes length 4
area_codes length 3


In [95]:
# Disabled alternative: restrict the run to the single code that the previous
# cell removes from the list.
# area_codes = ['E92000001']
# One stream per (base stream, area code) pair; the trailing expression makes
# the notebook display the total.
streams = generate_streams(base_streams, area_codes)
len(streams)

18

In [96]:
# Pass every generated nation stream to test_endpoints before registering.
# base_url='' is given -- presumably because each 'endpoint' is already a full
# URL; confirm against stream.test_endpoints.
test_endpoints(streams, base_url='')

In [97]:
# Register each generated nation stream, one call per stream, using the token
# obtained earlier and prod=True.
for s in streams:
    register(s, token, prod=True)

## UTLA

In [110]:
# Load the base stream definitions for the UTLA level from the manifest.
# This rebinds base_streams, replacing the definitions loaded earlier.
with open('../manifest/utla-streams.json') as f:
    base_streams = json.load(f)

# Show the first definition as a sanity check; the trailing expression makes
# the notebook display how many base streams were loaded.
print(base_streams[0])
len(base_streams)

{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=newCasesBySpecimenDate&metric=newDeaths28DaysByDeathDate&metric=newPeopleVaccinatedFirstDoseByVaccinationDate&metric=newPeopleVaccinatedSecondDoseByVaccinationDate&metric=cumVaccinationThirdInjectionUptakeByVaccinationDatePercentage&format=json', 'dataType': 'timeseries', 'keywords': ['phe', 'utla', 'group1'], 'description': ''}


2

In [None]:
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/utla').json()
# sorted(a['areaCode'].lower() for a in area_codes)

In [111]:
# Fetch the UTLA areas from the API and keep only their area codes.
area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/utla').json()
area_codes = [a['areaCode'] for a in area_codes]
print('area_codes length', len(area_codes))
# Exclude S12000033 from this run (the reason is not recorded in this notebook).
# NOTE(review): list.remove raises ValueError if the code is missing from the
# API response.
area_codes.remove('S12000033')
print('area_codes length', len(area_codes))

area_codes length 216
area_codes length 215


In [112]:
# Disabled alternative: restrict the run to the single code that the previous
# cell removes from the list.
# area_codes = ['S12000033']
# One stream per (base stream, area code) pair; the trailing expression makes
# the notebook display the total.
streams = generate_streams(base_streams, area_codes)
len(streams)

430

In [113]:
# Only the first 10 generated UTLA streams are passed to test_endpoints, not
# the full list that is registered afterwards.
test_endpoints(streams[:10], base_url='')

In [114]:
# Register each generated UTLA stream, one call per stream, using the token
# obtained earlier and prod=True.
for s in streams:
    register(s, token, prod=True)

## MSOA

In [115]:
# Load the base stream definitions for the MSOA level from the manifest.
# This rebinds base_streams, replacing the definitions loaded earlier.
with open('../manifest/msoa-streams.json') as f:
    base_streams = json.load(f)

# Show the first definition as a sanity check; the trailing expression makes
# the notebook display how many base streams were loaded.
print(base_streams[0])
len(base_streams)

{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=msoa&metric=newCasesBySpecimenDateRollingSum&metric=newCasesBySpecimenDateRollingRate&metric=newCasesBySpecimenDateChangePercentage&format=json', 'dataType': 'timeseries', 'keywords': ['phe', 'msoa', 'group1'], 'description': ''}


2

In [None]:
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/msoa').json()
# pprint(sorted(a['areaCode'].lower() for a in area_codes))

In [116]:
# Fetch the MSOA areas from the API and keep only their area codes.
area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/msoa').json()
area_codes = [a['areaCode'] for a in area_codes]
print('area_codes length', len(area_codes))
# Exclude E02000961 from this run (the reason is not recorded in this notebook).
# NOTE(review): list.remove raises ValueError if the code is missing from the
# API response.
area_codes.remove('E02000961')
print('area_codes length', len(area_codes))

area_codes length 6791
area_codes length 6790


In [117]:
# Disabled alternative: restrict the run to the single code that the previous
# cell removes from the list.
# area_codes = ['E02000961']
# One stream per (base stream, area code) pair; the trailing expression makes
# the notebook display the total.
streams = generate_streams(base_streams, area_codes)
len(streams)

13580

In [122]:
# Only the first 10 generated MSOA streams are passed to test_endpoints, not
# the full list that is registered afterwards.
test_endpoints(streams[:10], base_url='')

In [124]:
# Register each generated MSOA stream using the token obtained earlier and
# prod=True. NOTE(review): this makes one register call per stream; the
# recorded output of the generation cell shows 13580 streams for this level.
for s in streams:
    register(s, token, prod=True)

## NHS Region

In [48]:
# with open('../manifest/nhsregion-streams.json') as f:
#     base_streams = json.load(f)

# print(base_streams[0])
# len(base_streams)

{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=nhsRegion&metric=cumAdmissions&metric=hospitalCases&metric=newAdmissions&format=json', 'dataType': 'timeseries', 'keywords': ['phe', 'nhsregion', 'group1'], 'description': ''}


2

In [24]:
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/nhsregion').json()
# pprint(sorted(a['areaCode'].lower() for a in area_codes))

['e40000003',
 'e40000005',
 'e40000006',
 'e40000007',
 'e40000008',
 'e40000009',
 'e40000010']


In [25]:
# # To update title generation
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/nhsregion').json()
# area_codes_dict = {a['areaCode'].lower(): a['areaName'] for a in area_codes}
# print(json.dumps(area_codes_dict, indent=4))

{
    "e40000007": "East of England",
    "e40000003": "London",
    "e40000008": "Midlands",
    "e40000009": "North East and Yorkshire",
    "e40000010": "North West",
    "e40000005": "South East",
    "e40000006": "South West"
}


In [49]:
# area_codes = ['E40000007']
# streams = generate_streams(base_streams, area_codes)
# len(streams)

2

In [51]:
# test_endpoints(streams, base_url='')

In [52]:
# for s in streams:
#     register(s, token, prod=True)

## Region

In [98]:
# Load the base stream definitions for the region level from the manifest.
# This rebinds base_streams, replacing the definitions loaded earlier.
with open('../manifest/region-streams.json') as f:
    base_streams = json.load(f)

# Show the first definition as a sanity check; the trailing expression makes
# the notebook display how many base streams were loaded.
print(base_streams[0])
len(base_streams)

{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=region&metric=cumCasesBySpecimenDate&metric=newCasesBySpecimenDate&metric=uniqueCasePositivityBySpecimenDateRollingSum&metric=cumVaccinationFirstDoseUptakeByVaccinationDatePercentage&metric=cumVaccinationSecondDoseUptakeByVaccinationDatePercentage&format=json', 'dataType': 'timeseries', 'keywords': ['phe', 'region', 'group1'], 'description': ''}


8

In [54]:
# area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/region').json()
# pprint(sorted(a['areaCode'].lower() for a in area_codes))

['e12000001',
 'e12000002',
 'e12000003',
 'e12000004',
 'e12000005',
 'e12000006',
 'e12000007',
 'e12000008',
 'e12000009']


In [56]:
# # To update title generation
# area_codes_dict = {a['areaCode'].lower(): a['areaName'] for a in area_codes}
# print(json.dumps(area_codes_dict, indent=4))

{
    "e12000004": "East Midlands",
    "e12000006": "East of England",
    "e12000007": "London",
    "e12000001": "North East",
    "e12000002": "North West",
    "e12000008": "South East",
    "e12000009": "South West",
    "e12000005": "West Midlands",
    "e12000003": "Yorkshire and The Humber"
}


In [106]:
# Fetch the region areas from the API and keep only their area codes.
area_codes = requests.get('https://api.coronavirus.data.gov.uk/generic/area/region').json()
area_codes = [a['areaCode'] for a in area_codes]
print('area_codes length', len(area_codes))
# Exclude E12000007 from this run (the reason is not recorded in this notebook).
# NOTE(review): list.remove raises ValueError if the code is missing from the
# API response.
area_codes.remove('E12000007')
print('area_codes length', len(area_codes))

area_codes length 9
area_codes length 8


In [107]:
# Disabled alternative: restrict the run to the single code that the previous
# cell removes from the list.
# area_codes = ['E12000007']
# One stream per (base stream, area code) pair; the trailing expression makes
# the notebook display the total.
streams = generate_streams(base_streams, area_codes)
len(streams)

64

In [108]:
# Only the first 10 generated region streams are passed to test_endpoints, not
# the full list that is registered afterwards.
test_endpoints(streams[:10], base_url='')

In [109]:
# Register each generated region stream, one call per stream, using the token
# obtained earlier and prod=True.
for s in streams:
    register(s, token, prod=True)