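# Exploring some ideas

Scratch notebook with two experiments: aggregating Scotland health-board population counts into age bands, and generating display titles from data-stream keywords.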

In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import shutil
import re

In [58]:
# Health-board code -> board name, built from ordered (code, name) pairs.
# Four names (Fife, Tayside, Greater Glasgow and Clyde, Lanarkshire) are listed
# under two codes each, so either code resolves to the same board name.
# NOTE(review): presumably S08000029-S08000032 are revised codes for those
# boards -- confirm against the data source.
nhs_code_to_name = dict([
    ('S08000015', 'Ayrshire and Arran'),
    ('S08000016', 'Borders'),
    ('S08000017', 'Dumfries and Galloway'),
    ('S08000018', 'Fife'),
    ('S08000019', 'Forth Valley'),
    ('S08000020', 'Grampian'),
    ('S08000021', 'Greater Glasgow and Clyde'),
    ('S08000022', 'Highland'),
    ('S08000023', 'Lanarkshire'),
    ('S08000024', 'Lothian'),
    ('S08000025', 'Orkney'),
    ('S08000026', 'Shetland'),
    ('S08000027', 'Tayside'),
    ('S08000028', 'Western Isles'),
    # Second codes for names already listed above.
    ('S08000029', 'Fife'),
    ('S08000030', 'Tayside'),
    ('S08000031', 'Greater Glasgow and Clyde'),
    ('S08000032', 'Lanarkshire'),
])

In [71]:
def process_health_board(in_file, group_file, all_file, is_gender):
    """Aggregate single-year-of-age columns into age bands and a grand total.

    Reads ``in_file`` (a CSV indexed by its 'health board' column), sums the
    per-age columns into the bands 0, 1-14, 15-44, 45-64, 65-74, 75-84 and 85+,
    and computes an overall total.  The first run of digits in a column name is
    taken as its age.  The index is then mapped through ``nhs_code_to_name``
    and sorted before the two result tables are written.

    Args:
        in_file: path of the input CSV.
        group_file: path the age-band columns are written to.
        all_file: path the total ('all') column(s) are written to.
        is_gender: if True, bands and totals are computed separately for the
            'males___' and 'females___' column prefixes; if False, every
            column of the input is treated as an age column.

    Raises:
        AssertionError: if a band does not match exactly one source column per
            year of age it covers.
    """
    df = pd.read_csv(in_file, index_col='health board')

    # 1. Grouping
    # 91: the last group is 90+
    age_groups = [range(0, 1), range(1, 15), range(15, 45), range(45, 65), range(65, 75), range(75, 85), range(85, 91)]
    # Snapshot the columns as read from the CSV.  Selecting from the live
    # df.columns would also pick up the band columns appended below: with the
    # empty prefix (is_gender=False) they matched too and doubled the total.
    source_cols = list(df.columns)
    prefixes = ['males___', 'females___'] if is_gender else ['']
    grouped_cols = []
    all_cols = []
    for prefix in prefixes:
        prefixed_cols = [c for c in source_cols if c.startswith(prefix)]
        for group in age_groups:
            cols = [c for c in prefixed_cols if int(re.search(r'(\d+)', c).group()) in group]
            assert len(cols) == len(group), (
                f'expected {len(group)} columns for prefix {prefix!r} ages '
                f'{group[0]}-{group[-1]}, found {len(cols)}'
            )
            m1, m2 = group[0], group[-1]
            suffix = f'{m1} years and over' if m1 == 85 else f'{m1} years' if m1 == m2 else f'{m1}-{m2} years'
            # 'males___' -> 'male___' and 'females___' -> 'female___' in output names.
            new_col = (prefix + suffix).replace('males', 'male')
            df[new_col] = df[cols].sum(axis=1)
            grouped_cols.append(new_col)

        # Total over the source columns only, never over the derived bands.
        all_col = prefix + 'all'
        df[all_col] = df[prefixed_cols].sum(axis=1)
        all_cols.append(all_col)

    # 2. Rename index
    df.index = df.index.map(nhs_code_to_name)
    df = df.sort_index()
    df[grouped_cols].to_csv(group_file)
    df[all_cols].to_csv(all_file)
    
# Run the aggregation on the Scotland health-board table (gender-prefixed
# columns); both result files are written next to the input file.
process_health_board(
    in_file='../data/static/human/demographics/population/scotland/health board_age_genders.csv',
    group_file='../data/static/human/demographics/population/scotland/health board_age_genders_grouped.csv',
    all_file='../data/static/human/demographics/population/scotland/health board_age_genders_all.csv',
    is_gender=True,
)

In [182]:
# Keyword slugs for the Scottish council areas.  They become part of LOCATIONS,
# which generate_title() searches to find the location keyword of a stream.
SCOTLAND_COUNCILS = ['aberdeen_city', 'aberdeenshire', 'angus', 'argyll_and_bute', 'city_of_edinburgh', 'clackmannanshire', 'dumfries_and_galloway', 'dundee_city', 'east_ayrshire', 'east_dunbartonshire', 'east_lothian', 'east_renfrewshire', 'falkirk', 'fife', 'glasgow_city', 'highland', 'inverclyde', 'midlothian', 'moray', 'na_h_eileanan_siar', 'north_ayrshire', 'north_lanarkshire', 'orkney_islands', 'perth_and_kinross', 'renfrewshire', 'scottish_borders', 'shetland_islands', 'south_ayrshire', 'south_lanarkshire', 'stirling', 'west_dunbartonshire', 'west_lothian']
# Keyword slugs for the remaining (non-Scottish) local authorities; combined
# with SCOTLAND_COUNCILS into COUNCILS.
# NOTE(review): despite the name, some entries look like Welsh authorities
# (e.g. 'cardiff', 'swansea', 'wrexham') -- confirm, and consider a more
# accurate name if so.
ENGLAND_COUNCILS = ['adur', 'allerdale', 'amber_valley', 'arun', 'ashfield', 'ashford', 'babergh', 'barking_and_dagenham', 'barnet', 'barnsley', 'barrow_in_furness', 'basildon', 'basingstoke_and_deane', 'bassetlaw', 'bath_and_north_east_somerset', 'bedford', 'bexley', 'birmingham', 'blaby', 'blackburn_with_darwen', 'blackpool', 'blaenau_gwent', 'bolsover', 'bolton', 'boston', 'bournemouth,_christchurch_and_poole', 'bracknell_forest', 'bradford', 'braintree', 'breckland', 'brent', 'brentwood', 'bridgend', 'brighton_and_hove', 'bristol,_city_of', 'broadland', 'bromley', 'bromsgrove', 'broxbourne', 'broxtowe', 'buckinghamshire', 'burnley', 'bury', 'caerphilly', 'calderdale', 'cambridge', 'camden', 'cannock_chase', 'canterbury', 'cardiff', 'carlisle', 'carmarthenshire', 'castle_point', 'central_bedfordshire', 'ceredigion', 'charnwood', 'chelmsford', 'cheltenham', 'cherwell', 'cheshire_east', 'cheshire_west_and_chester', 'chesterfield', 'chichester', 'chorley', 'city_of_london', 'colchester', 'conwy', 'copeland', 'corby', 'cornwall', 'cotswold', 'county_durham', 'coventry', 'craven', 'crawley', 'croydon', 'dacorum', 'darlington', 'dartford', 'daventry', 'denbighshire', 'derby', 'derbyshire_dales', 'doncaster', 'dorset', 'dover', 'dudley', 'ealing', 'east_cambridgeshire', 'east_devon', 'east_hampshire', 'east_hertfordshire', 'east_lindsey', 'east_northamptonshire', 'east_riding_of_yorkshire', 'east_staffordshire', 'east_suffolk', 'eastbourne', 'eastleigh', 'eden', 'elmbridge', 'enfield', 'epping_forest', 'epsom_and_ewell', 'erewash', 'exeter', 'fareham', 'fenland', 'flintshire', 'folkestone_and_hythe', 'forest_of_dean', 'fylde', 'gateshead', 'gedling', 'gloucester', 'gosport', 'gravesham', 'great_yarmouth', 'greenwich', 'guildford', 'gwynedd', 'hackney', 'halton', 'hambleton', 'hammersmith_and_fulham', 'harborough', 'haringey', 'harlow', 'harrogate', 'harrow', 'hart', 'hartlepool', 'hastings', 'havant', 'havering', 'herefordshire,_county_of', 'hertsmere', 'high_peak', 
'hillingdon', 'hinckley_and_bosworth', 'horsham', 'hounslow', 'huntingdonshire', 'hyndburn', 'ipswich', 'isle_of_anglesey', 'isle_of_wight', 'isles_of_scilly', 'islington', 'kensington_and_chelsea', 'kettering', "king's_lynn_and_west_norfolk", 'kingston_upon_hull,_city_of', 'kingston_upon_thames', 'kirklees', 'knowsley', 'lambeth', 'lancaster', 'leeds', 'leicester', 'lewes', 'lewisham', 'lichfield', 'lincoln', 'liverpool', 'luton', 'maidstone', 'maldon', 'malvern_hills', 'manchester', 'mansfield', 'medway', 'melton', 'mendip', 'merthyr_tydfil', 'merton', 'mid_devon', 'mid_suffolk', 'mid_sussex', 'middlesbrough', 'milton_keynes', 'mole_valley', 'monmouthshire', 'neath_port_talbot', 'new_forest', 'newark_and_sherwood', 'newcastle_under_lyme', 'newcastle_upon_tyne', 'newham', 'newport', 'north_devon', 'north_east_derbyshire', 'north_east_lincolnshire', 'north_hertfordshire', 'north_kesteven', 'north_lincolnshire', 'north_norfolk', 'north_somerset', 'north_tyneside', 'north_warwickshire', 'north_west_leicestershire', 'northampton', 'northumberland', 'norwich', 'nottingham', 'nuneaton_and_bedworth', 'oadby_and_wigston', 'oldham', 'oxford', 'pembrokeshire', 'pendle', 'peterborough', 'plymouth', 'portsmouth', 'powys', 'preston', 'reading', 'redbridge', 'redcar_and_cleveland', 'redditch', 'reigate_and_banstead', 'rhondda_cynon_taf', 'ribble_valley', 'richmond_upon_thames', 'richmondshire', 'rochdale', 'rochford', 'rossendale', 'rother', 'rotherham', 'rugby', 'runnymede', 'rushcliffe', 'rushmoor', 'rutland', 'ryedale', 'salford', 'sandwell', 'scarborough', 'sedgemoor', 'sefton', 'selby', 'sevenoaks', 'sheffield', 'shropshire', 'slough', 'solihull', 'somerset_west_and_taunton', 'south_cambridgeshire', 'south_derbyshire', 'south_gloucestershire', 'south_hams', 'south_holland', 'south_kesteven', 'south_lakeland', 'south_norfolk', 'south_northamptonshire', 'south_oxfordshire', 'south_ribble', 'south_somerset', 'south_staffordshire', 'south_tyneside', 'southampton', 
'southend_on_sea', 'southwark', 'spelthorne', 'st._helens', 'st_albans', 'stafford', 'staffordshire_moorlands', 'stevenage', 'stockport', 'stockton_on_tees', 'stoke_on_trent', 'stratford_on_avon', 'stroud', 'sunderland', 'surrey_heath', 'sutton', 'swale', 'swansea', 'swindon', 'tameside', 'tamworth', 'tandridge', 'teignbridge', 'telford_and_wrekin', 'tendring', 'test_valley', 'tewkesbury', 'thanet', 'three_rivers', 'thurrock', 'tonbridge_and_malling', 'torbay', 'torfaen', 'torridge', 'tower_hamlets', 'trafford', 'tunbridge_wells', 'uttlesford', 'vale_of_glamorgan', 'vale_of_white_horse', 'wakefield', 'walsall', 'waltham_forest', 'wandsworth', 'warrington', 'warwick', 'watford', 'waverley', 'wealden', 'wellingborough', 'welwyn_hatfield', 'west_berkshire', 'west_devon', 'west_lancashire', 'west_lindsey', 'west_oxfordshire', 'west_suffolk', 'westminster', 'wigan', 'wiltshire', 'winchester', 'windsor_and_maidenhead', 'wirral', 'woking', 'wokingham', 'wolverhampton', 'worcester', 'worthing', 'wrexham', 'wychavon', 'wyre', 'wyre_forest', 'york']
# Keyword vocabularies used by generate_title().
# LOCATIONS is ordered councils -> regions -> countries; find_keyword() returns
# the first vocabulary entry present in a stream's keywords, so a council wins
# over a region and a region over a country when a stream carries several.
COUNCILS = [*SCOTLAND_COUNCILS, *ENGLAND_COUNCILS]
REGIONS = [
    'ayrshire_arran',
    'borders',
    'dumfries_galloway',
    'fife',
    'forth_valley',
    'grampian',
    'greater_glasgow_clyde',
    'highland',
    'lanarkshire',
    'lothian',
    'orkney',
    'shetland',
    'tayside',
    'western_isles',
]
COUNTRIES = ['england', 'scotland', 'wales']
LOCATIONS = [*COUNCILS, *REGIONS, *COUNTRIES]

TOPICS = [
    'vaccination',
    'all_deaths',
    'covid_deaths',
    'tests_carried_out',
    'hospital_confirmed',
    'icu_confirmed',
    'tests_reported',
    'new_cases',
]
TIMES = ['daily', 'weekly']
GROUPS = ['place_of_death', 'all_sexes_agegroups']

# Keyword -> display-name lookup used when composing titles.
name_mapping = json.loads(Path('temp-data/name_mapping.json').read_text())

In [101]:
def find_keyword(keywords, check_list):
    """Return the first entry of ``check_list`` that occurs in ``keywords``.

    Candidates are tried in ``check_list`` order, so that order decides which
    one is returned when several are present.  Returns None when none occur.
    """
    return next((candidate for candidate in check_list if candidate in keywords), None)

def max_loc(locs):
    """Return the broadest location found in ``locs``.

    Tiers are checked in the order COUNTRIES, REGIONS, COUNCILS; the first
    entry of ``locs`` belonging to the first tier with any match is returned.
    Returns None when no entry belongs to any tier.
    """
    for tier in (COUNTRIES, REGIONS, COUNCILS):
        match = next((loc for loc in locs if loc in tier), None)
        if match is not None:
            return match
    return None

def same_keyword(keywords):
    """Return the shared value when all entries of ``keywords`` are equal, else None.

    An empty sequence yields None.  Entries must be hashable.
    """
    distinct = set(keywords)
    return keywords[0] if len(distinct) == 1 else None

def generate_title(keywords_list):
    """Build a display title for one or more streams.

    Args:
        keywords_list: one keyword list per stream.

    Returns:
        The title from ``comnbine_to_title``.  A single stream uses its own
        location, time, topic and group.  Several streams use the broadest
        location (``max_loc``) and keep time, topic and group only when all
        streams agree on them (``same_keyword``).

    Raises:
        Exception: with args ``(keywords, message)`` when a stream has no
            location, time or topic keyword (checked in that order).
    """
    # Required vocabularies paired with the message raised when nothing matches.
    required = (
        (LOCATIONS, 'location missing'),
        (TIMES, 'should have daily or weekly'),
        (TOPICS, 'topic missing'),
    )
    locs, times, topics, groups = [], [], [], []
    for keywords in keywords_list:
        for (vocabulary, message), found in zip(required, (locs, times, topics)):
            keyword = find_keyword(keywords, vocabulary)
            if keyword is None:
                raise Exception(keywords, message)
            found.append(keyword)
        # The group keyword is optional, so None is recorded as-is.
        groups.append(find_keyword(keywords, GROUPS))

    if len(keywords_list) == 1:
        # Single stream
        loc, time, topic, group = locs[0], times[0], topics[0], groups[0]
    else:
        # Multiple streams
        loc, time, topic, group = max_loc(locs), same_keyword(times), same_keyword(topics), same_keyword(groups)
    return comnbine_to_title(loc, time, topic, group)
    
def comnbine_to_title(loc, time, topic, group):
    """Compose a title from keywords, using their ``name_mapping`` display names.

    - No topic: just the location name.
    - Topic but no time: '<location> - <topic>'.
    - Location, time and topic: '<location> - <time> <topic>'.
    - A group, when given together with a topic, appends ' by <group>'.

    The (misspelled) name is kept because ``generate_title`` calls it.
    """
    if topic is None:
        return name_mapping[loc]

    if time is None:
        title = f'{name_mapping[loc]} - {name_mapping[topic]}'
    elif loc and time and topic:
        title = f'{name_mapping[loc]} - {name_mapping[time]} {name_mapping[topic]}'
    else:
        # A falsy location or time leaves the base title empty.
        title = ''

    if group is None:
        return title
    return title + ' by ' + name_mapping[group]

In [108]:
# Sample keyword lists, one per data stream.
k1 = ["opendata", "scotland", "vaccination", "daily", "fife"]
k2 = ["opendata", "scotland", "vaccination", "daily", "entire_country", "all_sexes_agegroups"]
k3 = ["ons", "england", "mortality", "weekly", "local_authority", "place_of_death", "all_deaths", "carehome", "ashford"]
k4 = ["ons", "england", "mortality", "weekly", "local_authority", "place_of_death", "all_deaths", "hospital", "ashford"]
k5 = ["data_product", "scotland", "mortality", "weekly", "local_authority", "covid_deaths"]
k6 = ["data_product", "scotland", "testing", "daily", "testing_location", "tests_carried_out"]

# (streams shown together, expected title): single streams first, then pairs.
expected_titles = [
    ([k1], 'Fife - Daily vaccination'),
    ([k2], 'Scotland - Daily vaccination by genders, age groups'),
    ([k3], 'Ashford - Weekly all deaths by places'),
    ([k4], 'Ashford - Weekly all deaths by places'),
    ([k5], 'Scotland - Weekly COVID deaths'),
    ([k6], 'Scotland - Daily tests carried out'),
    ([k1, k2], 'Scotland - Daily vaccination'),
    ([k3, k4], 'Ashford - Weekly all deaths by places'),
    ([k5, k6], 'Scotland'),
]
for keyword_lists, expected in expected_titles:
    assert generate_title(keyword_lists) == expected

In [119]:
# JSON exports consumed by the cells below: `onto_data` is iterated to build
# the stream index, `onto_page` to collect the data ids bound to each page.
onto_data = json.loads(Path('temp-data/onto_data').read_text())
onto_page = json.loads(Path('temp-data/onto_page').read_text())

In [183]:
# Records carrying any of these keywords are left out of the stream index.
excluded_keywords = ['xl', 'mock', 'analytics', 'model']

# Stream id -> {'keywords': [...], 'description': ...} for every kept record.
streams = {}
for record in onto_data:
    keywords = record['keywords'].split(', ')
    if set(keywords).isdisjoint(excluded_keywords):
        streams[record['_id']['$oid']] = {
            'keywords': keywords,
            'description': record['description'],
        }

# Dry run: print the reason for every stream a title cannot be generated for.
for stream in streams.values():
    try:
        generate_title([stream['keywords']])
    except Exception as error:
        print(error)

In [184]:
# Data ids of each page's first binding, kept only when every id refers to a
# stream present in `streams`.
pages = [
    page['bindings'][0]['dataIds']
    for page in onto_page
    if all(data_id in streams for data_id in page['bindings'][0]['dataIds'])
]

# One generated title per kept page.
for page_data_ids in pages:
    print(generate_title([streams[data_id]['keywords'] for data_id in page_data_ids]))

Oxford - Weekly all deaths by places
Oxford - Weekly all deaths by places
Cannock Chase - Weekly all deaths by places
Maidstone - Weekly all deaths by places
Monmouthshire - Weekly all deaths by places
Newcastle upon Tyne - Weekly all deaths by places
Richmond upon Thames - Weekly all deaths by places
Sevenoaks - Weekly all deaths by places
South Staffordshire - Weekly all deaths by places
St Albans - Weekly all deaths by places
Tameside - Weekly all deaths by places
Westminster - Weekly all deaths by places
Bassetlaw - Weekly all deaths by places
Brent - Weekly all deaths by places
Cambridge - Weekly all deaths by places
Colchester - Weekly all deaths by places
Copeland - Weekly all deaths by places
Croydon - Weekly all deaths by places
East Staffordshire - Weekly all deaths by places
Folkestone and Hythe - Weekly all deaths by places
Fylde - Weekly all deaths by places
Havant - Weekly all deaths by places
Hinckley and Bosworth - Weekly all deaths by places
Ipswich - Weekly all deaths