In [3]:
import pandas as pd
import requests
import re

In [4]:
#Get Family Group Table information from the Improvement Service's website
html = requests.get('https://www.improvementservice.org.uk/benchmarking/how-do-we-compare-councils').content
htmlTables = pd.read_html(html)
FamilyGroups_CSWH = htmlTables[0]
FamilyGroups_ECLEDC = htmlTables[1]

#Unpivot the data in ECLEDC
FamilyGroups_ECLEDC = FamilyGroups_ECLEDC.assign(id= 1)
FamilyGroups_ECLEDC = pd.melt(FamilyGroups_ECLEDC, id_vars=['id'], value_vars=['Family Group 1', 'Family Group 2', 'Family Group 3', 'Family Group 4'],var_name='Family_Group', value_name='Local_Authority')
FamilyGroups_ECLEDC = FamilyGroups_ECLEDC.drop(columns = ['id'])

#The table on the Improvement Service's website does not separate the local authority values into separate table rows but instead puts them all in as one piece of text without a delimeter. In the HTML they are separated by </br> but this does not work with pandas read_html. The only delimeter that can be used is the transition from lower case to uppercase. This is because the only time words run together are between local authorities. Here we replace those transitions with ";" so they can be expanded to new rows.
FamilyGroups = []
Local_Authority = []
Type = []
for row in FamilyGroups_ECLEDC.itertuples() :
    la_list = re.sub(r'(?<=[a-z])(?=[A-Z])', ';', row.Local_Authority)
    Local_Authority.append(la_list)
    FamilyGroups.append(row.Family_Group)
    Type.append('Environmental, Culture & Leisure, Economic Development, Corporate and Property indicators')
    
#Create dataframe to contain the results and rename columns appropriately
FamilyGroups_ECLEDC = pd.DataFrame(Local_Authority)
FamilyGroups_ECLEDC = FamilyGroups_ECLEDC.assign(Type = Type)
FamilyGroups_ECLEDC = FamilyGroups_ECLEDC.assign(Family_Group = FamilyGroups)
FamilyGroups_ECLEDC = FamilyGroups_ECLEDC.rename(columns = {0 : 'Local_Authority'})

#Unpivot the data in CSWH
FamilyGroups_CSWH = FamilyGroups_CSWH.assign(id= 1)
FamilyGroups_CSWH = pd.melt(FamilyGroups_CSWH, id_vars=['id'], value_vars=['Family Group 1', 'Family Group 2', 'Family Group 3', 'Family Group 4'],var_name='Family_Group', value_name='Local_Authority')
FamilyGroups_CSWH = FamilyGroups_CSWH.drop(columns = ['id'])

#The table on the Improvement Service's website does not separate the local authority values into separate table rows but instead puts them all in as one piece of text without a delimeter. In the HTML they are separated by </br> but this does not work with pandas read_html. The only delimeter that can be used is the transition from lower case to uppercase. This is because the only time words run together are between local authorities. Here we replace those transitions with ";" so they can be expanded to new rows.
FamilyGroups = []
Local_Authority = []
Type = []
for row in FamilyGroups_CSWH.itertuples() :
    la_list = re.sub(r'(?<=[a-z])(?=[A-Z])', ';', row.Local_Authority)
    Local_Authority.append(la_list)
    FamilyGroups.append(row.Family_Group)
    Type.append('Children, Social Work and Housing indicators')
    
#Create dataframe to contain the results and rename columns appropriately
FamilyGroups_CSWH = pd.DataFrame(Local_Authority)
FamilyGroups_CSWH = FamilyGroups_CSWH.assign(Type = Type)
FamilyGroups_CSWH = FamilyGroups_CSWH.assign(Family_Group = FamilyGroups)
FamilyGroups_CSWH = FamilyGroups_CSWH.rename(columns = {0 : 'Local_Authority'})


#Concatenate dataframes together and expand the delimeted local authorities to new rows.
Family_Groups = pd.concat([FamilyGroups_ECLEDC, FamilyGroups_CSWH])
Family_Groups = Family_Groups.assign(Local_Authority=Family_Groups.Local_Authority.str.split(";")).explode('Local_Authority')
#There is a mismatch between the naming of Edinburgh on the Improvement Services Family Groupings web page and the naming in the raw data file. Replace the text here to allow merges with the raw data file later.
Family_Groups['Local_Authority'] = Family_Groups['Local_Authority'].str.replace('Edinburgh, City of','Edinburgh City')
Family_Groups

Unnamed: 0,Local_Authority,Type,Family_Group
0,Eilean Siar,"Environmental, Culture & Leisure, Economic Dev...",Family Group 1
0,Argyll & Bute,"Environmental, Culture & Leisure, Economic Dev...",Family Group 1
0,Shetland Islands,"Environmental, Culture & Leisure, Economic Dev...",Family Group 1
0,Highland,"Environmental, Culture & Leisure, Economic Dev...",Family Group 1
0,Orkney Islands,"Environmental, Culture & Leisure, Economic Dev...",Family Group 1
...,...,...,...
3,North Ayrshire,"Children, Social Work and Housing indicators",Family Group 4
3,North Lanarkshire,"Children, Social Work and Housing indicators",Family Group 4
3,Inverclyde,"Children, Social Work and Housing indicators",Family Group 4
3,West Dunbartonshire,"Children, Social Work and Housing indicators",Family Group 4


In [5]:
Family_Groups.to_csv("Data Files//Family Groups.csv", index = False, encoding='utf-8-sig')