In [69]:
import pandas as pd
import numpy as np
from random import randrange

In [70]:
## THIS IS NOT PART OF THE PIPELINE BUT CREATES SYNTHETIC 2021 DATA
## 
# Read data table
df = pd.read_csv('./ethnicity_2011.csv')

# Some area names contain a literal asterisk in the title; remove it and trim
# the surrounding whitespace. regex=False treats "*" as a plain character no
# matter which default the installed pandas uses (a lone "*" is not a valid
# regular expression).
df['Area Name'] = df['Area Name'].str.replace("*", "", regex=False).str.strip()

# CONVERT STRINGS TO INTEGERS

def convert_to_int(x):
    """Normalise a single table cell.

    * A string holding an integer, optionally with comma thousands separators
      (``"1,234"``), is returned as an ``int``.
    * Any other string is returned with the typographic apostrophe (U+2019)
      replaced by a plain ASCII apostrophe.
    * A value without a usable ``replace`` method (numbers, NaN, ``None``, ...)
      is returned unchanged.

    Only the errors the conversion itself can raise are caught, so unrelated
    exceptions such as ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        return int(x.replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # Not a string, or a string that does not spell an integer.
        pass

    try:
        return x.replace("’", "'")
    except (AttributeError, TypeError):
        return x

# Run the conversion over every cell of the table (all columns, element by element).
df = df.applymap(convert_to_int)

# Create synthetic data in place of 2021 data
from random import random

def create_synth(x, min_factor=0.6, spread=1.0):
    """Return a randomly rescaled copy of an integer cell.

    Parameters
    ----------
    x : object
        Cell value. Only values whose type is exactly ``int`` are rescaled;
        everything else (strings, floats, bools, ...) is returned unchanged.
    min_factor : float, default 0.6
        Smallest multiplier that can be applied.
    spread : float, default 1.0
        Width of the multiplier range; the multiplier is drawn uniformly from
        ``[min_factor, min_factor + spread)``.

    Returns
    -------
    object
        ``round(x * multiplier)`` for ``int`` input, otherwise ``x`` itself.
    """
    # Exact type check on purpose: bool is a subclass of int and must pass through.
    if type(x) is not int:
        return x
    factor = random() * spread + min_factor
    return round(x * factor)

# Build the synthetic table: create_synth is applied to each cell independently,
# so every integer gets its own random factor and other values pass through.
dfsynth = df.applymap(create_synth)


In [71]:
# Write the synthetic table to the working directory, omitting the index column.
dfsynth.to_csv('ethnicity_2021.csv', index = False)

In [72]:
def rand_list(length):
    """Return a list of ``length`` integers, each drawn with ``randrange(100, 100000)``."""
    return [randrange(100, 100000) for _ in range(length)]

In [73]:
# Keep only the two area identifier columns; this frame is copied below as the
# starting point of each topic table.
dfempty = dfsynth[['Area Code','Area Name']]

In [74]:
# One independent copy of the identifier-only frame per topic table.
(
    care_df,
    religion_df,
    ethnicity_df,
    health_df,
    economic_df,
    household_df,
    marital_df,
    hours_df,
    tenure_df,
    disability_df,
    national_df,
    welsh_df,
    population_df,
    density_df,
    age10yr_df,
    agemed_df,
    cob_df,
) = (dfempty.copy() for _ in range(17))


In [75]:
# Load the country-of-birth table, discard its 'Unnamed: 1' column and keep
# the labels as a plain Python list.
df_COBvars = pd.read_csv('./CountryOfBirthVars.csv').drop(['Unnamed: 1'], axis=1)
cob_variables = list(df_COBvars['Country of Birth'])

In [76]:
# Column labels for each topic table, one list per topic.

# Single-column topics
population_variables = ['Population']
density_variables = ['Density']
agemed_variables = ['Median Age']

# Ten-year age bands '0-9' ... '70-79', followed by the open-ended top band
age10yr_variables = [f'{start}-{start + 9}' for start in range(0, 80, 10)] + ['80plus']

welsh_variables = [
    'Can speak Welsh',
    'Cannot speak Welsh',
]

care_variables = [
    'Provides no unpaid care',
    'Provides 19 or less hours unpaid care a week',
    'Provides 20 to 49 hours unpaid care a week',
    'Provides 50 or more hours unpaid care a week',
]

religion_variables = [
    'No religion',
    'Christian',
    'Buddhist',
    'Hindu',
    'Jewish',
    'Muslim',
    'Sikh',
    'Other religion',
    'Not answered',
]

ethnicity_variables = [
    'Asian, Asian British or Asian Welsh',
    'Black, Black British, Black Welsh, Caribbean or African',
    'Mixed or Multiple ethnic groups',
    'White',
    'Other ethnic group',
]

health_variables = [
    'Very good or good health',
    'Fair health',
    'Very bad or bad health',
]

economic_variables = [
    'Economically active (excluding full-time students): In employment',
    'Economically active (excluding full-time students): Unemployed',
    'Economically active and a full-time student: In employment',
    'Economically active and a full-time student: Unemployed',
    'Economically inactive (excluding full-time students)',
    'Economically inactive and a full-time student',
]

household_variables = [
    'One person household: Aged 66 years and over',
    'One person household: Other',
    'Single family household: All aged 66 years and over',
    'Single family household: Married or civil partnership couple: No children',
    'Single family household: Married or civil partnership couple: Dependent children',
    'Single family household: Married or civil partnership couple: All children non-dependent',
    'Single family household: Cohabiting couple family : No children',
    'Single family household: Cohabiting couple family : With dependent children',
    'Single family household: Cohabiting couple family : All children non-dependent',
    'Single family household: Lone parent family : With dependent children',
    'Single family household: Lone parent family : All children non-dependent',
    'Single family household: Other single family household: Other family composition',
    'Multiple-family household: With dependent children',
    'Multiple-Family Household : Other, including all full-time students and all aged 66 years and over',
]

marital_variables = [
    'Never married and never registered a civil partnership',
    'Married or in a registered civil partnership',
    'Separated, but still legally married or still legally in a civil partnership',
    'Divorced or civil partnership dissolved',
    'Widowed or surviving civil partnership partner',
]

hours_variables = [
    'Part-time: 15 hours or less worked',
    'Part-time: 16 to 30 hours worked',
    'Full-time: 31 to 48 hours worked',
    'Full-time: 49 or more hours worked',
]

tenure_variables = [
    'Owns outright or with a mortgage or loan',
    'Shared ownership',
    'Rented from council or Local Authority',
    'Other social rented',
    'Private rented',
    'Lives rent free',
]

disability_variables = [
    'Disabled under the Equality Act: Day-to-day activities limited a lot',
    'Disabled under the Equality Act: Day-to-day activities limited a little',
    'Not disabled under the Equality Act',
]

national_variables = [
    'British only identity',
    'Welsh only identity',
    'Welsh and British only identity',
    'English only identity',
    'English and British only identity',
    'Any other combination of only UK identities',
    'Non-UK identity only',
    'UK identity and non-UK identity',
]

In [77]:
# Registry of topics: each entry bundles the column labels ('vars'), the frame
# that receives them ('df') and the stem used for the exported file names ('name').
vars_df = [
    {'vars': labels, 'df': frame, 'name': stem}
    for stem, labels, frame in (
        ('care', care_variables, care_df),
        ('religion', religion_variables, religion_df),
        ('ethnicity', ethnicity_variables, ethnicity_df),
        ('health', health_variables, health_df),
        ('economic', economic_variables, economic_df),
        ('household', household_variables, household_df),
        ('marital', marital_variables, marital_df),
        ('hours', hours_variables, hours_df),
        ('tenure', tenure_variables, tenure_df),
        ('disability', disability_variables, disability_df),
        ('national', national_variables, national_df),
        ('welsh', welsh_variables, welsh_df),
        ('population', population_variables, population_df),
        ('density', density_variables, density_df),
        ('age10yr', age10yr_variables, age10yr_df),
        ('agemed', agemed_variables, agemed_df),
        ('cob', cob_variables, cob_df),
    )
]

In [78]:
# Fill every topic table with random counts and export a '<name>_2011.csv' /
# '<name>_2021.csv' pair per topic.
# NOTE(review): for the 'ethnicity' topic this writes 'ethnicity_2011.csv' and
# 'ethnicity_2021.csv' in the working directory, the same relative names this
# notebook reads and writes in its first cells. Confirm that replacing the
# input file with random data is intended.
for ob in vars_df:
    print('name', ob['name'])
    if not ob['vars']:
        # No columns to generate, so no files are written for this topic.
        continue
    for var in ob['vars']:
        ob['df'][var] = rand_list(ob['df'].shape[0])
    # Derive the synthetic frame and write both files once per topic, after all
    # columns exist, instead of rewriting the files after every single column.
    dfsynth = ob['df'].applymap(create_synth)
    ob['df'].to_csv(ob['name']+'_2011.csv', index = False)
    dfsynth.to_csv(ob['name']+'_2021.csv', index = False)

name care
name religion
name ethnicity
name health
name economic
name household
name marital
name hours
name tenure
name disability
name national
name welsh
name population
name density
name age10yr
name agemed
name cob


In [None]:
# Display the national-identity table for a visual check (the bare expression is rendered by the notebook).
national_df

Unnamed: 0,Area Code,Area Name,British only identity,English only identity,English and British only identity,Welsh only identity,Welsh and British only identity,Scottish only identity,Scottish and British only identity,Northern Irish only identity,Northern Irish and British only identity,Cornish only identity,Cornish and British only identity,Any other combination of UK identities (UK only),Irish only identity,Irish and at least one UK identity,Other identity only,Other identity and at least one UK identity
0,K04000001,ENGLAND AND WALES,29140,51633,31423,62408,14103,73919,42005,79962,59869,34302,51741,27827,88378,6707,93919,50035
1,E92000001,ENGLAND,77072,2312,80555,4149,34079,53477,77703,83932,58088,10558,87503,2030,63730,21588,77269,38567
2,E12000001,NORTH EAST,42108,60897,64460,3623,81484,68253,38256,97170,39684,82791,86196,79781,94353,41448,33013,61689
3,E06000047,County Durham,69268,41194,99850,95025,26509,89845,20116,22227,48912,19144,93537,52565,69748,37787,85949,39701
4,E06000005,Darlington,33978,21771,39265,56872,53141,53021,51830,82722,48059,38157,15280,21109,78934,49148,80802,420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,W06000018,Caerphilly,18938,8931,29676,10641,60009,28205,53952,21074,10293,75427,75382,12700,45956,81514,35205,10589
371,W06000019,Blaenau Gwent,24109,30690,66926,80239,8287,34043,13843,80114,15678,42012,66897,69281,80920,81043,39087,92663
372,W06000020,Torfaen,78139,83997,33906,41849,46017,52519,16837,65351,32001,94248,49074,28845,18180,55186,94569,51951
373,W06000021,Monmouthshire,11160,42746,52246,21798,99607,32411,95747,39091,9528,8041,63794,42633,33102,87908,51413,33176


In [None]:
# Create hierarchy lookups: one row per local authority district (LAD) with its
# county, region and country codes and names.
lacountydistcountry = pd.read_csv('./csv/lists/Ward_to_Local_Authority_District_to_County_to_Region_to_Country_(December_2021)_Lookup_in_United_Kingdom.csv')
lacountydistcountry = lacountydistcountry[['LAD21CD', 'LAD21NM', 'CTY21CD', 'CTY21NM', 'RGN21CD', 'RGN21NM', 'CTRY21CD', 'CTRY21NM']]

# Drop rows whose LAD code starts with 'S' or 'N'.
lacountydistcountry = lacountydistcountry[lacountydistcountry['LAD21CD'].apply(lambda x: x[0] not in ['S', 'N'])]

# .copy() gives an independent frame so the column assignments below are not
# made on a slice of the filtered data.
lacountydistcountry = lacountydistcountry.drop_duplicates().copy()

# Where the county code is missing, use the district's own code and name.
# (Row-wise `frame.loc[i]['col'] = value` assigns into a temporary copy of the
# row and leaves the frame untouched, so the fill is done column-wise.)
no_county = lacountydistcountry['CTY21CD'].isna()
lacountydistcountry['CTY21CD'] = lacountydistcountry['CTY21CD'].mask(no_county, lacountydistcountry['LAD21CD'])
lacountydistcountry['CTY21NM'] = lacountydistcountry['CTY21NM'].mask(no_county, lacountydistcountry['LAD21NM'])

# Where the region code is missing, use the country's code and name.
no_region = lacountydistcountry['RGN21CD'].isna()
lacountydistcountry['RGN21CD'] = lacountydistcountry['RGN21CD'].mask(no_region, lacountydistcountry['CTRY21CD'])
lacountydistcountry['RGN21NM'] = lacountydistcountry['RGN21NM'].mask(no_region, lacountydistcountry['CTRY21NM'])

lacountydistcountry.to_csv('LAD_County_Region_Country_2021.csv', index = False)

In [27]:
# Regenerate the random columns of every topic table and export the
# '<name>_2011.csv' / '<name>_2021.csv' pair again.
for ob in vars_df:
    if not ob['vars']:
        # No columns to generate, so no files are written for this topic.
        continue
    for var in ob['vars']:
        ob['df'][var] = rand_list(ob['df'].shape[0])
    # Derive the synthetic frame and write both files once per topic, after all
    # columns exist, instead of rewriting the files after every single column.
    dfsynth = ob['df'].applymap(create_synth)
    ob['df'].to_csv(ob['name']+'_2011.csv', index = False)
    dfsynth.to_csv(ob['name']+'_2021.csv', index = False)

In [31]:
# Display a dict mapping each country-of-birth label to a one-element list holding that same label.
{i: [i] for i in cob_variables}

{'Elsewhere': ['Elsewhere'],
 'Afghanistan': ['Afghanistan'],
 'Albania': ['Albania'],
 'Antarctica': ['Antarctica'],
 'Algeria': ['Algeria'],
 'American Samoa': ['American Samoa'],
 'Andorra': ['Andorra'],
 'Angola': ['Angola'],
 'Antigua and Barbuda': ['Antigua and Barbuda'],
 'Azerbaijan': ['Azerbaijan'],
 'Argentina': ['Argentina'],
 'Australia': ['Australia'],
 'Austria': ['Austria'],
 'Bahamas, The': ['Bahamas, The'],
 'Bahrain': ['Bahrain'],
 'Bangladesh': ['Bangladesh'],
 'Armenia': ['Armenia'],
 'Barbados': ['Barbados'],
 'Belgium': ['Belgium'],
 'Bermuda': ['Bermuda'],
 'Bhutan': ['Bhutan'],
 'Bolivia': ['Bolivia'],
 'Bosnia and Herzegovina': ['Bosnia and Herzegovina'],
 'Botswana': ['Botswana'],
 'Bouvet Island': ['Bouvet Island'],
 'Brazil': ['Brazil'],
 'Belize': ['Belize'],
 'British Indian Ocean Territory': ['British Indian Ocean Territory'],
 'Solomon Islands': ['Solomon Islands'],
 'British Virgin Islands': ['British Virgin Islands'],
 'Brunei': ['Brunei'],
 'Bulgaria'

In [32]:
# population_variables = ['Population']
# density_variables = ['Density']
# age10yr_variables = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80plus']
# agemed_variables = ['Median Age']
# welsh_variables = ['Speaks Welsh', "Doesn't speak Welsh"]
# care_variables = ['Not a carer', 'Yes, 9 hours a week or less', 'Yes, 10 to 19 hours a week', 'Yes, 20 to 34 hours a week', 'Yes, 35 to 49 hours a week', 'Yes, 50 or more hours a week']
# religion_variables = ['No religion',
# 'Christian',
# 'Buddhist',
# 'Hindu',
# 'Jewish',
# 'Muslim',
# 'Sikh',
# 'Other religion',
# 'Not stated']
# ethnicity_variables = ['Asian, Asian British or Asian Welsh',
# 'Black, Black British, Black Welsh, Carribean or African',
# 'Mixed or Multiple ethnic groups',
# 'White',
# 'Other ethnic group']
# health_variables = ['Very good health',
# 'Good health',
# 'Fair health',
# 'Bad health',
# 'Very bad health',]
# economic_variables = ['Economically active (excluding full-time students): In employment: Employee: Part-time',
# 'Economically active (excluding full-time students): In employment: Employee: Full-time',
# 'Economically active (excluding full-time students): In employment: Self-employed: Part-time',
# 'Economically active (excluding full-time students): In employment: Self-employed: Full-time',
# 'Economically active: Unemployed: Not a full-time student',
# 'Economically active: Full-time student',
# 'Economically inactive: Retired',
# 'Economically inactive (including full-time students): Student',
# 'Economically inactive: Looking after home or family',
# 'Economically inactive: Long-term sick or disabled',
# 'Economically inactive: Other']
# household_variables = ['One person household: Aged 66 years and over',
# 'One person household: Other',
# 'Single family household: All aged 66 years and over',
# 'Single family household: Married or civil partnership couple: No children',
# 'Single family household: Married or civil partnership couple: Dependent children',
# 'Single family household: Married or civil partnership couple: All children non-dependent',
# 'Single family household: Cohabiting couple family : No children',
# 'Single family household: Cohabiting couple family : With dependent children',
# 'Single family household: Cohabiting couple family : All children non-dependent',
# 'Single family household: Lone parent family : With dependent children',
# 'Single family household: Lone parent family : All children non-dependent',
# 'Single family household: Other single family household: Other family composition',
# 'Multiple-family household: With dependent children',
# 'Multiple-Family Household : Other, including all full-time students and all aged 66 years and over']
# marital_variables = ['Single never married or in a civil partnership',
# 'Married or in a registered civil partnership',
# 'Separated (but still legally married or still legally in a civil partnership)',
# 'Divorced or civil partnership dissolved',
# 'Widowed or surviving civil partnership partner' ]
# hours_variables = ['Part-time: 15 hours or less worked',
# 'Part-time: 16 to 30 hours worked',
# 'Full-time: 31 to 48 hours worked',
# 'Full-time: 49 or more hours worked']
# tenure_variables = ['Owned: Owns outright',
# 'Owned: Owns with a mortgage or loan',
# 'Shared ownership: Shared ownership',
# 'Social rented: Rents from council or Local Authority',
# 'Social rented: Other social rented',
# 'Private rented: Private landlord or letting agency',
# 'Private rented: Other private rented',
# 'Lives rent free']
# disability_variables = ['Disabled under the Equality Act: Day-to-day activities limited a lot',
# 'Disabled under the Equality Act: Day-to-day activities limited a little',
# 'Not disabled under the Equality Act :Has long term physical or mental health condition but day-to-day activities are not limited',
# 'Not disabled under the Equality Act: No long term physical or mental health conditions']
# national_variables = ['British only identity',
# 'English only identity',
# 'English and British only identity',
# 'Welsh only identity',
# 'Welsh and British only identity',
# 'Scottish only identity',
# 'Scottish and British only identity',
# 'Northern Irish only identity',
# 'Northern Irish and British only identity',
# 'Cornish only identity',
# 'Cornish and British only identity',
# 'Any other combination of UK identities (UK only)',
# 'Irish only identity',
# 'Irish and at least one UK identity',
# 'Other identity only',
# 'Other identity and at least one UK identity']