imports

In [74]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import re

datadir = "../data/original"
savedir = "../data/processed"

#### Simple Transformations

Open Population Data

In [75]:
# load population data
population_data = pd.read_csv(os.path.join(datadir, "ABS_Population_and_people_by_2021_SA2_(Beta).csv"))

### Look at changes from SA2 2016 to 2022 codes

In [76]:
changes_path="SA2_changes_16to21/"

##### Label Changes (only label changes; no SA2 code changes)

In [77]:
SA2_label_changes = pd.read_csv(changes_path+"SA2 label changes from 2016.csv")
SA2_label_changes.head()

Unnamed: 0,SA2_9DIG16,SA2_NAME16,SA2_9DIG21,SA2_NAME21
0,101031015,Cooma Region,101031015,Cooma Surrounds
1,101051540,Goulburn Region,101051540,Goulburn Surrounds
2,101061542,Yass Region,101061542,Yass Surrounds
3,101061544,Young Region,101061544,Young Surrounds
4,103011060,Bathurst Region,103011060,Bathurst Surrounds


##### Area Redesigns (split into multiple areas / change SA2 code)

In [78]:
SA2_redesign = pd.read_csv(changes_path+"SA2 redesign from 2016.csv")
lst1=[]
for i in range(len(SA2_redesign)):
    if not pd.isna(SA2_redesign.loc[i,"From (2016):"]):
        lst1.append(SA2_redesign.loc[i,"From (2016):"])
    else:
        lst1.append(lst[-1])
SA2_redesign["From (2016):"]=lst1
lst2=[]
for i in range(len(SA2_redesign)):
    if not pd.isna(SA2_redesign.loc[i,"To (Edition 3):"]):
        lst2.append(SA2_redesign.loc[i,"To (Edition 3):"])
    else:
        lst2.append(lst[-1])
SA2_redesign["To (Edition 3):"]=lst2 

SA2_redesign["SA2_9DIG16"] = [re.findall(r'\d+', item)[0] if not pd.isna(item) else np.nan for item in SA2_redesign["From (2016):"]]
SA2_redesign["SA2_NAME16"] = [re.findall(r'\D+', item)[0] if not pd.isna(item) else np.nan for item in SA2_redesign["From (2016):"]]

SA2_redesign["SA2_9DIG21"] = [re.findall(r'\d+',item)[0] if not pd.isna(item) else np.nan for item in SA2_redesign["To (Edition 3):"]]
SA2_redesign["SA2_NAME21"] = [re.findall(r'\D+',item)[0] if not pd.isna(item) else np.nan for item in SA2_redesign["To (Edition 3):"]]

SA2_redesign.head()

Unnamed: 0,From (2016):,To (Edition 3):,SA2_9DIG16,SA2_NAME16,SA2_9DIG21,SA2_NAME21
0,106021115 Maitland - East,106021614 East Maitland - Metford,106021115,Maitland - East,106021614,East Maitland - Metford
1,801101138 Molonglo - North,106021617 Tenambit - East Maitland,801101138,Molonglo - North,106021617,Tenambit - East Maitland
2,106021118 Thornton - Millers Forest,106021618 Thornton - Millers Forest,106021118,Thornton - Millers Forest,106021618,Thornton - Millers Forest
3,115011559 Kellyville,115011621 Kellyville - East,115011559,Kellyville,115011621,Kellyville - East
4,801101138 Molonglo - North,115011622 Kellyville - West,801101138,Molonglo - North,115011622,Kellyville - West


##### Area Splits (changes in label and SA2 code)

In [79]:
SA2_splits= pd.read_csv(changes_path+"SA2 splits from 2016.csv")
lst=[]
for i in range(len(SA2_splits)):
    if not pd.isna(SA2_splits.loc[i,"From (2016):"]):
        lst.append(SA2_splits.loc[i,"From (2016):"])
    else:
        lst.append(lst[-1])
SA2_splits["From (2016):"]=lst 
lst2=[]
for i in range(len(SA2_splits)):
    if not pd.isna(SA2_splits.loc[i,"To (Edition 3):"]):
        lst2.append(SA2_splits.loc[i,"To (Edition 3):"])
    else:
        lst2.append(lst[-1])
SA2_splits["To (Edition 3):"]=lst2 

SA2_splits["SA2_9DIG16"] = [re.findall(r'\d+', item)[0] if not pd.isna(item) else np.nan for item in SA2_splits["From (2016):"]]
SA2_splits["SA2_NAME16"] = [re.findall(r'\D+', item)[0] if not pd.isna(item) else np.nan for item in SA2_splits["From (2016):"]]

SA2_splits["SA2_9DIG21"] = [re.findall(r'\d+',item)[0] if not pd.isna(item) else np.nan for item in SA2_splits["To (Edition 3):"]]
SA2_splits["SA2_NAME21"] = [re.findall(r'\D+',item)[0] if not pd.isna(item) else np.nan for item in SA2_splits["To (Edition 3):"]]
SA2_splits.head(5)

Unnamed: 0,From (2016):,To (Edition 3):,SA2_9DIG16,SA2_NAME16,SA2_9DIG21,SA2_NAME21
0,101021011 Queanbeyan Region,101021610 Googong,101021011,Queanbeyan Region,101021610,Googong
1,101021011 Queanbeyan Region,101021611 Queanbeyan Surrounds,101021011,Queanbeyan Region,101021611,Queanbeyan Surrounds
2,103011058 Bathurst,103011612 Bathurst - South,103011058,Bathurst,103011612,Bathurst - South
3,103011058 Bathurst,103011613 Bathurst - West,103011058,Bathurst,103011613,Bathurst - West
4,106021117 Maitland - West,106021615 Rutherford (North) - Aberglasslyn,106021117,Maitland - West,106021615,Rutherford (North) - Aberglasslyn


## Transformations

Add 5-digit SA2 code

In [80]:
# add 5-digit SA2 code
population_data["SA2_5DIG21"]=[i[0]+i[-4:] for i in population_data["SA2_CODE_2021"]]

Compute Percentages of the Total, the Male and the Female Population per SA2

In [81]:
# Compute Percentages

# compute percentages ASSUMING THAT THE NAN VALUES ARE EQUAL TO 0, BECAUSE 2021 POPULATION IS EQUAL TO 25.69mil WHICH IS ALMOST EQUAL TO 25.688.079
tot_people21 = population_data["ERP_P_202021"].sum()
population_data["ERP_P_202021 (%)"] = [(population_data.loc[i,"ERP_P_202021"]/tot_people21)*100 if not pd.isna(population_data.loc[i,"ERP_P_202021"]) else np.nan for i in range(len(population_data))]
# >>> population_data["ERP_P_202021 (%)"].sum()
# >>> 100.0

# compute percentages for male population
totM_people21 = population_data["ERP_M_202021"].sum()
population_data["ERP_M_202021 (%)"] = [(population_data.loc[i,"ERP_M_202021"]/totM_people21)*100 if not pd.isna(population_data.loc[i,"ERP_M_202021"]) else np.nan for i in range(len(population_data))]
# compute percentages for female population
totF_people21 = population_data["ERP_F_202021"].sum()
population_data["ERP_F_202021 (%)"] = [(population_data.loc[i,"ERP_F_202021"]/totF_people21)*100 if not pd.isna(population_data.loc[i,"ERP_F_202021"]) else np.nan for i in range(len(population_data))]

# add 5-digit SA2 code
population_data["SA2_5DIG21"]=[i[0]+i[-4:] for i in population_data["SA2_CODE_2021"]]

# create mapping from 2021 to 2016 SA2 code
dct21to16code = {SA2_redesign.loc[i,"SA2_9DIG21"]:SA2_redesign.loc[i,"SA2_9DIG16"] for i in range(len(SA2_redesign))}|{SA2_splits.loc[i,"SA2_9DIG21"]:SA2_splits.loc[i,"SA2_9DIG16"] for i in range(len(SA2_splits))}
# append mapping to population data
population_data["SA2_CODE_2016"] = [dct21to16code[population_data.loc[i,"SA2_CODE_2021"]] if population_data.loc[i,"SA2_CODE_2021"] in dct21to16code.keys() else population_data.loc[i,"SA2_CODE_2021"] for i in range(len(population_data))]
# add 5-digit SA2 code
population_data["SA2_5DIG16"]=[i[0]+i[-4:] for i in population_data["SA2_CODE_2016"]]

# restructure data
population_data = population_data[list(population_data.columns[:2])+["SA2_5DIG21","SA2_CODE_2016","SA2_5DIG16"]+list(population_data.columns[2:6])+["ERP_P_202021 (%)",'ERP_212021', 'ERP_M_202021', 'ERP_M_202021 (%)', 'ERP_F_202021','ERP_F_202021 (%)']+list(population_data.columns[9:])]

population_data.head()

Unnamed: 0,OBJECTID,SA2_CODE_2021,SA2_5DIG21,SA2_CODE_2016,SA2_5DIG16,SA2_NAME_2021,AREA_ALBERS_SQKM,ASGS_LOCI_URI_2021,ERP_P_202021,ERP_P_202021 (%),...,ADFS_42021,ADFS_52021,SHAPE_Length,SHAPE_Area,SA2_5DIG21.1,ERP_P_202021 (%).1,ERP_M_202021 (%),ERP_F_202021 (%),SA2_CODE_2016.1,SA2_5DIG16.1
0,1,101021007,11007,101021007,11007,Braidwood,3418.3525,https://linked.data.gov.au/dataset/asgsed3/SA2...,4330.0,0.016856,...,0.5,5.0,3.913695,0.339397,11007,0.016856,0.017632,0.016092,101021007,11007
1,2,101021008,11008,101021008,11008,Karabar,6.9825,https://linked.data.gov.au/dataset/asgsed3/SA2...,8546.0,0.033268,...,1.6,3.7,0.138642,0.000693,11008,0.033268,0.033915,0.032631,101021008,11008
2,3,101021009,11009,101021009,11009,Queanbeyan,4.762,https://linked.data.gov.au/dataset/asgsed3/SA2...,11370.0,0.044262,...,1.6,3.7,0.10715,0.000472,11009,0.044262,0.045398,0.043143,101021009,11009
3,4,101021010,11010,101021010,11010,Queanbeyan - East,13.0032,https://linked.data.gov.au/dataset/asgsed3/SA2...,5093.0,0.019826,...,3.5,4.6,0.189549,0.00129,11010,0.019826,0.02095,0.018719,101021010,11010
4,5,101021012,11012,101021012,11012,Queanbeyan West - Jerrabomberra,13.6748,https://linked.data.gov.au/dataset/asgsed3/SA2...,12743.0,0.049607,...,5.8,5.9,0.193368,0.001356,11012,0.049607,0.050096,0.049125,101021012,11012


#### ---

In [82]:
print(r"303/2473 SA2 areas (12.25% of all 2021 SA2s) in the 2021 version are different from the ones in the 2016 version")

303/2473 SA2 areas (12.25% of all 2021 SA2s) in the 2021 version are different from the ones in the 2016 version


In [83]:
for code in [str(i) for i in changedSA_CodeLst]:
    if code not in list(population_data["SA2_CODE_2021"]):
        print(code)
print("All codes that are different are also in the population data")

All codes that are different are also in the population data
