In [2]:
import pandas as pd
import re

In [3]:
# Let pandas render up to 100 rows before it truncates a displayed table.
pd.options.display.max_rows = 100

In [4]:
# The 2011 census extract (on 2016 boundaries) is read from three partial files, one
# per geography level. Provinces and local municipalities are exported by name because
# their codes are plain integers; districts are exported by code (the Municipal
# Demarcation Board code) because their names do not include it.
# The first nine lines of each file are skipped (presumably an export preamble), and the
# age column is kept as text.
census2011prov, census2011localmun, census2011districtmun = [
    pd.read_csv(path, skiprows=9, dtype={"Age in completed years": str})
    for path in (
        "census-2011-2016-boundaries-pr-age-population.csv",
        "census-2011-2016-boundaries-mn-age-population.csv",
        "census-2011-2016-boundaries-dc-age-population.csv",
    )
]
# Stack the three levels: provinces first, then districts, then local municipalities.
census2011 = pd.concat([census2011prov, census2011districtmun, census2011localmun])

In [5]:
# Tidy the combined census table in one pass:
#   1. discard the two unused export columns,
#   2. give the three remaining columns short names (assigned by position),
#   3. remove the "Total" summary rows for both geography and age,
#   4. drop any rows that still contain missing values.
census2011 = (
    census2011
    .drop(columns=["Summation Options", "Unnamed: 4"])
    .set_axis(["Geography", "Age", "Count"], axis=1)
    .loc[lambda frame: (frame["Geography"] != "Total") & (frame["Age"] != "Total")]
    .dropna()
)

In [6]:
# The 2016 community survey comes as two files: one for provinces and one for
# district/local municipalities. The first nine lines of each file are skipped
# (presumably an export preamble) and the age column is kept as text.
commsurv2016prov, commsurv2016mun = [
    pd.read_csv(path, skiprows=9, dtype={"Age": str})
    for path in (
        "community-survey-2016-pr-age-population.csv",
        "community-survey-2016-dc-mn-age-population.csv",
    )
]
commsurv2016 = pd.concat([commsurv2016prov, commsurv2016mun])

In [7]:
# Tidy the combined survey table: discard the unused export columns, keep only the
# geography / age / count columns (with the geography column renamed to match the
# census table), remove the "Total" summary rows, and drop rows with missing values.
commsurv2016 = (
    commsurv2016
    .drop(columns=["Summation Options", "Unnamed: 4"])
    .loc[:, ["Geography hierarchy 2016", "Age", "Count"]]
    .rename(columns={"Geography hierarchy 2016": "Geography"})
    .loc[lambda frame: (frame["Geography"] != "Total") & (frame["Age"] != "Total")]
    .dropna()
)

In [8]:
# Cut each geography label at its first colon (including any spaces directly before it),
# so only the part in front of the colon remains. Labels without a colon are unchanged.
commsurv2016["Geography"] = commsurv2016["Geography"].map(
    lambda label: re.sub(" *:.+", "", label)
)
# After trimming, keep only the first row for every (Geography, Age) pair.
commsurv2016 = commsurv2016.drop_duplicates(subset=["Geography", "Age"])

In [9]:
# Cut each geography label at its first colon (including any spaces directly before it),
# so only the part in front of the colon remains. Labels without a colon are unchanged.
census2011["Geography"] = census2011["Geography"].map(
    lambda label: re.sub(" *:.+", "", label)
)
# After trimming, keep only the first row for every (Geography, Age) pair.
census2011 = census2011.drop_duplicates(subset=["Geography", "Age"])

In [84]:
# Line up the 2011 and 2016 counts per (Geography, Age). The right join keeps every
# 2016 survey row; a pair with no 2011 counterpart gets a missing Count_2011.
pop2011_2016 = census2011.merge(commsurv2016, how="right", on=["Geography", "Age"], suffixes=("_2011", "_2016"))

# Province names -> province codes.
replacements = [
    ("Eastern Cape", "EC"),
    ("Northern Cape", "NC"),
    ("Western Cape", "WC"),
    ("Gauteng", "GT"),
    ("Limpopo", "LIM"),
    ("Mpumalanga", "MP"),
    ("KwaZulu-Natal", "KZN"),
    ("Free State", "FS"),
    ("North West", "NW"),
]
# Assign the replaced column back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form is a chained update that pandas warns about and
# that leaves the frame untouched once Copy-on-Write is enabled.
pop2011_2016["Geography"] = pop2011_2016["Geography"].replace(dict(replacements))

# Vectorized column subtraction (same values as the previous row-by-row apply).
pop2011_2016["absolute_change"] = pop2011_2016["Count_2016"] - pop2011_2016["Count_2011"]

In [85]:
# Preview the merged 2011/2016 table, including the new absolute_change column.
pop2011_2016

Unnamed: 0,Geography,Age,Count_2011,Count_2016,absolute_change
0,WC,0,117650.51541,112560.644948,-5089.870462
1,EC,0,145067.14456,144153.659291,-913.485269
2,NC,0,24678.68558,22959.908748,-1718.776832
3,FS,0,58976.36112,51272.282969,-7704.078151
4,KZN,0,237444.27713,243054.420864,5610.143734
...,...,...,...,...,...
31117,LIM368,116,0.00000,0.000000,0.000000
31118,LIM471,116,0.00000,0.000000,0.000000
31119,LIM472,116,0.00000,0.000000,0.000000
31120,LIM473,116,0.00000,0.000000,0.000000


In [86]:
# Age bands. Each band is closed on the left and open on the right (right=False), so
# age 15 falls in "15-24" and everything from 65 upwards falls in "65+".
labels = ["0-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+"]
bins = [0, 15, 25, 35, 45, 55, 65, float("inf")]
# Age was read as text, so it is converted to integers before banding.
pop2011_2016["Age group"] = pd.cut(
    pop2011_2016["Age"].astype(int),
    bins,
    right=False,
    labels=labels,
)

In [87]:
# Preview the table again, now with the "Age group" column attached.
pop2011_2016

Unnamed: 0,Geography,Age,Count_2011,Count_2016,absolute_change,Age group
0,WC,0,117650.51541,112560.644948,-5089.870462,0-14
1,EC,0,145067.14456,144153.659291,-913.485269,0-14
2,NC,0,24678.68558,22959.908748,-1718.776832,0-14
3,FS,0,58976.36112,51272.282969,-7704.078151,0-14
4,KZN,0,237444.27713,243054.420864,5610.143734,0-14
...,...,...,...,...,...,...
31117,LIM368,116,0.00000,0.000000,0.000000,65+
31118,LIM471,116,0.00000,0.000000,0.000000,65+
31119,LIM472,116,0.00000,0.000000,0.000000,65+
31120,LIM473,116,0.00000,0.000000,0.000000,65+


In [96]:
# Sum the numeric columns per geography and age band.
# "Age group" is categorical (it comes from pd.cut), so observed=False is passed
# explicitly: it keeps every age band for every geography, as before, and avoids the
# pandas FutureWarning about the default for categorical groupers changing.
pop2011_2016_age_grouped = pop2011_2016.groupby(["Geography", "Age group"], observed=False).sum(numeric_only=True)

In [97]:
# Show the summed counts per geography and age band.
pop2011_2016_age_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Count_2011,Count_2016,absolute_change
Geography,Age group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BUF,0-14,207251.47372,263537.435480,56285.961760
BUF,15-24,149446.23804,140357.932284,-9088.305756
BUF,25-34,131911.14793,147875.633966,15964.486036
BUF,35-44,103506.20062,106830.650515,3324.449895
BUF,45-54,86188.95151,83120.916322,-3068.035188
...,...,...,...,...
WC053,25-34,6955.18850,8521.849012,1566.660512
WC053,35-44,6561.13090,6239.627996,-321.502904
WC053,45-54,5511.05895,5251.883376,-259.175574
WC053,55-64,3701.53360,3380.797352,-320.736248


In [105]:
# Collapse all ages: one row per geography holding the summed numeric columns.
pop2011_2016_geo_grouped = (
    pop2011_2016
    .groupby(by=["Geography"])
    .sum(numeric_only=True)
)

def calculate_percentage(row):
    """Return the 2011-to-2016 population change relative to the 2011 count.

    The result is a fraction (0.25 means +25%), not a value scaled to 100.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``Count_2011``, ``Count_2016`` and ``absolute_change``.

    Returns
    -------
    float, int or None
        ``absolute_change / Count_2011`` when there is a 2011 baseline;
        ``0`` when both counts are zero (nothing changed);
        ``None`` when the 2011 count is zero but the 2016 count is not, because
        relative growth from zero is undefined.
    """
    baseline = row["Count_2011"]
    if baseline == 0:
        # The zero baseline is checked first so that a real drop to zero in 2016
        # (non-zero 2011 count) is reported as -1.0 rather than as "no change".
        return 0 if row["Count_2016"] == 0 else None
    return row["absolute_change"] / baseline
        
# Evaluate calculate_percentage once per geography row and store the result.
pop2011_2016_geo_grouped["percentage_change"] = pop2011_2016_geo_grouped.apply(
    calculate_percentage,
    axis="columns",
)

In [106]:
# Show the per-geography totals together with the percentage_change column.
pop2011_2016_geo_grouped

Unnamed: 0_level_0,Count_2011,Count_2016,absolute_change,percentage_change
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BUF,7.810218e+05,8.349971e+05,53975.299311,0.069109
CPT,3.740012e+06,4.005016e+06,265003.306220,0.070856
DC1,3.917633e+05,4.364029e+05,44639.641964,0.113945
DC10,4.505831e+05,4.799229e+05,29339.720375,0.065115
DC12,8.557929e+05,8.807903e+05,24997.472888,0.029210
...,...,...,...,...
WC047,4.916187e+04,5.915718e+04,9995.317214,0.203314
WC048,6.865666e+04,7.383507e+04,5178.415085,0.075425
WC051,8.289205e+03,8.895396e+03,606.190277,0.073130
WC052,1.313575e+04,1.427186e+04,1136.113267,0.086490


In [107]:
# Export the 2011 totals per geography as a two-column sheet (Count, category).
pop2011_geo_grouped = (
    pop2011_2016_geo_grouped[["Count_2011"]]
    .rename(columns={"Count_2011": "Count"})
    .assign(category="Total population")
)
pop2011_geo_grouped.to_excel("pop2011_geo_grouped.xlsx")

In [108]:
# Export the 2016 totals per geography as a two-column sheet (Count, category).
pop2016_geo_grouped = (
    pop2011_2016_geo_grouped[["Count_2016"]]
    .rename(columns={"Count_2016": "Count"})
    .assign(category="Total population")
)
pop2016_geo_grouped.to_excel("pop2016_geo_grouped.xlsx")

In [109]:
# Export the absolute 2011-to-2016 change per geography as a two-column sheet
# (Count, category).
pop2011_2016_abs_geo_grouped = (
    pop2011_2016_geo_grouped[["absolute_change"]]
    .rename(columns={"absolute_change": "Count"})
    .assign(category="Population change (absolute)")
)
pop2011_2016_abs_geo_grouped.to_excel("pop2011_2016_abs_geo_grouped.xlsx")

In [110]:
# Export the relative 2011-to-2016 change per geography as a two-column sheet
# (Count, category). The values are the fractions stored in percentage_change.
pop2011_2016_pct_geo_grouped = (
    pop2011_2016_geo_grouped[["percentage_change"]]
    .rename(columns={"percentage_change": "Count"})
    .assign(category="Population change (%)")
)
pop2011_2016_pct_geo_grouped.to_excel("pop2011_2016_pct_geo_grouped.xlsx")
# Display the exported table.
pop2011_2016_pct_geo_grouped

Unnamed: 0_level_0,Count,category
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1
BUF,0.069109,Population change (%)
CPT,0.070856,Population change (%)
DC1,0.113945,Population change (%)
DC10,0.065115,Population change (%)
DC12,0.029210,Population change (%)
...,...,...
WC047,0.203314,Population change (%)
WC048,0.075425,Population change (%)
WC051,0.073130,Population change (%)
WC052,0.086490,Population change (%)


In [95]:
# List the geography labels present in the absolute-change table.
# NOTE(review): the recorded output still contains 'Western Cape' and this cell's
# execution count is out of order, so it appears to have run against a frame in which
# the province names had not (all) been replaced by codes. Confirm with a
# Restart & Run All.
pop2011_2016_abs_geo_grouped.index



Index(['BUF', 'CPT', 'DC1', 'DC10', 'DC12', 'DC13', 'DC14', 'DC15', 'DC16',
       'DC18',
       ...
       'WC042', 'WC043', 'WC044', 'WC045', 'WC047', 'WC048', 'WC051', 'WC052',
       'WC053', 'Western Cape'],
      dtype='object', name='Geography', length=266)