In [263]:
import pandas as pd
import re

In [264]:
# Let notebook table displays show up to 100 rows before pandas truncates them.
pd.options.display.max_rows = 100

In [265]:
# The 2011 census (on 2016 boundaries) is loaded from three partial files:
# province (pr), local municipality (mn) and district municipality (dc).
# Per the original note: provinces and locals have integer codes, so they are
# added by name; districts are added by their Municipal Demarcation Board code,
# because their names exclude the code.
# skiprows=9 skips the first nine lines of each file (presumably export
# preamble - confirm against the raw files). The age column is read as text.
census2011prov, census2011localmun, census2011districtmun = (
    pd.read_csv(csv_name, skiprows=9, dtype={"Age in completed years": str})
    for csv_name in (
        "census-2011-2016-boundaries-pr-age-population.csv",
        "census-2011-2016-boundaries-mn-age-population.csv",
        "census-2011-2016-boundaries-dc-age-population.csv",
    )
)

# Stack the three levels in the order province, district, local.
census2011 = pd.concat(
    [census2011prov, census2011districtmun, census2011localmun]
)

In [266]:
# Reduce the census frame to three tidy columns and remove summary rows.
census2011 = (
    census2011
    # Drop the two columns that are not needed for the analysis.
    .drop(columns=["Summation Options", "Unnamed: 4"])
    # The remaining columns are renamed positionally.
    .set_axis(["Geography", "Age", "Count"], axis="columns")
    # Exclude the "Total" rows for both geography and age.
    .loc[lambda frame: (frame["Geography"] != "Total") & (frame["Age"] != "Total")]
    # Discard any row that still has a missing value.
    .dropna()
)

In [270]:
# The 2016 Community Survey is loaded from two files: one for provinces and one
# for district/local municipalities. skiprows=9 skips the first nine lines of
# each file (presumably export preamble - confirm against the raw files).
# The age column is read as text.
commsurv2016prov, commsurv2016mun = (
    pd.read_csv(csv_name, skiprows=9, dtype={"Age": str})
    for csv_name in (
        "community-survey-2016-pr-age-population.csv",
        "community-survey-2016-dc-mn-age-population.csv",
    )
)

# Provinces first, then municipalities.
commsurv2016 = pd.concat([commsurv2016prov, commsurv2016mun])

In [271]:
# Reduce the survey frame to the same three tidy columns as the census frame.
commsurv2016 = (
    commsurv2016
    # Drop the two columns that are not needed for the analysis.
    .drop(columns=["Summation Options", "Unnamed: 4"])
    # Keep only the 2016 geography column plus age and count, in this order.
    .loc[:, ["Geography hierarchy 2016", "Age", "Count"]]
    # Rename so the column names match the census frame.
    .set_axis(["Geography", "Age", "Count"], axis="columns")
    # Exclude the "Total" rows for both geography and age.
    .loc[lambda frame: (frame["Geography"] != "Total") & (frame["Age"] != "Total")]
    # Discard any row that still has a missing value.
    .dropna()
)

In [277]:
# Strip everything from the first ":" (and any spaces before it) onwards, so
# only the leading part of each geography label remains.
commsurv2016["Geography"] = commsurv2016["Geography"].map(
    lambda label: re.sub(" *:.+", "", label)
)

# Keep the first row for each (geography, age) pair.
commsurv2016 = commsurv2016.drop_duplicates(subset=["Geography", "Age"])

In [279]:
# Strip everything from the first ":" (and any spaces before it) onwards, so
# the census labels are normalised the same way as the survey labels.
census2011["Geography"] = census2011["Geography"].map(
    lambda label: re.sub(" *:.+", "", label)
)

# Keep the first row for each (geography, age) pair.
census2011 = census2011.drop_duplicates(subset=["Geography", "Age"])

In [280]:
# Join the two sources on geography and single-year age. The right join keeps
# every 2016 survey row; the 2011 count is missing where the census has no
# matching row. The suffixes tell the two "Count" columns apart.
pop2011_2016 = pd.merge(
    census2011,
    commsurv2016,
    how="right",
    on=["Geography", "Age"],
    suffixes=("_2011", "_2016"),
)

In [281]:
# Show the merged 2011/2016 table as the cell output for a visual check.
pop2011_2016

Unnamed: 0,Geography,Age,Count_2011,Count_2016
0,Western Cape,0,117650.51541,112560.644948
1,Eastern Cape,0,145067.14456,144153.659291
2,Northern Cape,0,24678.68558,22959.908748
3,Free State,0,58976.36112,51272.282969
4,KwaZulu-Natal,0,237444.27713,243054.420864
...,...,...,...,...
31117,LIM368,116,0.00000,0.000000
31118,LIM471,116,0.00000,0.000000
31119,LIM472,116,0.00000,0.000000
31120,LIM473,116,0.00000,0.000000


In [282]:
# Save the merged single-year-of-age table. This runs before the "Age group"
# column is added, so the file holds only geography, age and the two counts.
# The row index is written as an unnamed first column (the to_csv default).
pop2011_2016.to_csv(path_or_buf="pop2011_2016.csv", index=True)

In [283]:
# Left-closed age bands: each bin edge is the first age of the next band,
# so [0, 15) is labelled "0-14", and so on; the last band is open-ended.
bins = [0, 15, 25, 35, 45, 55, 65, float("inf")]
labels = ["0-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+"]

# Age was read as text, so convert to integers before binning.
age_years = pop2011_2016["Age"].astype(int)
pop2011_2016["Age group"] = pd.cut(age_years, bins, right=False, labels=labels)

In [284]:
# Show the merged table again, now including the derived "Age group" column.
pop2011_2016

Unnamed: 0,Geography,Age,Count_2011,Count_2016,Age group
0,Western Cape,0,117650.51541,112560.644948,0-14
1,Eastern Cape,0,145067.14456,144153.659291,0-14
2,Northern Cape,0,24678.68558,22959.908748,0-14
3,Free State,0,58976.36112,51272.282969,0-14
4,KwaZulu-Natal,0,237444.27713,243054.420864,0-14
...,...,...,...,...,...
31117,LIM368,116,0.00000,0.000000,65+
31118,LIM471,116,0.00000,0.000000,65+
31119,LIM472,116,0.00000,0.000000,65+
31120,LIM473,116,0.00000,0.000000,65+


In [286]:
# Total the 2011 and 2016 counts per geography and age band.
# "Age group" is categorical, so `observed` is passed explicitly:
# observed=False keeps every (geography, age band) combination in the result,
# which is what the implicit default did. Newer pandas warns about relying on
# that default and pandas 3 changes it, so pinning it keeps the output shape
# stable across versions.
# numeric_only=True keeps the text "Age" column out of the sum.
pop2011_2016_age_grouped = (
    pop2011_2016
    .groupby(["Geography", "Age group"], observed=False)
    .sum(numeric_only=True)
)

In [287]:
# Show the per-geography, per-age-band totals as the cell output.
pop2011_2016_age_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Count_2011,Count_2016
Geography,Age group,Unnamed: 2_level_1,Unnamed: 3_level_1
BUF,0-14,2.072515e+05,2.635374e+05
BUF,15-24,1.494462e+05,1.403579e+05
BUF,25-34,1.319111e+05,1.478756e+05
BUF,35-44,1.035062e+05,1.068307e+05
BUF,45-54,8.618895e+04,8.312092e+04
...,...,...,...
Western Cape,25-34,1.074148e+06,1.068125e+06
Western Cape,35-44,8.316753e+05,9.399271e+05
Western Cape,45-54,6.405509e+05,7.101152e+05
Western Cape,55-64,4.066556e+05,4.748624e+05


In [288]:
# Save the age-band totals. The (Geography, Age group) index is written as the
# first two columns of the file (the to_csv default).
pop2011_2016_age_grouped.to_csv(
    path_or_buf="pop2011_2016_age_grouped.csv", index=True
)