# Join national maps data
 - population
 - income

In [1]:
from pathlib import Path


DATA_DIR = (Path().resolve().parent / "data" )
income_path = DATA_DIR / "01_interim" / "nationalmaps" / "Census_2021_G17_Total_personal_income_weekly_by_age_by_sex_Main_Statistical_Areas_Level_2_and_up_SA2_.shp"
if not income_path.exists():
    print("File not found: ", income_path)
    
population_path = DATA_DIR / "00_raw" / "nationalmaps" / "32350_ERP_Age_Sex_SA2_2022_gpkg" / "SA2 ERP by Age and Sex GeoPackage 2022.gpkg"
if not population_path.exists():
    print("File not found: ", population_path)
    
joined_data_path = DATA_DIR / "01_interim" / "nationalmaps" / "joined_nationalmaps.shp"


# Load both frames

In [28]:
import geopandas as gpd
df_income = gpd.read_file(income_path)

df_pop = gpd.read_file(population_path)

In [43]:
df_income.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2472 entries, 0 to 2471
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   DATAFLOW    2472 non-null   object  
 1   SEXP        2472 non-null   object  
 2   INCP        2472 non-null   object  
 3   AGEP        2472 non-null   object  
 4   REGION      2472 non-null   object  
 5   REGION_TYP  2472 non-null   object  
 6   STATE       2472 non-null   object  
 7   TIME_PERIO  2472 non-null   object  
 8   OBS_VALUE   2472 non-null   object  
 9   SA2_CODE21  2472 non-null   int64   
 10  SA2_NAME21  2472 non-null   object  
 11  STE_NAME21  2472 non-null   object  
 12  gender      2472 non-null   object  
 13  geometry    2454 non-null   geometry
dtypes: geometry(1), int64(1), object(12)
memory usage: 270.5+ KB


In [44]:
df_pop.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2454 entries, 0 to 2453
Data columns (total 70 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   STE_CODE21          2454 non-null   int64   
 1   STE_NAME21          2454 non-null   object  
 2   GCC_CODE21          2454 non-null   object  
 3   GCC_NAME21          2454 non-null   object  
 4   SA4_CODE21          2454 non-null   int64   
 5   SA4_NAME21          2454 non-null   object  
 6   SA3_CODE21          2454 non-null   int64   
 7   SA3_NAME21          2454 non-null   object  
 8   SA2_CODE21          2454 non-null   int64   
 9   SA2_NAME21          2454 non-null   object  
 10  Males               2454 non-null   int64   
 11  Females             2454 non-null   int64   
 12  Persons             2454 non-null   int64   
 13  Sex_ratio           2408 non-null   float64 
 14  Median_age_persons  2417 non-null   float64 
 15  M0_4                2454 non-n

In [45]:
# area Codes not in both files
set(df_pop.SA2_CODE21) - set(df_income.SA2_CODE21)

set()

In [16]:
df_pop.head(1)

Unnamed: 0,STE_CODE21,STE_NAME21,GCC_CODE21,GCC_NAME21,SA4_CODE21,SA4_NAME21,SA3_CODE21,SA3_NAME21,SA2_CODE21,SA2_NAME21,...,P45_49,P50_54,P55_59,P60_64,P65_69,P70_74,P75_79,P80_84,P85_and_over,geometry
0,1,New South Wales,1RNSW,Rest of NSW,101,Capital Region,10102,Queanbeyan,101021007,Braidwood,...,280,318,355,447,343,270,260,128,87,"POLYGON ((149.58423 -35.44427, 149.58444 -35.4..."


In [41]:
# cast to same type as df_pop
df_income.SA2_CODE21 = df_income.SA2_CODE21.astype('int64')

In [108]:
print(f"unique statistical area 2 codes: income = {len(df_income.SA2_CODE21.unique())}, pop = {len(df_pop.SA2_CODE21.unique())}")


df = df_pop.merge(df_income.loc[:,['SA2_CODE21','OBS_VALUE']], left_on='SA2_CODE21', right_on='SA2_CODE21')
display('Shape:',df.shape)

unique statistical area 2 codes: income = 2472, pop = 2454


In [109]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2454 entries, 0 to 2453
Data columns (total 71 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   STE_CODE21          2454 non-null   int64   
 1   STE_NAME21          2454 non-null   object  
 2   GCC_CODE21          2454 non-null   object  
 3   GCC_NAME21          2454 non-null   object  
 4   SA4_CODE21          2454 non-null   int64   
 5   SA4_NAME21          2454 non-null   object  
 6   SA3_CODE21          2454 non-null   int64   
 7   SA3_NAME21          2454 non-null   object  
 8   SA2_CODE21          2454 non-null   int64   
 9   SA2_NAME21          2454 non-null   object  
 10  Males               2454 non-null   int64   
 11  Females             2454 non-null   int64   
 12  Persons             2454 non-null   int64   
 13  Sex_ratio           2408 non-null   float64 
 14  Median_age_persons  2417 non-null   float64 
 15  M0_4                2454 non-n

## Missing data

In [110]:
na_cols = df.columns[df.isna().sum() > 0]
display("Missing rows;", df[na_cols].isna().sum())
display('Co-occuring missing rows;', df[na_cols].isna().all().sum())
df.loc[df[na_cols[0]].isna(),:].head()

'Missing rows;'

Sex_ratio             46
Median_age_persons    37
dtype: int64

'Co-occuring missing rows;'

0

Unnamed: 0,STE_CODE21,STE_NAME21,GCC_CODE21,GCC_NAME21,SA4_CODE21,SA4_NAME21,SA3_CODE21,SA3_NAME21,SA2_CODE21,SA2_NAME21,...,P50_54,P55_59,P60_64,P65_69,P70_74,P75_79,P80_84,P85_and_over,geometry,OBS_VALUE
76,1,New South Wales,1RNSW,Rest of NSW,103,Central West,10303,Lithgow - Mudgee,103031075,Wollangambe - Wollemi,...,0,0,0,0,0,0,0,0,"POLYGON ((150.34697 -33.14450, 150.34693 -33.1...",0
135,1,New South Wales,1RNSW,Rest of NSW,107,Illawarra,10701,Dapto - Port Kembla,107011133,Port Kembla Industrial,...,0,2,1,1,0,0,0,0,"MULTIPOLYGON (((150.94708 -34.46453, 150.94712...",5
373,1,New South Wales,1GSYD,Greater Sydney,118,Sydney - Eastern Suburbs,11801,Eastern Suburbs - North,118011342,Centennial Park,...,0,0,0,0,0,0,0,0,"POLYGON ((151.23775 -33.89066, 151.23873 -33.8...",0
727,2,Victoria,2RVIC,Rest of Vic.,205,Latrobe - Gippsland,20502,Gippsland - East,205021080,Alps - East,...,0,0,0,0,0,0,0,0,"POLYGON ((147.96384 -36.94797, 147.96383 -36.9...",0
730,2,Victoria,2RVIC,Rest of Vic.,205,Latrobe - Gippsland,20502,Gippsland - East,205021083,Lake King,...,0,0,0,0,0,0,0,0,"POLYGON ((147.69718 -37.84794, 147.69752 -37.8...",0


## Join with .DAT data

## Save to file

In [120]:
import json

df = gpd.GeoDataFrame(df)
df.to_file(joined_data_path, driver='ESRI Shapefile', index=False)

# Column names are truncated at 10 characters, save mapping to their original names
with open(joined_data_path.parent / 'columnnames.json', 'w') as f: 
    json.dump({c: c[:10] for c in df.columns}, f)

  df.to_file(joined_data_path, driver='ESRI Shapefile', index=False)
