# Preprocessing Income and Population Data
- This notebook preprocesses the income and population data.
- The preprocessed datasets are exported into the `../data/raw` directory (paths are relative to this notebook's location).

Import relevant libraries.

In [2]:
# Add the parent directory to the import path so that the local `scripts`
# package can be imported below; this line must run before that import.
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
# Record the wall-clock start; `start_time` is passed to get_runtime() in the
# final cell of the notebook.
import time 
start_time = time.time()

# Third-party libraries used by the cells below.
import pandas as pd
import numpy as np
import geopandas as gpd

## Income

### Load 2012-2016 and 2017-2021 Income datasets

In [5]:
# Both income workbooks are read from the sheet named "Table 1.4", using the
# row at zero-based index 6 as the column header.
income_sheet = "Table 1.4"
income_header_row = 6

income_1_df_unclean = pd.read_excel(
    "../data/landing/income/income-2012-2016.xls",
    sheet_name=income_sheet,
    header=income_header_row,
)
income_2_df_unclean = pd.read_excel(
    "../data/landing/income/income-2017-2021.xlsx",
    sheet_name=income_sheet,
    header=income_header_row,
)

### Clean 2012-2016 Income

In [6]:
# Source column -> output column. Each fiscal-year column is relabelled with
# the calendar year in which it ends (e.g. '2011-12.3' -> '2012').
# NOTE(review): the '.3' suffix looks like pandas' de-duplication of a repeated
# header, i.e. the fourth block of year columns in the sheet; presumably that
# block is the median income (it is exported as `median_income` later) —
# confirm against the workbook.
income_1_value_cols = {
    '2011-12.3': '2012',
    '2012-13.3': '2013',
    '2013-14.3': '2014',
    '2014-15.3': '2015',
    '2015-16.3': '2016',
}

# Select the needed columns first and copy only those, instead of copying the
# whole workbook and then discarding most of it.
income_1_df = income_1_df_unclean[
    ['SA2', 'SA2 NAME', *income_1_value_cols]
].copy()

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_1_df['SA2'] = income_1_df['SA2'].astype(str)
income_1_df = income_1_df.loc[
    (income_1_df['SA2'].str.len() == 9) &
    (income_1_df['SA2'].str.startswith('2'))
].copy()

# Convert the value columns to numeric explicitly. 'np' markers (and any other
# non-numeric entry) become NaN, which avoids the deprecated silent
# downcasting that DataFrame.replace('np', np.nan) relied on.
income_1_source_cols = list(income_1_value_cols)
income_1_df[income_1_source_cols] = income_1_df[income_1_source_cols].apply(
    pd.to_numeric, errors='coerce'
)

# Drop rows with any missing value (including the former 'np' entries)
income_1_df = income_1_df.dropna()

# Rename by mapping rather than by position so that a change in column order
# cannot silently mislabel a year.
income_1_df = income_1_df.rename(
    columns={'SA2': 'sa2_code', 'SA2 NAME': 'sa2_name', **income_1_value_cols}
)
income_1_df

  income_1_df = income_1_df.replace('np', np.nan)


Unnamed: 0,sa2_code,sa2_name,2012,2013,2014,2015,2016
579,201011001,Alfredton,43838.0,44866.0,46535.0,48532.0,49385.0
580,201011002,Ballarat,42890.0,45719.0,47061.0,47894.0,49564.0
581,201011003,Ballarat - North,40757.0,42518.0,43360.0,44494.0,45816.0
582,201011004,Ballarat - South,38360.0,39436.0,40106.0,40846.0,41544.0
583,201011005,Buninyong,43428.0,44474.0,46283.0,46972.0,47511.0
...,...,...,...,...,...,...,...
1036,217031476,Otway,31038.0,28549.0,30555.0,31322.0,33020.0
1037,217041477,Moyne - East,36423.0,32888.0,38507.0,40775.0,40053.0
1038,217041478,Moyne - West,37042.0,36057.0,38261.0,39961.0,41751.0
1039,217041479,Warrnambool - North,39265.0,40642.0,41775.0,42635.0,43536.0


### Clean 2017-2021 Income

In [7]:
# Source column -> output column. Each fiscal-year column is relabelled with
# the calendar year in which it ends (e.g. '2016-17.3' -> '2017').
# NOTE(review): the '.3' suffix looks like pandas' de-duplication of a repeated
# header, i.e. the fourth block of year columns in the sheet; presumably that
# block is the median income (it is exported as `median_income` later) —
# confirm against the workbook.
income_2_value_cols = {
    '2016-17.3': '2017',
    '2017-18.3': '2018',
    '2018-19.3': '2019',
    '2019-20.3': '2020',
    '2020-21.3': '2021',
}

# Select the needed columns first and copy only those, instead of copying the
# whole workbook and then discarding most of it.
income_2_df = income_2_df_unclean[
    ['SA2', 'SA2 NAME', *income_2_value_cols]
].copy()

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_2_df['SA2'] = income_2_df['SA2'].astype(str)
income_2_df = income_2_df.loc[
    (income_2_df['SA2'].str.len() == 9) &
    (income_2_df['SA2'].str.startswith('2'))
].copy()

# Convert the value columns to numeric explicitly. 'np' markers (and any other
# non-numeric entry) become NaN, which avoids the deprecated silent
# downcasting that DataFrame.replace('np', np.nan) relies on.
income_2_source_cols = list(income_2_value_cols)
income_2_df[income_2_source_cols] = income_2_df[income_2_source_cols].apply(
    pd.to_numeric, errors='coerce'
)

# Drop rows with any missing value (including the former 'np' entries)
income_2_df = income_2_df.dropna()

# Rename by mapping rather than by position so that a change in column order
# cannot silently mislabel a year.
income_2_df = income_2_df.rename(
    columns={'SA2': 'sa2_code', 'SA2 NAME': 'sa2_name', **income_2_value_cols}
)

# Order rows alphabetically by SA2 name; rebinding instead of inplace=True
# keeps the step free of hidden mutation.
income_2_df = income_2_df.sort_values(by='sa2_name')
income_2_df

Unnamed: 0,sa2_code,sa2_name,2017,2018,2019,2020,2021
801,206071139,Abbotsford,58219,61476,64090,67457,71394
905,210011226,Airport West,53423,55912,58506,60083,62051
788,206051128,Albert Park,65352,66627,67518,68933,73107
703,204011054,Alexandra,36773,37890,39452,40199,42833
645,201011001,Alfredton,50596,52448,53932,55204,58036
...,...,...,...,...,...,...,...
755,205051104,Yarram,35097,38003,37232,38845,41677
1064,213031352,Yarraville,63932,66233,69410,71512,76253
1146,216021414,Yarrawonga,38354,40572,41980,43654,46794
1125,215011394,Yarriambiack,40505,46030,44238,50474,48604


### Add SA2 geometries

Use 2016 SA2 Digital Boundaries shapefile for `income_1_df`.

In [8]:
# Load the 2016 SA2 boundaries and keep only the code and geometry columns,
# renaming the code column to match the join key of the income table.
sa2_2016 = gpd.read_file('../data/landing/sa2/sa2-16-shp/')
sa2_2016 = sa2_2016[['SA2_MAIN16', 'geometry']].rename(
    columns={'SA2_MAIN16': 'sa2_code'}
)

# Inner join on the SA2 code: income rows without a matching boundary are
# dropped. `validate` makes the merge fail loudly if the boundary file ever
# contains a repeated code, instead of silently duplicating income rows.
income_1_df = pd.merge(
    income_1_df,
    sa2_2016,
    on='sa2_code',
    validate='many_to_one'
)

# Reshape from one column per year to one row per (SA2, year).
income_1_df = income_1_df.melt(
    id_vars=['sa2_code', 'sa2_name', 'geometry'],
    value_vars=['2012', '2013', '2014', '2015', '2016'],
    var_name='year',
    value_name='median_income'
)
income_1_df['year'] = income_1_df['year'].astype(int)
income_1_df

Unnamed: 0,sa2_code,sa2_name,geometry,year,median_income
0,201011001,Alfredton,"POLYGON ((143.70477 -37.51935, 143.70482 -37.5...",2012,43838.0
1,201011002,Ballarat,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5...",2012,42890.0
2,201011003,Ballarat - North,"POLYGON ((143.85013 -37.54247, 143.85011 -37.5...",2012,40757.0
3,201011004,Ballarat - South,"POLYGON ((143.82821 -37.57559, 143.8284 -37.57...",2012,38360.0
4,201011005,Buninyong,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61...",2012,43428.0
...,...,...,...,...,...
2275,217031476,Otway,"MULTIPOLYGON (((143.40263 -38.78153, 143.40252...",2016,33020.0
2276,217041477,Moyne - East,"POLYGON ((142.41438 -38.09304, 142.41399 -38.0...",2016,40053.0
2277,217041478,Moyne - West,"MULTIPOLYGON (((142.00869 -38.41716, 142.00876...",2016,41751.0
2278,217041479,Warrnambool - North,"POLYGON ((142.43668 -38.35545, 142.43658 -38.3...",2016,43536.0


Use 2021 SA2 Digital Boundaries shapefile for `income_2_df`.

In [9]:
# Load the 2021 SA2 boundaries and keep only the code and geometry columns,
# renaming the code column to match the join key of the income table.
# `sa2_2021` is reused later for the population merge.
sa2_2021 = gpd.read_file('../data/landing/sa2/sa2-21-shp/')
sa2_2021 = sa2_2021[['SA2_CODE21', 'geometry']].rename(
    columns={'SA2_CODE21': 'sa2_code'}
)

# Inner join on the SA2 code: income rows without a matching boundary are
# dropped. `validate` makes the merge fail loudly if the boundary file ever
# contains a repeated code, instead of silently duplicating income rows.
income_2_df = pd.merge(
    income_2_df,
    sa2_2021,
    on='sa2_code',
    validate='many_to_one'
)

# Reshape from one column per year to one row per (SA2, year).
income_2_df = income_2_df.melt(
    id_vars=['sa2_code', 'sa2_name', 'geometry'],
    value_vars=['2017', '2018', '2019', '2020', '2021'],
    var_name='year',
    value_name='median_income'
)
income_2_df['year'] = income_2_df['year'].astype(int)
income_2_df

Unnamed: 0,sa2_code,sa2_name,geometry,year,median_income
0,206071139,Abbotsford,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8...",2017,58219
1,210011226,Airport West,"POLYGON ((144.86706 -37.72471, 144.86798 -37.7...",2017,53423
2,206051128,Albert Park,"POLYGON ((144.96767 -37.83737, 144.96789 -37.8...",2017,65352
3,204011054,Alexandra,"POLYGON ((145.59015 -37.22477, 145.58638 -37.2...",2017,36773
4,201011001,Alfredton,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",2017,50596
...,...,...,...,...,...
2590,205051104,Yarram,"MULTIPOLYGON (((146.6219 -38.75021, 146.62185 ...",2021,41677
2591,213031352,Yarraville,"POLYGON ((144.85915 -37.81764, 144.85984 -37.8...",2021,76253
2592,216021414,Yarrawonga,"POLYGON ((146.00051 -36.00877, 146.00128 -36.0...",2021,46794
2593,215011394,Yarriambiack,"POLYGON ((142.239 -35.99787, 142.23898 -35.997...",2021,48604


### Push Income datasets to `raw` layer

In [10]:
# Ensure the output directory exists, then write each income table to its
# CSV file without the DataFrame index.
create_dir('../data/raw/income')
income_exports = {
    '../data/raw/income/income-12-16.csv': income_1_df,
    '../data/raw/income/income-17-21.csv': income_2_df,
}
for income_csv_path, income_frame in income_exports.items():
    income_frame.to_csv(income_csv_path, index=False)

Created directory: ../data/raw/income



## Population

### Load Population Dataset

In [11]:
# Read sheet "Table 1" using two header rows (zero-based rows 5 and 6), which
# yields two-level column labels that the cleaning cell flattens.
population_df = pd.read_excel(
    "../data/landing/population/population.xlsx",
    sheet_name="Table 1",
    header=[5, 6],
)

### Clean

In [12]:
# Flatten the two-level header into one label per column: use the second-level
# label when the first-level label is an 'Unnamed...' placeholder, otherwise
# use the first-level label.
flat_labels = []
for label_pair in list(population_df.columns.values):
    if str(label_pair[0]).startswith('Unnamed'):
        flat_labels.append(label_pair[1])
    else:
        flat_labels.append(label_pair[0])
population_df.columns = flat_labels

# Remove the columns for the coarser geography levels; only SA2 is kept.
coarser_geography_cols = [
    "GCCSA code",
    "GCCSA name",
    "SA3 code",
    "SA3 name",
    "SA4 code",
    "SA4 name",
]
population_df = population_df.drop(columns=coarser_geography_cols)

# Discard rows with any missing value, then store the SA2 code as a string of
# digits (cast through int first).
population_df = population_df.dropna()
population_df["SA2 code"] = population_df["SA2 code"].astype(int).astype(str)
population_df = population_df.rename(
    columns={"SA2 code": "sa2_code", "SA2 name": "sa2_name"}
)

# Keep 9-character codes starting with '2' (the same filter the income cells
# use for Victoria SA2s).
has_nine_chars = population_df['sa2_code'].str.len() == 9
starts_with_two = population_df['sa2_code'].str.startswith('2')
population_df = population_df.loc[has_nine_chars & starts_with_two]

# Every column after the first two is treated as a year column and unpivoted
# into one row per (SA2, year).
year_cols = population_df.columns[2:]
population_df = pd.melt(
    population_df,
    id_vars=['sa2_code', 'sa2_name'],
    value_vars=year_cols,
    var_name='year',
    value_name='population',
)
population_df['year'] = population_df['year'].astype(int)
population_df

Unnamed: 0,sa2_code,sa2_name,year,population
0,201011001,Alfredton,2001,5756.0
1,201011002,Ballarat,2001,11497.0
2,201011005,Buninyong,2001,5320.0
3,201011006,Delacombe,2001,4154.0
4,201011007,Smythes Creek,2001,3317.0
...,...,...,...,...
12001,217031476,Otway,2023,3983.0
12002,217041477,Moyne - East,2023,7132.0
12003,217041478,Moyne - West,2023,10148.0
12004,217041479,Warrnambool - North,2023,22762.0


### Add SA2 Geometry

In [13]:
# Attach the 2021 SA2 geometry to every (SA2, year) population row. The join
# is inner, so rows whose code has no 2021 boundary are dropped. `validate`
# makes the merge fail loudly if `sa2_2021` ever contains a repeated code,
# instead of silently duplicating population rows.
population_df = pd.merge(
    population_df,
    sa2_2021,
    on='sa2_code',
    validate='many_to_one'
)
population_df

Unnamed: 0,sa2_code,sa2_name,year,population,geometry
0,201011001,Alfredton,2001,5756.0,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
1,201011002,Ballarat,2001,11497.0,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
2,201011005,Buninyong,2001,5320.0,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
3,201011006,Delacombe,2001,4154.0,"POLYGON ((143.7505 -37.59119, 143.75044 -37.59..."
4,201011007,Smythes Creek,2001,3317.0,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
...,...,...,...,...,...
12001,217031476,Otway,2023,3983.0,"MULTIPOLYGON (((143.40263 -38.78152, 143.40252..."
12002,217041477,Moyne - East,2023,7132.0,"POLYGON ((142.41438 -38.09303, 142.414 -38.072..."
12003,217041478,Moyne - West,2023,10148.0,"MULTIPOLYGON (((142.0087 -38.41715, 142.00876 ..."
12004,217041479,Warrnambool - North,2023,22762.0,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."


### Push Population Dataset to `raw` layer

In [14]:
# Ensure the output directory exists, then write the long-format population
# table (with geometry) to CSV without the DataFrame index.
create_dir('../data/raw/population')
population_df.to_csv(
    path_or_buf='../data/raw/population/population-01-23.csv',
    index=False,
)

Created directory: ../data/raw/population



Get notebook runtime.

In [15]:
# Report the time elapsed since `start_time` was recorded in the first cell.
runtime_message = get_runtime(start_time)
print(runtime_message)

Runtime: 29 minutes and 9 seconds
