# Preprocessing External Data
- This notebook preprocesses the population, income, SA2, PTV, and school data.
- The preprocessed datasets are exported into the `./data/raw` directory.

Import relevant libraries.

In [10]:
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
import time 
start_time = time.time()

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import folium
from geopy import distance
import openrouteservice as ors


## Income

### Load 2012-2016 and 2017-2021 Income datasets

In [11]:
income_1_df_unclean = pd.read_excel("../data/landing/income/income-2012-2016.xls",
                            "Table 1.4",
                            header=6)
income_2_df_unclean = pd.read_excel("../data/landing/income/income-2017-2021.xlsx",
                            "Table 1.4",
                            header=6)

### Clean 2012-2016 Income

In [12]:
income_1_df = income_1_df_unclean.copy()[[
    'SA2',
    'SA2 NAME',
    '2011-12.3',
    '2012-13.3',
    '2013-14.3',
    '2014-15.3',
    '2015-16.3'
]]

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_1_df['SA2'] = income_1_df['SA2'].astype(str)
income_1_df = income_1_df.loc[
    (income_1_df['SA2'].str.len() == 9) &
    (income_1_df['SA2'].str.startswith('2'))
]
# drop rows with value 'np'
income_1_df = income_1_df.replace('np', np.nan)
income_1_df = income_1_df.dropna()
# Rename columns
income_1_df.columns = [
    'sa2_code',
    'sa2_name',
    '2012',
    '2013',
    '2014',
    '2015',
    '2016'
]
income_1_df

  income_1_df = income_1_df.replace('np', np.nan)


Unnamed: 0,sa2_code,sa2_name,2012,2013,2014,2015,2016
579,201011001,Alfredton,43838.0,44866.0,46535.0,48532.0,49385.0
580,201011002,Ballarat,42890.0,45719.0,47061.0,47894.0,49564.0
581,201011003,Ballarat - North,40757.0,42518.0,43360.0,44494.0,45816.0
582,201011004,Ballarat - South,38360.0,39436.0,40106.0,40846.0,41544.0
583,201011005,Buninyong,43428.0,44474.0,46283.0,46972.0,47511.0
...,...,...,...,...,...,...,...
1036,217031476,Otway,31038.0,28549.0,30555.0,31322.0,33020.0
1037,217041477,Moyne - East,36423.0,32888.0,38507.0,40775.0,40053.0
1038,217041478,Moyne - West,37042.0,36057.0,38261.0,39961.0,41751.0
1039,217041479,Warrnambool - North,39265.0,40642.0,41775.0,42635.0,43536.0


### Clean 2017-2021 Income

In [13]:
income_2_df = income_2_df_unclean.copy()[[
    'SA2',
    'SA2 NAME',
    '2016-17.3',
    '2017-18.3',
    '2018-19.3',
    '2019-20.3',
    '2020-21.3'
]]

# Filter for Victoria SA2s (9-digit codes starting with '2')
income_2_df['SA2'] = income_2_df['SA2'].astype(str)
income_2_df = income_2_df.loc[
    (income_2_df['SA2'].str.len() == 9) &
    (income_2_df['SA2'].str.startswith('2'))
]

# drop rows with value 'np'
income_2_df = income_2_df.replace('np', np.nan)
income_2_df = income_2_df.dropna()

# Rename columns
income_2_df.columns = [
    'sa2_code',
    'sa2_name',
    '2017',
    '2018',
    '2019',
    '2020',
    '2021'
]

income_2_df.sort_values(by='sa2_name', inplace=True)
income_2_df

Unnamed: 0,sa2_code,sa2_name,2017,2018,2019,2020,2021
801,206071139,Abbotsford,58219,61476,64090,67457,71394
905,210011226,Airport West,53423,55912,58506,60083,62051
788,206051128,Albert Park,65352,66627,67518,68933,73107
703,204011054,Alexandra,36773,37890,39452,40199,42833
645,201011001,Alfredton,50596,52448,53932,55204,58036
...,...,...,...,...,...,...,...
755,205051104,Yarram,35097,38003,37232,38845,41677
1064,213031352,Yarraville,63932,66233,69410,71512,76253
1146,216021414,Yarrawonga,38354,40572,41980,43654,46794
1125,215011394,Yarriambiack,40505,46030,44238,50474,48604


### Add SA2 geometries

Use 2016 SA2 Digital Boundaries shapefile for `income_1_df`.

In [14]:
sa2_2016 = gpd.read_file('../data/landing/sa2/sa2-16-shp/')
sa2_2016
sa2_2016 = sa2_2016[['SA2_MAIN16', 'geometry']]
sa2_2016.columns = ['sa2_code', 'geometry']
income_1_df = pd.merge(income_1_df, sa2_2016, on='sa2_code')
income_1_df

Unnamed: 0,sa2_code,sa2_name,2012,2013,2014,2015,2016,geometry
0,201011001,Alfredton,43838.0,44866.0,46535.0,48532.0,49385.0,"POLYGON ((143.70477 -37.51935, 143.70482 -37.5..."
1,201011002,Ballarat,42890.0,45719.0,47061.0,47894.0,49564.0,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
2,201011003,Ballarat - North,40757.0,42518.0,43360.0,44494.0,45816.0,"POLYGON ((143.85013 -37.54247, 143.85011 -37.5..."
3,201011004,Ballarat - South,38360.0,39436.0,40106.0,40846.0,41544.0,"POLYGON ((143.82821 -37.57559, 143.8284 -37.57..."
4,201011005,Buninyong,43428.0,44474.0,46283.0,46972.0,47511.0,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
...,...,...,...,...,...,...,...,...
451,217031476,Otway,31038.0,28549.0,30555.0,31322.0,33020.0,"MULTIPOLYGON (((143.40263 -38.78153, 143.40252..."
452,217041477,Moyne - East,36423.0,32888.0,38507.0,40775.0,40053.0,"POLYGON ((142.41438 -38.09304, 142.41399 -38.0..."
453,217041478,Moyne - West,37042.0,36057.0,38261.0,39961.0,41751.0,"MULTIPOLYGON (((142.00869 -38.41716, 142.00876..."
454,217041479,Warrnambool - North,39265.0,40642.0,41775.0,42635.0,43536.0,"POLYGON ((142.43668 -38.35545, 142.43658 -38.3..."


Use 2021 SA2 Digital Boundaries shapefile for `income_2_df`.

In [15]:
sa2_2021 = gpd.read_file('../data/landing/sa2/sa2-21-shp/')
sa2_2021
sa2_2021 = sa2_2021[['SA2_CODE21', 'geometry']]
sa2_2021.columns = ['sa2_code', 'geometry']
income_2_df = pd.merge(income_2_df, sa2_2021, on='sa2_code')
income_2_df

Unnamed: 0,sa2_code,sa2_name,2017,2018,2019,2020,2021,geometry
0,206071139,Abbotsford,58219,61476,64090,67457,71394,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8..."
1,210011226,Airport West,53423,55912,58506,60083,62051,"POLYGON ((144.86706 -37.72471, 144.86798 -37.7..."
2,206051128,Albert Park,65352,66627,67518,68933,73107,"POLYGON ((144.96767 -37.83737, 144.96789 -37.8..."
3,204011054,Alexandra,36773,37890,39452,40199,42833,"POLYGON ((145.59015 -37.22477, 145.58638 -37.2..."
4,201011001,Alfredton,50596,52448,53932,55204,58036,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
...,...,...,...,...,...,...,...,...
514,205051104,Yarram,35097,38003,37232,38845,41677,"MULTIPOLYGON (((146.6219 -38.75021, 146.62185 ..."
515,213031352,Yarraville,63932,66233,69410,71512,76253,"POLYGON ((144.85915 -37.81764, 144.85984 -37.8..."
516,216021414,Yarrawonga,38354,40572,41980,43654,46794,"POLYGON ((146.00051 -36.00877, 146.00128 -36.0..."
517,215011394,Yarriambiack,40505,46030,44238,50474,48604,"POLYGON ((142.239 -35.99787, 142.23898 -35.997..."


### Push Income datasets to `raw` layer

In [16]:
create_dir('../data/raw/income')
income_1_df.to_csv('../data/raw/income/income-12-16.csv', index=False)
income_2_df.to_csv('../data/raw/income/income-17-21.csv', index=False)

Directory already exists: ../data/raw/income



## Population

### Table 1(SA2)

In [17]:
pop_tab1 = pd.read_excel("../data/landing/population/population.xlsx", "Table 1", header = [5,6])
pop_tab1.columns = [ (col[1]  if str(col[0]).startswith('Unnamed') else col[0]) for col in list(pop_tab1.columns.values)]
pop_tab1 = pop_tab1.drop(["SA3 code", "SA3 name", "SA4 code", "SA4 name"], axis=1)
# pop_tab1[0:2]
pop_tab1

Unnamed: 0,GCCSA code,GCCSA name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1RNSW,Rest of NSW,101021007.0,Braidwood,2760.0,2811.0,2835.0,2844.0,2847.0,2965.0,...,3762.0,3849.0,3950.0,4041.0,4145.0,4218.0,4282.0,4332.0,4366.0,4396.0
1,1RNSW,Rest of NSW,101021008.0,Karabar,9129.0,9199.0,9263.0,9277.0,9209.0,9212.0,...,8731.0,8603.0,8531.0,8530.0,8516.0,8500.0,8535.0,8548.0,8528.0,8483.0
2,1RNSW,Rest of NSW,101021009.0,Queanbeyan,9717.0,9513.0,9522.0,9400.0,9595.0,9682.0,...,11199.0,11213.0,11230.0,11362.0,11460.0,11468.0,11460.0,11375.0,11391.0,11420.0
3,1RNSW,Rest of NSW,101021010.0,Queanbeyan - East,3925.0,4073.0,4219.0,4218.0,4187.0,4319.0,...,4967.0,4961.0,4970.0,5016.0,5079.0,5126.0,5089.0,5097.0,5091.0,5099.0
4,1RNSW,Rest of NSW,101021012.0,Queanbeyan West - Jerrabomberra,9425.0,10257.0,11085.0,11549.0,12046.0,12358.0,...,13193.0,13164.0,13150.0,13090.0,13022.0,12955.0,12821.0,12748.0,12781.0,12873.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,9OTER,Other Territories,901021002.0,Cocos (Keeling) Islands,600.0,568.0,558.0,573.0,588.0,590.0,...,556.0,555.0,546.0,552.0,553.0,591.0,608.0,603.0,616.0,631.0
2452,9OTER,Other Territories,901031003.0,Jervis Bay,542.0,464.0,441.0,428.0,413.0,386.0,...,361.0,367.0,402.0,398.0,386.0,367.0,335.0,309.0,307.0,307.0
2453,9OTER,Other Territories,901041004.0,Norfolk Island,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1757.0,1845.0,1938.0,2015.0,2102.0,2221.0,2211.0,2209.0
2454,,,,Total Australia,19274701.0,19495210.0,19720737.0,19932722.0,20176844.0,20450966.0,...,23475686.0,23815995.0,24190907.0,24592588.0,24963258.0,25334826.0,25649248.0,25685412.0,26014399.0,26648878.0


In [18]:
pop_tab1

Unnamed: 0,GCCSA code,GCCSA name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1RNSW,Rest of NSW,101021007.0,Braidwood,2760.0,2811.0,2835.0,2844.0,2847.0,2965.0,...,3762.0,3849.0,3950.0,4041.0,4145.0,4218.0,4282.0,4332.0,4366.0,4396.0
1,1RNSW,Rest of NSW,101021008.0,Karabar,9129.0,9199.0,9263.0,9277.0,9209.0,9212.0,...,8731.0,8603.0,8531.0,8530.0,8516.0,8500.0,8535.0,8548.0,8528.0,8483.0
2,1RNSW,Rest of NSW,101021009.0,Queanbeyan,9717.0,9513.0,9522.0,9400.0,9595.0,9682.0,...,11199.0,11213.0,11230.0,11362.0,11460.0,11468.0,11460.0,11375.0,11391.0,11420.0
3,1RNSW,Rest of NSW,101021010.0,Queanbeyan - East,3925.0,4073.0,4219.0,4218.0,4187.0,4319.0,...,4967.0,4961.0,4970.0,5016.0,5079.0,5126.0,5089.0,5097.0,5091.0,5099.0
4,1RNSW,Rest of NSW,101021012.0,Queanbeyan West - Jerrabomberra,9425.0,10257.0,11085.0,11549.0,12046.0,12358.0,...,13193.0,13164.0,13150.0,13090.0,13022.0,12955.0,12821.0,12748.0,12781.0,12873.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,9OTER,Other Territories,901021002.0,Cocos (Keeling) Islands,600.0,568.0,558.0,573.0,588.0,590.0,...,556.0,555.0,546.0,552.0,553.0,591.0,608.0,603.0,616.0,631.0
2452,9OTER,Other Territories,901031003.0,Jervis Bay,542.0,464.0,441.0,428.0,413.0,386.0,...,361.0,367.0,402.0,398.0,386.0,367.0,335.0,309.0,307.0,307.0
2453,9OTER,Other Territories,901041004.0,Norfolk Island,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1757.0,1845.0,1938.0,2015.0,2102.0,2221.0,2211.0,2209.0
2454,,,,Total Australia,19274701.0,19495210.0,19720737.0,19932722.0,20176844.0,20450966.0,...,23475686.0,23815995.0,24190907.0,24592588.0,24963258.0,25334826.0,25649248.0,25685412.0,26014399.0,26648878.0


## School Zone

## PTV

In [19]:
shape_1 = pd.read_csv("../data/landing/ptv/1/1/shapes.txt")
shape_1

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1-ABY-mjp-10.1.H,-36.084262,146.924527,1,0.00
1,1-ABY-mjp-10.1.H,-36.085028,146.924381,2,86.20
2,1-ABY-mjp-10.1.H,-36.088625,146.923734,3,490.37
3,1-ABY-mjp-10.1.H,-36.090905,146.923265,4,747.44
4,1-ABY-mjp-10.1.H,-36.091681,146.923064,5,835.51
...,...,...,...,...,...
1107801,1-WBL-mjp-9.6.R,-37.811439,144.945707,1352,274128.87
1107802,1-WBL-mjp-9.6.R,-37.813874,144.948934,1353,274520.89
1107803,1-WBL-mjp-9.6.R,-37.814151,144.949289,1354,274564.74
1107804,1-WBL-mjp-9.6.R,-37.816379,144.950966,1355,274852.95


In [20]:
stops_1 = pd.read_csv("../data/landing/ptv/1/1/stops.txt")
stops_1 = stops_1.loc[:,["stop_name","stop_lat","stop_lon"]]
stops_1

Unnamed: 0,stop_name,stop_lat,stop_lon
0,Wallan Railway Station (Wallan),-37.416861,145.005372
1,Melton Railway Station (Melton South),-37.703359,144.572216
2,Rockbank Railway Station (Rockbank),-37.729261,144.650631
3,Deer Park Railway Station (Deer Park),-37.777764,144.772304
4,Sunbury Railway Station (Sunbury),-37.579206,144.728165
...,...,...,...
105,Raywood Railway Station (Raywood),-36.531959,144.201161
106,Huntly Railway Station (Huntly),-36.665848,144.369820
107,East Pakenham Railway Station (Pakenham),-38.084285,145.506314
108,Goornong Railway Station (Goornong),-36.615183,144.503474


In [21]:
gdf_stops_1 = gpd.GeoDataFrame(
    stops_1,
    geometry=gpd.points_from_xy(stops_1['stop_lon'], stops_1['stop_lat'])
)
gdf_stops_1

Unnamed: 0,stop_name,stop_lat,stop_lon,geometry
0,Wallan Railway Station (Wallan),-37.416861,145.005372,POINT (145.00537 -37.41686)
1,Melton Railway Station (Melton South),-37.703359,144.572216,POINT (144.57222 -37.70336)
2,Rockbank Railway Station (Rockbank),-37.729261,144.650631,POINT (144.65063 -37.72926)
3,Deer Park Railway Station (Deer Park),-37.777764,144.772304,POINT (144.7723 -37.77776)
4,Sunbury Railway Station (Sunbury),-37.579206,144.728165,POINT (144.72816 -37.57921)
...,...,...,...,...
105,Raywood Railway Station (Raywood),-36.531959,144.201161,POINT (144.20116 -36.53196)
106,Huntly Railway Station (Huntly),-36.665848,144.369820,POINT (144.36982 -36.66585)
107,East Pakenham Railway Station (Pakenham),-38.084285,145.506314,POINT (145.50631 -38.08428)
108,Goornong Railway Station (Goornong),-36.615183,144.503474,POINT (144.50347 -36.61518)


In [22]:
print(get_runtime(start_time))

Runtime: 0 minutes and 5 seconds
