Liveability score is defined to be the weighted average of below features

1. Housing affordability
2. Access to public transport
3. Access to city center


# 0.Setup

In [3]:
import sys
sys.path.append('../')
from scripts.proximity import proximity_sjoin

In [4]:
import pandas as pd
import numpy as np
import geopandas as gpd
# rental data used as the main data for analysis
rental_data = pd.read_csv('../data/curated/rental-17-24.csv')
suburb_shape = gpd.read_file('../data/curated/shapefiles/postcode/POSTCODE_POLYGON.shp')[['POSTCODE', 'geometry']]
ptv_time = pd.read_csv('../data/raw/time_ptv.csv')
crime_data = pd.read_excel('../data/landing/crime2024.xlsx', sheet_name='Table 03', header=0)

In [5]:
rental_gdf = gpd.GeoDataFrame(rental_data, geometry=gpd.points_from_xy(rental_data.lng, rental_data.lat), crs='EPSG:7844')
rental_gdf = gpd.sjoin(rental_gdf, suburb_shape, how='left', predicate='within')\
    .drop(columns=['index_right'])
assert rental_gdf.shape[0] == rental_data.shape[0]  # check if all rows are joined
assert rental_gdf[rental_gdf['POSTCODE'].isnull()].shape[0] == 0 # check if there are any rows that are not joined
rental_gdf.rename(columns={'POSTCODE': 'postcode'}, inplace=True)

## postcodes with at least 100 instances


In [36]:
threshold = 100
count_ins = rental_gdf.groupby('postcode').size().reset_index(name='count')
valid_postcode_list = count_ins[count_ins['count'] > threshold]['postcode'].tolist()

In [47]:
def postcode_count_filter(df, valid_postcode_list):
    return df[df['postcode'].isin(valid_postcode_list)]

# 1.Housing affordability

Housing affordability can be expressed as the ratio of housing costs to gross household income (ABS 2022a).

In [37]:
affordability = pd.DataFrame(rental_gdf) # making a copy
# get rent as a percentage of income (just to make it meaningful)
affordability['price/income'] = affordability['rented_price']/7*365 / affordability['median_income']
affordability = affordability.groupby('postcode').agg({'price/income': 'mean'}).reset_index()

In [38]:
affordability.sort_values('price/income', ascending=False, inplace=True)

In [60]:
affordability_rank = postcode_count_filter(affordability,valid_postcode_list).reset_index(drop=True)
affordability_rank['rank'] = affordability_rank.index + 1
affordability_rank = affordability_rank[['postcode', 'rank']]
affordability_rank

Unnamed: 0,postcode,rank
0,3000,1
1,3939,2
2,3944,3
3,3943,4
4,3929,5
...,...,...
334,3480,335
335,3414,336
336,3304,337
337,3390,338


Unnamed: 0,postcode,rank
0,3000,1
1,3939,2
2,3944,3
3,3943,4
4,3929,5
...,...,...
334,3480,335
335,3414,336
336,3304,337
337,3390,338


In [59]:
affordability

Unnamed: 0,postcode,price/income
297,3392,1.667066
626,3990,1.154472
451,3704,0.980728
610,3958,0.814203
0,3000,0.774013
...,...,...
359,3520,0.232992
238,3289,0.229505
381,3566,0.218298
308,3424,0.216318


# 2.Access to Public Transport

defined as the average time to public transport for properties in that suburb

In [44]:
public_transport = proximity_sjoin(pd.DataFrame(rental_gdf).drop(columns=['geometry']), ptv_time,'ptv_time')
assert public_transport[public_transport['ptv_time'].isna()].shape[0] == 0
public_transport = public_transport[public_transport['ptv_time'] != -1] # filter out error entries




In [45]:
# get average time to ptv stops
public_transport_res = public_transport.groupby('postcode').agg({'ptv_time': 'mean'}).reset_index()
# get count
count_by_suburb = public_transport.groupby('postcode').size().reset_index(name='count')
public_transport_res = pd.merge(public_transport_res, count_by_suburb, on='postcode')
# sort
public_transport_res.sort_values('ptv_time', ascending=True, inplace=True)
# join suburb names
public_transport_res

Unnamed: 0,postcode,ptv_time,count
352,3512,57.700000,4
351,3509,57.700000,1
357,3520,63.900000,2
154,3179,65.500000,3
48,3057,80.559420,69
...,...,...,...
249,3314,21149.490909,11
449,3704,24488.400000,4
251,3317,27677.000000,1
239,3292,29633.237500,8


In [53]:
ptv_rank = postcode_count_filter(public_transport_res, valid_postcode_list).reset_index(drop=True)
# get postcode and index
ptv_rank['rank'] = ptv_rank.index +1
ptv_rank = ptv_rank[['postcode', 'rank']]
ptv_rank

Unnamed: 0,postcode,rank
0,3000,1
1,3003,2
2,3055,3
3,3008,4
4,3053,5
...,...,...
334,3664,335
335,3939,336
336,3723,337
337,3401,338


# 3.Crime

https://www.crimestatistics.vic.gov.au/crime-statistics/latest-crime-data-by-area

In [72]:
crime_rank = crime_data[['Postcode', 'Incidents Recorded']].groupby('Postcode').size().reset_index(name='count').sort_values('count').reset_index(drop=True).rename(columns={'Postcode': 'postcode'})
crime_rank['postcode'] = crime_rank['postcode'].astype(str)
crime_rank = postcode_count_filter(crime_rank, valid_postcode_list).reset_index(drop=True)

In [None]:
crime_rank['rank'] = crime_rank.index + 1
crime_rank = crime_rank[['postcode', 'rank']]
crime_rank

# 4.Proximity to CBDs

In [16]:
city_rank = rental_gdf[rental_gdf['time_city'] != -1] # filter out error entries
city_rank = city_rank.groupby('postcode').agg({'time_city': 'mean'}).reset_index()


In [77]:
city_rank.sort_values('time_city', ascending=True, inplace=True)
city_rank = postcode_count_filter(city_rank, valid_postcode_list).reset_index(drop=True)
city_rank['rank'] = city_rank.index + 1
city_rank = city_rank[['postcode', 'rank']]
city_rank

Unnamed: 0,postcode,rank
0,3220,1
1,3000,2
2,3053,3
3,3350,4
4,3006,5
...,...,...
334,3498,335
335,3888,336
336,3500,337
337,3501,338


# 5.Proximity Parks

In [27]:
park = pd.read_csv('../data/raw/time_park.csv')
# park = gpd.GeoDataFrame(park, geometry=gpd.points_from_xy(park.lng, park.lat), crs='EPSG:7844')
park_rental = proximity_sjoin(pd.DataFrame(rental_gdf).drop(columns=['geometry']), park, 'time_park')
park_rental = park_rental[park_rental['time_park'] != -1] # filter out error entries
park_rank = park_rental.groupby('postcode').agg({'time_park': 'mean'}).reset_index()




In [78]:
park_rank = park_rank.sort_values('time_park', ascending=True).reset_index(drop=True)
park_rank = postcode_count_filter(park_rank, valid_postcode_list).reset_index(drop=True)

In [82]:
park_rank['rank'] = park_rank.index + 1
park_rank = park_rank[['postcode', 'rank']]
park_rank

Unnamed: 0,postcode,rank
0,3975,1
1,3980,2
2,3976,3
3,3177,4
4,3806,5
...,...,...
334,3318,335
335,3498,336
336,3500,337
337,3501,338


# inf.Combine

In [108]:

liveability_weight = 1
affordability_weight = 1

## liveability

In [95]:
liveability_list = [ptv_rank, crime_rank, city_rank, park_rank]
suffixes = ['_ptv', '_crime', '_city', '_park']
weight = [2,2,1,1]
for i, df in enumerate(liveability_list):
    liveability_list[i] = df.rename(columns={'rank': 'rank' + suffixes[i]})

In [96]:
joined_rank = liveability_list[0].rename(columns={'rank': 'rank' + suffixes[0]})
for i in range(1, len(liveability_list)):
    joined_rank = pd.merge(joined_rank, liveability_list[i], on='postcode')

In [105]:
joined_rank['liveability_rank'] = joined_rank.apply(lambda x: sum([x['rank' + suffixes[i]] * weight[i] / sum(weight) for i in range(len(suffixes))]), axis=1)

In [106]:
joined_rank.sort_values('liveability_rank', ascending=True, inplace=True)
joined_rank

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,overall_rank,rank_affordability,liveability_rank
0,3143,6,71,42,60,42.666667,279,42.666667
1,3102,42,40,52,103,53.166667,82,53.166667
2,3142,37,87,29,61,56.333333,295,56.333333
3,3122,8,115,39,63,58.000000,320,58.000000
4,3161,26,88,60,68,59.333333,187,59.333333
...,...,...,...,...,...,...,...,...
334,3909,290,283,334,305,297.500000,71,297.500000
335,3401,338,251,305,321,300.666667,202,300.666667
336,3851,332,303,316,273,309.833333,106,309.833333
337,3875,303,333,326,290,314.666667,233,314.666667


## affordability

In [103]:
joined_rank = pd.merge(joined_rank, affordability_rank.rename(columns={'rank': 'rank_affordability'}), on='postcode')
joined_rank

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,overall_rank,rank_affordability
0,3143,6,71,42,60,42.666667,279
1,3102,42,40,52,103,53.166667,82
2,3142,37,87,29,61,56.333333,295
3,3122,8,115,39,63,58.000000,320
4,3161,26,88,60,68,59.333333,187
...,...,...,...,...,...,...,...
334,3909,290,283,334,305,297.500000,71
335,3401,338,251,305,321,300.666667,202
336,3851,332,303,316,273,309.833333,106
337,3875,303,333,326,290,314.666667,233


In [110]:
joined_rank['final_rank'] = joined_rank.apply(lambda x: (x['liveability_rank']*liveability_weight + x['rank_affordability']*affordability_weight) / (liveability_weight+affordability_weight), axis=1)
joined_rank.sort_values('final_rank', ascending=True, inplace=True)

In [115]:
joined_rank.reset_index(drop=True).head(10)

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,rank_affordability,liveability_rank,final_rank
0,3053,5,133,3,116,13,65.833333,39.416667
1,3151,101,72,121,42,8,84.833333,46.416667
2,3104,91,82,65,81,15,82.0,48.5
3,3000,1,235,2,107,1,96.833333,48.916667
4,3109,87,110,105,70,6,94.833333,50.416667
5,3108,75,136,85,67,9,95.666667,52.333333
6,3003,2,132,7,112,43,64.5,53.75
7,3107,94,90,82,94,17,90.666667,53.833333
8,3125,49,174,96,41,14,97.166667,55.583333
9,3051,14,147,12,119,37,75.5,56.25


### save

In [116]:
joined_rank.to_csv('../data/curated/liveability_final.csv', index=False)