In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [47]:
def find_location_id(lat, lon, gdf):
    """
    Find the SA2_CODE21 of the SA2 zone that contains a given coordinate.

    Parameters:
    lat (float): Latitude of the location. May be None/NaN when unknown.
    lon (float): Longitude of the location. May be None/NaN when unknown.
    gdf (GeoDataFrame): SA2 zones; must provide 'geometry' and 'SA2_CODE21' columns.

    Returns:
    The 'SA2_CODE21' value of the first zone (in row order) whose geometry
    contains the point, or None when either coordinate is missing or no zone
    contains the point.
    """
    # A missing coordinate can never fall inside a zone, so skip the scan
    # (and avoid building a Point from None/NaN) altogether.
    if pd.isna(lat) or pd.isna(lon):
        return None

    # Shapely points are (x, y), i.e. (longitude, latitude).
    point = Point(lon, lat)

    # Walk the two needed columns directly; iterrows() would construct a full
    # Series for every zone on every call, which dominates the runtime.
    for geometry, sa2_code in zip(gdf['geometry'], gdf['SA2_CODE21']):
        if geometry is not None and geometry.contains(point):
            return sa2_code
    return None

In [21]:
# Load the SA2 boundary shapefile into a GeoDataFrame. Later cells pass this
# frame to find_location_id as the zone lookup and merge on its 'SA2_CODE21' column.
sf2 = gpd.read_file("/root/MAST30034_Python/data/vic_zones/SA2_2021_AUST_GDA2020.shp")
# Preview the first rows to check the attribute columns and geometry.
sf2.head()

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21899 -35.36738, 149.218 -35.366..."
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.762,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.19572 -35.36126, 149.1997 -35.35..."


In [62]:
# Read the raw rental listings from the landing JSON file into a DataFrame.
rent_data = pd.read_json("/root/project-2-group-real-estate-industry-project-34/data/landing/rental_listings.json")

In [63]:
def _nested_field(record, key):
    """Return record[key] when record is a dict; otherwise (e.g. NaN for a missing object) None."""
    return record.get(key) if isinstance(record, dict) else None


# Flatten the nested fields used downstream. All three extractions go through
# the same guard so a listing whose nested object is missing yields a null
# value instead of raising AttributeError.
rent_data['latitude'] = rent_data['geoLocation'].apply(lambda rec: _nested_field(rec, 'latitude'))
rent_data['longitude'] = rent_data['geoLocation'].apply(lambda rec: _nested_field(rec, 'longitude'))
rent_data['price'] = rent_data['priceDetails'].apply(lambda rec: _nested_field(rec, 'price'))

# Preview the extracted 'price' column
print(rent_data["price"].head())


0    440.0
1    450.0
2    425.0
3    380.0
4    300.0
Name: price, dtype: float64


In [8]:
# Tally how many listings have no value in the extracted 'price' column
null_values_count = rent_data["price"].isna().sum()
print(f"Number of null values in 'price' column: {null_values_count}")


Number of null values in 'price' column: 16565


In [9]:
# Keep only listings that have a price. The explicit copy makes the result an
# independent frame, so adding columns to it in later cells does not emit
# SettingWithCopyWarning on pandas versions that flag derived frames.
rent_data = rent_data.dropna(subset=["price"]).copy()

In [11]:
# Assign each listing the SA2 zone that contains it. This is done with a single
# spatial join rather than calling find_location_id per row, which scans every
# zone polygon once for every listing.
# (The 'predicate' keyword requires geopandas >= 0.10; older releases call it 'op'.)

# Only listings with both coordinates can be located.
_has_coords = rent_data[['latitude', 'longitude']].notna().all(axis=1)

# Points are (x, y) = (longitude, latitude). They are tagged with the zones'
# CRS, i.e. the raw coordinates are compared against the polygons as before.
_listing_points = gpd.GeoDataFrame(
    index=rent_data.index[_has_coords],
    geometry=gpd.points_from_xy(
        rent_data.loc[_has_coords, 'longitude'],
        rent_data.loc[_has_coords, 'latitude'],
    ),
    crs=sf2.crs,
)

# point.within(zone) is the same test as zone.contains(point).
_matches = gpd.sjoin(
    _listing_points,
    sf2.loc[sf2.geometry.notna(), ['SA2_CODE21', 'geometry']],
    how='left',
    predicate='within',
)

# The join emits one row per match; keep a single zone per listing.
_matches = _matches[~_matches.index.duplicated(keep='first')]

# Index alignment leaves listings without coordinates, or outside every zone,
# as missing values in the new column.
rent_data['SA2_CODE21'] = _matches['SA2_CODE21']

In [12]:
# Inspect the column names currently present on rent_data.
rent_data.columns

Index(['objective', 'propertyTypes', 'status', 'saleMode', 'channel',
       'addressParts', 'advertiserIdentifiers', 'apmIdentifiers', 'bathrooms',
       'bedrooms', 'carspaces', 'dateAvailable', 'dateUpdated', 'dateListed',
       'description', 'features', 'geoLocation', 'headline', 'id',
       'inspectionDetails', 'isNewDevelopment', 'media', 'priceDetails',
       'propertyId', 'rentalDetails', 'seoUrl', 'buildingAreaSqm',
       'virtualTourUrl', 'landAreaSqm', 'energyEfficiencyRating', 'latitude',
       'longitude', 'price', 'SA2_CODE21'],
      dtype='object')

In [13]:
# Columns that are removed from the listings before saving
to_drop = [
    'objective',
    'status',
    'saleMode',
    'advertiserIdentifiers',
    'description',
    'headline',
    'id',
    'inspectionDetails',
    'media',
    'propertyId',
    'seoUrl',
    'virtualTourUrl',
]

# Build the trimmed frame; rent_data itself is left unchanged
rent_data_cleaned = rent_data.drop(labels=to_drop, axis='columns')


In [15]:
# Non-null count per column. Note this inspects rent_data (the frame before
# the column drop), not rent_data_cleaned.
rent_data.count()

objective                 36475
propertyTypes             36475
status                    36475
saleMode                  36475
channel                   36475
addressParts              36475
advertiserIdentifiers     36475
apmIdentifiers            36475
bathrooms                 36475
bedrooms                  36475
carspaces                 36475
dateAvailable             16726
dateUpdated               36475
dateListed                36475
description               36475
features                  19846
geoLocation               31912
headline                  36475
id                        36475
inspectionDetails         10380
isNewDevelopment          36475
media                     34971
priceDetails              36475
propertyId                19102
rentalDetails             36475
seoUrl                    36475
buildingAreaSqm               1
virtualTourUrl            18678
landAreaSqm                   2
energyEfficiencyRating        0
latitude                  31912
longitud

In [14]:
# Save the cleaned DataFrame to a Parquet file.
# The path is relative, so the file is written to the current working
# directory; index=False leaves the DataFrame index out of the file.
rent_data_cleaned.to_parquet("cleaned_rent_data.parquet", index=False)


In [39]:
# Load the second listings dataset from Parquet. Its nested fields are stored
# as flattened, dot-separated columns (e.g. 'geoLocation.latitude', used below).
rent_data2 = pd.read_parquet("/root/project-2-group-real-estate-industry-project-34/data/landing/filtered_rental_listings.parquet")
# Preview the first rows.
rent_data2.head()

Unnamed: 0,objective,propertyTypes,status,saleMode,channel,bathrooms,bedrooms,carspaces,dateUpdated,dateListed,...,statementOfInformation.comparableData.declarationText,statementOfInformation.documentationUrl,saleDetails.annualReturn,statementOfInformation.suburbMedianPrice.postcode,saleDetails.tenantDetails.leaseStartDate,saleDetails.tenantDetails.leaseEndDate,advertiserIdentifiers.conjunctionContactIds,advertiserIdentifiers.conjunctionAgentIds,saleDetails.tenderDetails.tenderEndDate,year
0,rent,[apartmentUnitFlat],archived,archived,residential,2.0,2.0,2.0,2009-09-09T05:33:21.927Z,2009-05-08 01:26:01+00:00,...,,,,,,,,,,2009
1,rent,[apartmentUnitFlat],archived,archived,residential,2.0,3.0,1.0,2009-06-18T09:28:23.563Z,2009-05-08 01:26:03+00:00,...,,,,,,,,,,2009
2,rent,[house],archived,archived,residential,2.0,3.0,4.0,2010-09-29T01:25:37.107Z,2009-05-08 01:26:05+00:00,...,,,,,,,,,,2009
3,rent,[apartmentUnitFlat],archived,archived,residential,3.0,3.0,2.0,2010-06-19T05:02:41Z,2009-05-11 05:37:31+00:00,...,,,,,,,,,,2009
4,rent,[apartmentUnitFlat],archived,archived,residential,3.0,3.0,2.0,2010-11-22T05:31:15.987Z,2009-05-11 09:28:10+00:00,...,,,,,,,,,,2009


In [41]:
# Inspect the column names available on rent_data2.
rent_data2.columns

Index(['objective', 'propertyTypes', 'status', 'saleMode', 'channel',
       'bathrooms', 'bedrooms', 'carspaces', 'dateUpdated', 'dateListed',
       'description', 'features', 'headline', 'id', 'isNewDevelopment',
       'media', 'propertyId', 'seoUrl', 'virtualTourUrl',
       'addressParts.stateAbbreviation', 'addressParts.displayType',
       'addressParts.streetNumber', 'addressParts.unitNumber',
       'addressParts.street', 'addressParts.suburb', 'addressParts.suburbId',
       'addressParts.postcode', 'addressParts.displayAddress',
       'advertiserIdentifiers.advertiserType',
       'advertiserIdentifiers.advertiserId',
       'advertiserIdentifiers.contactIds', 'advertiserIdentifiers.agentIds',
       'apmIdentifiers.suburbId', 'geoLocation.latitude',
       'geoLocation.longitude', 'priceDetails.price',
       'priceDetails.canDisplayPrice', 'priceDetails.displayPrice',
       'priceDetails.bond', 'rentalDetails.rentalMethod',
       'rentalDetails.source', 'rentalDetails.

In [48]:
# Assign each rent_data2 listing the SA2 zone that contains it. This is done
# with a single spatial join rather than calling find_location_id per row,
# which scans every zone polygon once for every listing.
# (The 'predicate' keyword requires geopandas >= 0.10; older releases call it 'op'.)

# Only listings with both coordinates can be located.
_has_coords2 = rent_data2[['geoLocation.latitude', 'geoLocation.longitude']].notna().all(axis=1)

# Points are (x, y) = (longitude, latitude). They are tagged with the zones'
# CRS, i.e. the raw coordinates are compared against the polygons as before.
_listing_points2 = gpd.GeoDataFrame(
    index=rent_data2.index[_has_coords2],
    geometry=gpd.points_from_xy(
        rent_data2.loc[_has_coords2, 'geoLocation.longitude'],
        rent_data2.loc[_has_coords2, 'geoLocation.latitude'],
    ),
    crs=sf2.crs,
)

# point.within(zone) is the same test as zone.contains(point).
_matches2 = gpd.sjoin(
    _listing_points2,
    sf2.loc[sf2.geometry.notna(), ['SA2_CODE21', 'geometry']],
    how='left',
    predicate='within',
)

# The join emits one row per match; keep a single zone per listing.
_matches2 = _matches2[~_matches2.index.duplicated(keep='first')]

# Index alignment leaves listings without coordinates, or outside every zone,
# as missing values in the new column.
rent_data2['SA2_CODE21'] = _matches2['SA2_CODE21']

In [62]:
# Columns removed from rent_data2, grouped by the kind of field they hold
to_drop = [
    # listing metadata and free text
    'objective',
    'status',
    'saleMode',
    'description',
    'headline',
    'id',
    'media',
    'propertyId',
    'seoUrl',
    'virtualTourUrl',
    # advertiser identifiers
    'advertiserIdentifiers.advertiserId',
    'advertiserIdentifiers.advertiserType',
    'advertiserIdentifiers.agentIds',
    'advertiserIdentifiers.conjunctionAgentIds',
    'advertiserIdentifiers.conjunctionContactIds',
    'advertiserIdentifiers.contactIds',
    # inspection details
    'inspectionDetails.inspections',
    'inspectionDetails.pastInspections',
    'inspectionDetails.isByAppointmentOnly',
    # sale / tenant / tender / auction details
    'saleDetails.tenantDetails.leaseDateVariable',
    'saleDetails.tenantDetails.leaseEndDate',
    'saleDetails.tenantDetails.leaseOptions',
    'saleDetails.tenantDetails.leaseStartDate',
    'saleDetails.tenantDetails.tenantName',
    'saleDetails.tenantDetails.tenantRentDetails',
    'saleDetails.tenderDetails.tenderAddress',
    'saleDetails.tenderDetails.tenderEndDate',
    'saleDetails.tenderDetails.tenderRecipientName',
    'saleDetails.auctionDetails.auctionSchedule.locationDescription',
    'saleDetails.auctionDetails.auctionSchedule.openingDateTime',
    'saleDetails.auctionDetails.auctionSchedule.terms',
    'saleDetails.auctionDetails.auctionedDate',
]

# Build the trimmed frame (rent_data2 is left unchanged) and report how many columns remain
rent_data2_cleaned = rent_data2.drop(labels=to_drop, axis='columns')
len(rent_data2_cleaned.columns)

67

In [64]:
# Save the cleaned DataFrame to a Parquet file.
# The path is relative, so the file is written to the current working
# directory; index=False leaves the DataFrame index out of the file.
rent_data2_cleaned.to_parquet("cleaned_rent_data2.parquet", index=False)

In [63]:
# List the remaining column names alphabetically to make them easier to scan.
sorted(rent_data2_cleaned.columns)

['SA2_CODE21',
 'addressParts.displayAddress',
 'addressParts.displayType',
 'addressParts.postcode',
 'addressParts.stateAbbreviation',
 'addressParts.street',
 'addressParts.streetNumber',
 'addressParts.suburb',
 'addressParts.suburbId',
 'addressParts.unitNumber',
 'apmIdentifiers.suburbId',
 'bathrooms',
 'bedrooms',
 'buildingAreaSqm',
 'carspaces',
 'channel',
 'dateAvailable',
 'dateListed',
 'dateUpdated',
 'energyEfficiencyRating',
 'features',
 'geoLocation.latitude',
 'geoLocation.longitude',
 'highlights',
 'isNewDevelopment',
 'landAreaSqm',
 'numberOfDwellings',
 'price',
 'priceDetails.bond',
 'priceDetails.canDisplayPrice',
 'priceDetails.displayPrice',
 'priceDetails.price',
 'priceDetails.priceFrom',
 'priceDetails.pricePrefix',
 'priceDetails.priceTo',
 'propertyTypes',
 'rentalDetails.canDisplayPrice',
 'rentalDetails.leaseOutgoings',
 'rentalDetails.leasedDate',
 'rentalDetails.leasedMonths',
 'rentalDetails.leasedPrice',
 'rentalDetails.rentalMethod',
 'rentalDet

In [3]:
# Reload both cleaned datasets from disk.
# NOTE(review): rent_data_cleaned comes from 'preprocessed_rent_data.parquet',
# which is not the file name written by the earlier save cell
# ('cleaned_rent_data.parquet') — presumably produced by another step; confirm.
rent_data_cleaned = pd.read_parquet("/root/project-2-group-real-estate-industry-project-34/notebooks/preprocessed_rent_data.parquet")
rent_data2_cleaned = pd.read_parquet("/root/project-2-group-real-estate-industry-project-34/notebooks/cleaned_rent_data2.parquet")

In [4]:
# Inspect the columns of the reloaded rent_data2_cleaned frame.
rent_data2_cleaned.columns

Index(['propertyTypes', 'channel', 'bathrooms', 'bedrooms', 'carspaces',
       'dateUpdated', 'dateListed', 'features', 'isNewDevelopment',
       'addressParts.stateAbbreviation', 'addressParts.displayType',
       'addressParts.streetNumber', 'addressParts.unitNumber',
       'addressParts.street', 'addressParts.suburb', 'addressParts.suburbId',
       'addressParts.postcode', 'addressParts.displayAddress',
       'apmIdentifiers.suburbId', 'geoLocation.latitude',
       'geoLocation.longitude', 'priceDetails.price',
       'priceDetails.canDisplayPrice', 'priceDetails.displayPrice',
       'priceDetails.bond', 'rentalDetails.rentalMethod',
       'rentalDetails.source', 'rentalDetails.canDisplayPrice', 'landAreaSqm',
       'rentalDetails.leasedDate', 'dateAvailable', 'energyEfficiencyRating',
       'rentalDetails.leasedPrice', 'rentalDetails.leasedMonths',
       'priceDetails.pricePrefix', 'buildingAreaSqm', 'highlights',
       'rentalDetails.termOfLeaseFrom', 'rentalDetails.te

In [5]:
# Inspect the columns of the reloaded rent_data_cleaned frame.
rent_data_cleaned.columns

Index(['address', 'state', 'suburb', 'bedrooms', 'bathrooms', 'propertyTypes',
       'carspaces', 'date_listed', 'latitude', 'longitude',
       'is_new_development', 'price', 'propertyId', 'year', 'month', 'day',
       'SA2_CODE21'],
      dtype='object')

In [6]:
# Parse the listing date ('date_listed') to datetime. Note that the parsed
# value is stored under the column name 'dateAvailable'.
rent_data_cleaned['dateAvailable'] = pd.to_datetime(rent_data_cleaned['date_listed'])

# Derive the calendar components from the parsed date, one column each
for _part in ('year', 'month', 'day'):
    rent_data_cleaned[_part] = getattr(rent_data_cleaned['dateAvailable'].dt, _part)

# Show the parsed date next to its derived components
print(rent_data_cleaned[['dateAvailable', 'year', 'month', 'day']].head())

  dateAvailable  year  month  day
0    2009-05-08  2009      5    8
1    2009-05-08  2009      5    8
2    2009-05-08  2009      5    8
3    2009-05-11  2009      5   11
4    2009-05-11  2009      5   11


In [11]:
# Parse the listing timestamp. It is held in a local variable instead of being
# assigned to 'dateAvailable': that column already exists in this frame, and
# overwriting it with 'dateListed' values would destroy the real availability dates.
_listed_at = pd.to_datetime(rent_data2_cleaned['dateListed'])

# Create separate columns for the year, month, and day of listing
rent_data2_cleaned['year'] = _listed_at.dt.year
rent_data2_cleaned['month'] = _listed_at.dt.month
rent_data2_cleaned['day'] = _listed_at.dt.day

# Display the listing date next to its derived components
print(rent_data2_cleaned[['dateListed', 'year', 'month', 'day']].head())

              dateAvailable  year  month  day
0 2009-05-08 01:26:01+00:00  2009      5    8
1 2009-05-08 01:26:03+00:00  2009      5    8
2 2009-05-08 01:26:05+00:00  2009      5    8
3 2009-05-11 05:37:31+00:00  2009      5   11
4 2009-05-11 09:28:10+00:00  2009      5   11


In [6]:
# Collect the distinct listing years, in order of first appearance
unique_years = pd.unique(rent_data2_cleaned['year'])
print(f"Unique years in the dataset: {unique_years}")

Unique years in the dataset: [2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2024
 2023 2022 2004 2005 2006 2007 2008]


In [12]:
# Count listings per year in rent_data_cleaned; dropna=False also reports
# rows whose year is missing, if there are any.
year_counts = rent_data_cleaned['year'].value_counts(dropna=False)

print(year_counts)


year
2021    4026
2020    3920
2022    3239
2023    2792
2014    2567
2013    2534
2015    2378
2012    2098
2019    2061
2005    2002
2007    1779
2006    1773
2024    1747
2016    1658
2008    1618
2004     204
2017      70
2018       9
Name: count, dtype: int64


In [42]:
# Restrict both datasets to listings from January-March of 2022 and 2023
filtered_data = rent_data_cleaned.loc[
    rent_data_cleaned['month'].isin([1, 2, 3])
    & rent_data_cleaned['year'].isin([2022, 2023])
]

filtered_data2 = rent_data2_cleaned.loc[
    rent_data2_cleaned['month'].isin([1, 2, 3])
    & rent_data2_cleaned['year'].isin([2022, 2023])
]


In [43]:
# Reduce both datasets to the same three columns so they can be stacked;
# the second dataset stores its price under 'priceDetails.price'.
data1 = filtered_data.loc[:, ['SA2_CODE21', 'year', 'price']]
data2 = (
    filtered_data2
    .loc[:, ['SA2_CODE21', 'year', 'priceDetails.price']]
    .rename(columns={'priceDetails.price': 'price'})
)

In [44]:
# Stack the two datasets row-wise; ignore_index=True renumbers the rows 0..n-1.
# NOTE(review): no de-duplication is done here — if the two sources overlap,
# listings are counted twice in the averages below; confirm they are disjoint.
union_data = pd.concat([data1, data2], ignore_index=True)


In [45]:
# Group by 'SA2_CODE21' and 'year', then calculate the average price.
# groupby drops rows with a missing key by default, so listings that were not
# matched to an SA2 zone do not contribute.
grouped_data = union_data.groupby(['SA2_CODE21', 'year'])['price'].mean().reset_index()


In [46]:
# Preview the per-zone, per-year average prices.
grouped_data.head()

Unnamed: 0,SA2_CODE21,year,price
0,206011106,2022,400.0
1,206011106,2023,600.0
2,206011107,2022,315.0
3,206021112,2022,560.0
4,206031115,2022,400.0


In [47]:
# Pivot so each SA2 zone is one row, with its average price per year as columns
pivoted_data = grouped_data.pivot(index=['SA2_CODE21'], columns='year', values='price').reset_index()

# Percentage change in average price from 2022 to 2023. A zero 2022 average is
# masked to NaN first so the division cannot produce +/-inf, which the dropna
# below would not remove.
_price_2022 = pivoted_data[2022].where(pivoted_data[2022] != 0)
pivoted_data['price_increase'] = ((pivoted_data[2023] - _price_2022) / _price_2022) * 100

# Keep only zones with a computable change, i.e. with data for both years.
# Reassigning (rather than inplace=True) keeps the cell a plain expression chain.
pivoted_data = pivoted_data.dropna(subset=['price_increase'])

# Display the result
print(pivoted_data.head())


year SA2_CODE21        2022        2023  price_increase
0     206011106  400.000000  600.000000       50.000000
5     206041117  461.586207  620.000000       34.319438
6     206041118  481.403509  588.557692       22.258704
8     206041124  420.000000  400.000000       -4.761905
9     206041125  488.000000  547.222222       12.135701


In [48]:
# Attach the SA2 attributes and geometry to the price changes. The inner join
# keeps only zones present in both frames; the merged result is wrapped as a
# GeoDataFrame.
gdf = gpd.GeoDataFrame(
    pivoted_data.merge(sf2, how='inner', on='SA2_CODE21')
)

gdf.head()

Unnamed: 0,SA2_CODE21,2022,2023,price_increase,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,206011106,400.0,600.0,50.0,Brunswick East,0,No change,20601,Brunswick - Coburg,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,2.1682,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.97307 -37.76386, 144.9734 -37.76..."
1,206041117,461.586207,620.0,34.319438,Carlton,0,No change,20604,Melbourne City,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,1.8187,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.97488 -37.79794, 144.97477 -37.7..."
2,206041118,481.403509,588.557692,22.258704,Docklands,0,No change,20604,Melbourne City,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,2.444,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.94525 -37.81208, 144.94545 -37.8..."
3,206041124,420.0,400.0,-4.761905,Parkville,0,No change,20604,Melbourne City,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,4.0491,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.93897 -37.78156, 144.93874 -37.7..."
4,206041125,488.0,547.222222,12.135701,South Yarra - West,0,No change,20604,Melbourne City,206,Melbourne - Inner,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,1.5027,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((144.97455 -37.83467, 144.97445 -37.8..."


In [49]:
# Serialise one geometry per SA2 zone to a GeoJSON string for folium;
# 'SA2_CODE21' is kept so the map layers can key on properties.SA2_CODE21.
geoJSON = gdf[['SA2_CODE21', 'geometry']].drop_duplicates('SA2_CODE21').to_json()


In [39]:
import folium

In [55]:
# Build the table that feeds the maps: the mean price change per SA2 zone,
# with SA2_CODE21 turned back into an ordinary column
proportions = (
    pivoted_data[['SA2_CODE21', 'price_increase']]
    .groupby('SA2_CODE21')
    .agg({'price_increase': 'mean'})
    .reset_index()
)
proportions.head()

year,SA2_CODE21,price_increase
0,206011106,50.0
1,206041117,34.319438
2,206041118,22.258704
3,206041124,-4.761905
4,206041125,12.135701


In [60]:
import folium

# Step 1: label each zone by the sign of its price change.
# A change of exactly 0 (or NaN) falls into 'negative', since only x > 0 is 'positive'.
proportions['price_bin'] = proportions['price_increase'].apply(lambda x: 'positive' if x > 0 else 'negative')

# Define a simple color map for the bins
bin_colors = {
    'positive': 'red',    # Red for positive price changes
    'negative': 'blue'    # Blue for negative price changes
}

# Build the zone -> bin lookup once, so the style function does not re-index
# the DataFrame for every feature it styles.
_bin_by_code = dict(zip(proportions['SA2_CODE21'], proportions['price_bin']))

# Plot the map
m = folium.Map(location=[-37.83467, 144.97445], zoom_start=10)

# Create the choropleth map without specifying fill_color
c = folium.Choropleth(
    geo_data=geoJSON,  # geoJSON
    name='choropleth',  # name of plot
    data=proportions,  # data source (only the two columns below are read)
    columns=['SA2_CODE21', 'price_increase'],  # the columns required
    key_on='properties.SA2_CODE21',  # this is from the geoJSON's properties
    nan_fill_color='black',
    # price_increase compares 2023 against 2022, so label the legend accordingly
    legend_name='2022-2023 percentage price change'
)

# Add the choropleth to the map
c.add_to(m)


def _bin_style(feature):
    """Style one GeoJSON feature: fill by its zone's price bin, black when the zone has no data."""
    code = feature['properties']['SA2_CODE21']
    return {
        'fillColor': bin_colors[_bin_by_code[code]] if code in _bin_by_code else 'black',
        'color': 'black',
        'weight': 1,
        'fillOpacity': 0.7,
    }


# Step 2: Apply the bin colors
folium.GeoJson(
    geoJSON,
    style_function=_bin_style
).add_to(m)

# Display the map
m


In [51]:
# Plot the map
m = folium.Map(location=[-37.83467, 144.97445], zoom_start=10)

# Continuous choropleth of the percentage price change per SA2 zone.
# Refer to the folium documentation for more on plotting aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON, # geoJSON
    name='choropleth', # name of plot
    data=proportions, # data source (only the two columns below are read)
    columns=['SA2_CODE21','price_increase'], # the columns required
    key_on='properties.SA2_CODE21', # this is from the geoJSON's properties
    fill_color='PRGn', # color scheme
    nan_fill_color='black',
    # price_increase compares 2023 against 2022, so label the legend accordingly
    legend_name='2022-2023 percentage price change'
)

c.add_to(m)

m


In [27]:
# Write the current map to a standalone HTML file in the working directory.
# NOTE(review): the file name says 2021-2022, but 'price_increase' is computed
# from the 2022 and 2023 columns — consider renaming once nothing depends on
# this path.
m.save("price_change_2021-2022.html")