This is the second part of the rental-data preprocessing. It follows the same steps as the previous notebook, applied to the JSON files that have not been processed yet.

Combine all the JSONs from the second directory.

In [1]:
import os
import pandas as pd
import json

# Function to load a JSON file into a DataFrame
def load_json_to_df(file_path):
    """Read one JSON file and flatten it into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.json_normalize`` on the parsed JSON, so nested
        objects become dot-separated columns.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.json_normalize(data)

# Directory containing the JSON files (New Directory)
json_dir_new = "../data/landing/new/"

# List all JSON files in the new directory.
# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of the compiled DataFrame reproducible between runs.
json_files_new = sorted(f for f in os.listdir(json_dir_new) if f.endswith('.json'))

# Initialize list to store DataFrames for the new directory
dfs_new = []

# Loop through each JSON file in the new directory, load it, and append to dfs_new
for json_file in json_files_new:
    file_path = os.path.join(json_dir_new, json_file)
    df = load_json_to_df(file_path)

    # Empty frames add nothing to the result, so skip them and report the file
    if not df.empty:
        dfs_new.append(df)
    else:
        print(f"Skipped empty DataFrame for file: {json_file}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that names the directory instead.
if not dfs_new:
    raise ValueError(f"No non-empty JSON files found in {json_dir_new}")

# Concatenate all DataFrames from the new directory into a single DataFrame
print(f"Concatenating {len(dfs_new)} DataFrames from the new directory")
compiled_df = pd.concat(dfs_new, ignore_index=True)




Concatenating 112 DataFrames from the old directory


In [2]:
# Non-null value count per column of the compiled frame. Left as the cell's
# last expression so the notebook displays it, matching the later count cells.
compiled_df.count()

objective                                      583466
propertyTypes                                  583420
status                                         583466
saleMode                                       583466
channel                                        583466
                                                ...  
advertiserIdentifiers.conjunctionContactIds         6
advertiserIdentifiers.conjunctionAgentIds           6
saleDetails.tenderDetails.tenderEndDate            18
saleDetails.tenantDetails.leaseEndDate              3
devProjectId                                        6
Length: 99, dtype: int64


Drop any duplicate rows

In [None]:
# Function to recursively convert unhashable types (dict, list) to hashable types (tuple)
def make_hashable(item):
    """Return a hashable equivalent of ``item``.

    Lists become tuples of converted elements, dicts become tuples of
    ``(key, converted value)`` pairs ordered by sorting the dict's items, and
    any other value is returned unchanged.
    """
    if isinstance(item, list):
        return tuple(map(make_hashable, item))
    if not isinstance(item, dict):
        return item
    ordered_pairs = sorted(item.items())
    return tuple((name, make_hashable(payload)) for name, payload in ordered_pairs)

# Work on a copy of the compiled frame. A plain `pdf = compiled_df` is only an
# alias, so the conversion below would silently rewrite compiled_df as well.
pdf = compiled_df.copy()

# Apply the recursive conversion so every cell is hashable; drop_duplicates
# cannot hash dict or list values. Only object columns can hold dicts/lists,
# so other dtypes are left as they are.
for col in pdf.columns:
    if pdf[col].dtype == object:
        pdf[col] = pdf[col].apply(make_hashable)

# Now you can drop duplicates
pdf_cleaned = pdf.drop_duplicates()

# Non-null value count per column after de-duplication
pdf_cleaned.count()

Filter for only rows that are rental data and not sales

In [3]:
# Filter the de-duplicated frame (pdf_cleaned). Filtering compiled_df here
# would discard the result of the duplicate-removal step above.
# .copy() gives an independent frame, so later column assignments do not act
# on a slice of the parent and trigger SettingWithCopyWarning.
filtered_df = pdf_cleaned[pdf_cleaned['objective'] == 'rent'].copy()


In [4]:
# Non-null value count per column after keeping only the 'rent' rows
filtered_df.count()

objective                                      389071
propertyTypes                                  389048
status                                         389071
saleMode                                       389071
channel                                        389071
                                                ...  
advertiserIdentifiers.conjunctionContactIds         4
advertiserIdentifiers.conjunctionAgentIds           4
saleDetails.tenderDetails.tenderEndDate             0
saleDetails.tenantDetails.leaseEndDate              0
devProjectId                                        0
Length: 99, dtype: int64

In [None]:
# Step 1: Remove the dollar sign and commas
# Step 2: Convert the cleaned strings to floats
# Both steps run inside a single assign(), which returns a new frame instead of
# writing a column onto a filtered slice three times (SettingWithCopyWarning).
# Values that are still not numeric after stripping '$' and ',' become NaN
# because of errors='coerce'.
# NOTE(review): the price filter later in this notebook reads
# 'priceDetails.price', not this column - confirm this conversion is still needed.
filtered_df = filtered_df.assign(**{
    'priceDetails.displayPrice': lambda frame: pd.to_numeric(
        frame['priceDetails.displayPrice'].str.replace(r'[$,]', '', regex=True),
        errors='coerce',
    )
})

Keep only listings priced at 200 AUD or more, as those below were found to mostly be carparks

In [6]:
# Keep rows whose 'priceDetails.price' is at least 200, then show the
# non-null value count per column of what remains.
filtered_df = filtered_df.loc[lambda frame: frame['priceDetails.price'].ge(200)]
filtered_df.count()

objective                                    152345
propertyTypes                                152338
status                                       152345
saleMode                                     152345
channel                                      152345
                                              ...  
advertiserIdentifiers.conjunctionAgentIds         0
saleDetails.tenderDetails.tenderEndDate           0
saleDetails.tenantDetails.leaseEndDate            0
devProjectId                                      0
year                                         152345
Length: 100, dtype: int64

Only keep listings with a reasonable number of bedrooms and bathrooms (between 1 and 10 of each)

In [7]:
# Step 3: Filter out unrealistic numbers of bathrooms and bedrooms
# Series.between is inclusive on both ends by default, so each test is 1 <= n <= 10.
filtered_df = filtered_df.loc[
    lambda frame: frame['bathrooms'].between(1, 10) & frame['bedrooms'].between(1, 10)
]

In [8]:
# Non-null value count per column after the bedroom/bathroom range filter
filtered_df.count()

objective                                    151173
propertyTypes                                151166
status                                       151173
saleMode                                     151173
channel                                      151173
                                              ...  
advertiserIdentifiers.conjunctionAgentIds         0
saleDetails.tenderDetails.tenderEndDate           0
saleDetails.tenantDetails.leaseEndDate            0
devProjectId                                      0
year                                         151173
Length: 100, dtype: int64

In [9]:
# Convert 'dateListed' to datetime and extract the listing year from it.
# assign() builds a new frame rather than writing columns through an alias
# onto a filtered slice; later keyword arguments see the columns set by earlier
# ones, so 'year' is computed from the already-converted 'dateListed'.
filtered_df = filtered_df.assign(
    dateListed=lambda frame: pd.to_datetime(frame['dateListed']),
    year=lambda frame: frame['dateListed'].dt.year,
)

# Keep the short name bound to the same frame for any later cell that uses it
df = filtered_df

# Count number of listings per year
listings_per_year = df.groupby('year').size().reset_index(name='number_of_listings')

# Show the per-year counts as the cell output
listings_per_year

    year  number_of_listings
0   2004                 757
1   2005                7667
2   2006                6475
3   2007                6885
4   2008                7839
5   2009                5353
6   2010                3311
7   2011                4612
8   2012                8694
9   2013                8943
10  2014                7623
11  2015                7996
12  2016                5492
13  2017                1710
14  2018                3536
15  2019               11024
16  2020               13583
17  2021               10608
18  2022                9959
19  2023               10577
20  2024                8529


In [10]:
# Create the output directory if it is missing, so the write does not fail on a
# fresh checkout where it has not been created yet.
os.makedirs('../data/raw', exist_ok=True)

# Persist the filtered rental listings for the next stage
filtered_df.to_parquet('../data/raw/filtered_rental_listings_2.parquet')

: 