In [2]:
import os
import re

# Directories
json_dir = "../data/landing/"
json_dir_new = "../data/landing/new/"

# Function to extract agency IDs from filenames
def extract_agency_id(filename):
    # Assuming filenames follow the pattern "listings_agency_<agency_id>.json" or similar
    match = re.search(r'listings_agency_(\d+)', filename)
    if match:
        return match.group(1)
    return None

# List all JSON files in both directories
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
json_files_new = [f for f in os.listdir(json_dir_new) if f.endswith('.json')]

# Extract agency IDs from filenames in both directories
agency_ids_landing = set(filter(None, [extract_agency_id(f) for f in json_files]))
agency_ids_new = set(filter(None, [extract_agency_id(f) for f in json_files_new]))

# Find duplicates by checking for common agency IDs between the two directories
duplicate_agency_ids = agency_ids_landing.intersection(agency_ids_new)

# Output results
if duplicate_agency_ids:
    print(f"Duplicate agency IDs found: {duplicate_agency_ids}")
else:
    print("No duplicate agency IDs found.")


No duplicate agency IDs found.


In [1]:
import os
import pandas as pd
import json

# Function to load a JSON file into a DataFrame
def load_json_to_df(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.json_normalize(data)

# Directory containing the JSON files (Old Directory)
json_dir = "../data/landing/"

# List all JSON files in the old directory
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]

# Initialize list to store DataFrames for the old directory
dfs_old = []

# Loop through each JSON file in the old directory, load it, and append to dfs_old
for json_file in json_files:
    file_path = os.path.join(json_dir, json_file)
    df = load_json_to_df(file_path)
    
    # Check if the DataFrame is empty
    if not df.empty:
        dfs_old.append(df)
    else:
        print(f"Skipped empty DataFrame for file: {json_file}")

# Concatenate all DataFrames from the old directory into a single DataFrame
print(f"Concatenating {len(dfs_old)} DataFrames from the old directory")
compiled_df = pd.concat(dfs_old, ignore_index=True)


Concatenating 112 DataFrames from the old directory


In [2]:
print(compiled_df.count())

objective                                      583466
propertyTypes                                  583420
status                                         583466
saleMode                                       583466
channel                                        583466
                                                ...  
advertiserIdentifiers.conjunctionContactIds         6
advertiserIdentifiers.conjunctionAgentIds           6
saleDetails.tenderDetails.tenderEndDate            18
saleDetails.tenantDetails.leaseEndDate              3
devProjectId                                        6
Length: 99, dtype: int64


In [None]:
# Function to recursively convert unhashable types (dict, list) to hashable types (tuple)
def make_hashable(item):
    if isinstance(item, dict):
        return tuple((key, make_hashable(value)) for key, value in sorted(item.items()))
    elif isinstance(item, list):
        return tuple(make_hashable(i) for i in item)
    else:
        return item

# Load the JSON file into a DataFrame
pdf = compiled_df

# Apply the recursive conversion to all elements in the DataFrame
for col in pdf.columns:
    pdf[col] = pdf[col].apply(make_hashable)

# Now you can drop duplicates
pdf_cleaned = pdf.drop_duplicates()

# Display the cleaned DataFrame
print(pdf_cleaned.count())

In [3]:
filtered_df = compiled_df[compiled_df['objective'] == 'rent']


In [4]:
filtered_df.count()

objective                                            136047
propertyTypes                                        135951
status                                               136047
saleMode                                             136047
channel                                              136047
                                                      ...  
saleDetails.tenantDetails.tenantInfoTermOfLeaseTo         0
saleDetails.tenantDetails.leaseStartDate                  0
saleDetails.tenantDetails.leaseEndDate                    0
advertiserIdentifiers.conjunctionContactIds               0
advertiserIdentifiers.conjunctionAgentIds                 0
Length: 98, dtype: int64

In [4]:
filtered_df.count()

objective                                      389071
propertyTypes                                  389048
status                                         389071
saleMode                                       389071
channel                                        389071
                                                ...  
advertiserIdentifiers.conjunctionContactIds         4
advertiserIdentifiers.conjunctionAgentIds           4
saleDetails.tenderDetails.tenderEndDate             0
saleDetails.tenantDetails.leaseEndDate              0
devProjectId                                        0
Length: 99, dtype: int64

Filtered for only apartments and removed carparks as they skew the data

In [None]:
price_filtered = filtered_df[filtered_df['priceDetails.price'] <= 200]

# Select relevant columns for inspection
columns_of_interest = ['priceDetails.price', 'propertyTypes', 'headline', 'description', 'addressParts.displayAddress', 'dateListed']

# Display the relevant information for these listings
print(price_filtered.count())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# Plotting
plt.figure(figsize=(10, 6))
sns.histplot(filtered_df['priceDetails.price'], kde=False)  # Let seaborn choose the number of bins automatically
# Set x-axis limits
plt.xlim(-500, 1400)
plt.title('Distribution of Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Step 1: Remove the dollar sign and commas
filtered_df['priceDetails.displayPrice'] = filtered_df['priceDetails.displayPrice'].str.replace('$', '', regex=False)  # Remove dollar sign
filtered_df['priceDetails.displayPrice'] = filtered_df['priceDetails.displayPrice'].str.replace(',', '', regex=False)  # Remove commas

# Step 2: Convert the cleaned strings to floats
filtered_df['priceDetails.displayPrice'] = pd.to_numeric(filtered_df['priceDetails.displayPrice'], errors='coerce')  # Convert to float

In [None]:
# Check for missing values in the 'priceDetails.price' column
missing_values = filtered_df['priceDetails.displayPrice'].isna().sum()
print(f"Number of missing values: {missing_values}")

In [5]:
df = filtered_df

# Convert 'dateListed' to datetime
df['dateListed'] = pd.to_datetime(df['dateListed'])

# Extract year from 'dateListed'
df['year'] = df['dateListed'].dt.year

# Count number of listings per year
listings_per_year = df.groupby('year').size().reset_index(name='number_of_listings')

# Print results
print(listings_per_year)

    year  number_of_listings
0   2004                1014
1   2005                9625
2   2006                7955
3   2007                8373
4   2008                9470
5   2009               10318
6   2010               12017
7   2011               14122
8   2012               18799
9   2013               20492
10  2014               20056
11  2015               20848
12  2016               20190
13  2017               19754
14  2018               21910
15  2019               23525
16  2020               32077
17  2021               32010
18  2022               30950
19  2023               31451
20  2024               24115


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dateListed'] = pd.to_datetime(df['dateListed'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['dateListed'].dt.year


In [6]:
filtered_df = filtered_df[filtered_df['priceDetails.price'] >= 200]
filtered_df.count()

objective                                    152345
propertyTypes                                152338
status                                       152345
saleMode                                     152345
channel                                      152345
                                              ...  
advertiserIdentifiers.conjunctionAgentIds         0
saleDetails.tenderDetails.tenderEndDate           0
saleDetails.tenantDetails.leaseEndDate            0
devProjectId                                      0
year                                         152345
Length: 100, dtype: int64

In [7]:
# Step 3: Filter out unrealistic numbers of bathrooms and bedrooms
filtered_df = filtered_df[(filtered_df['bathrooms'] >= 1) & (filtered_df['bathrooms'] <= 10)]
filtered_df = filtered_df[(filtered_df['bedrooms'] >= 1) & (filtered_df['bedrooms'] <= 10)]

In [8]:
filtered_df.count()

objective                                    151173
propertyTypes                                151166
status                                       151173
saleMode                                     151173
channel                                      151173
                                              ...  
advertiserIdentifiers.conjunctionAgentIds         0
saleDetails.tenderDetails.tenderEndDate           0
saleDetails.tenantDetails.leaseEndDate            0
devProjectId                                      0
year                                         151173
Length: 100, dtype: int64

In [9]:
df = filtered_df

# Convert 'dateListed' to datetime
df['dateListed'] = pd.to_datetime(df['dateListed'])

# Extract year from 'dateListed'
df['year'] = df['dateListed'].dt.year

# Count number of listings per year
listings_per_year = df.groupby('year').size().reset_index(name='number_of_listings')

# Print results
print(listings_per_year)

    year  number_of_listings
0   2004                 757
1   2005                7667
2   2006                6475
3   2007                6885
4   2008                7839
5   2009                5353
6   2010                3311
7   2011                4612
8   2012                8694
9   2013                8943
10  2014                7623
11  2015                7996
12  2016                5492
13  2017                1710
14  2018                3536
15  2019               11024
16  2020               13583
17  2021               10608
18  2022                9959
19  2023               10577
20  2024                8529


In [10]:
filtered_df.to_parquet('../data/raw/filtered_rental_listings_old.parquet')

: 

In [5]:
import pandas as pd

# Load the Parquet file
df = pd.read_parquet('../data/raw/filtered_rental_listings.parquet')

# Select only the important columns
selected_columns = [
    'addressParts.displayAddress',  # Full address
    'addressParts.stateAbbreviation', 
    'addressParts.suburb', 
    'bedrooms',                     # Number of bedrooms
    'bathrooms',                    # Number of bathrooms
    'propertyTypes',
    'carspaces',                    # Number of car spaces
    'dateListed',                   # Listing date
    'geoLocation.latitude',         # Latitude
    'geoLocation.longitude',        # Longitude
    'isNewDevelopment',             # New development flag
    'priceDetails.price',    # Price of the listing
    'propertyId',                    # Property ID
    'rentalDetails.leasedDate'      # Leased date
]

df.head()

# Keep only the selected columns
selected_df = df[selected_columns]

selected_df = selected_df.rename(columns={
    'addressParts.displayAddress': 'address',
    'addressParts.stateAbbreviation': 'state',
    'addressParts.suburb': 'suburb',
    'dateListed': 'date_listed',
    'geoLocation.latitude': 'latitude',
    'geoLocation.longitude': 'longitude',
    'isNewDevelopment': 'is_new_development',
    'priceDetails.price': 'price',
    'rentalDetails.leasedDate': 'leased_date'
})



In [6]:
selected_df = selected_df[selected_df['state'] == "vic"]
selected_df.count()

address               147647
state                 147647
suburb                147647
bedrooms              147647
bathrooms             147647
propertyTypes         147640
carspaces             147647
date_listed           147647
latitude              142701
longitude             142701
is_new_development    147647
price                 147647
propertyId             93827
leased_date            52293
dtype: int64

: 

In [None]:
# Ensure 'dateListed' is in datetime format
selected_df['date_listed'] = pd.to_datetime(selected_df['date_listed'])

# Extract only the date (without time)
selected_df['date_listed'] = selected_df['date_listed'].dt.date

# Add columns for year, month, and day
selected_df['year'] = pd.DatetimeIndex(selected_df['date_listed']).year
selected_df['month'] = pd.DatetimeIndex(selected_df['date_listed']).month
selected_df['day'] = pd.DatetimeIndex(selected_df['date_listed']).day

# Display the first few rows to verify
print(selected_df[['date_listed', 'year', 'month', 'day']].head())

In [1]:
# Convert 'date_listed' and 'leased_date' to datetime
selected_df['date_listed'] = pd.to_datetime(selected_df['date_listed'], errors='coerce')
selected_df['leased_date'] = pd.to_datetime(selected_df['leased_date'], errors='coerce')

# Calculate the number of days the listing was on the market (if leased_date is present)
selected_df['days_on_market'] = (selected_df['leased_date'] - selected_df['date_listed']).dt.days

NameError: name 'pd' is not defined

In [None]:
selected_df.count()

In [None]:
# Optionally, you can save the filtered DataFrame back as Parquet
selected_df.to_parquet('../data/curated/feature_selected_rental_listings.parquet')