In [3]:
import os
import pandas as pd
import json

# Function to load a JSON file into a DataFrame
def load_json_to_df(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.json_normalize(data)

# Directory containing the JSON files (Old Directory)
json_dir = "../data/landing/"

# List all JSON files in the old directory
json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]

# Initialize list to store DataFrames for the old directory
dfs_old = []

# Loop through each JSON file in the old directory, load it, and append to dfs_old
for json_file in json_files:
    file_path = os.path.join(json_dir, json_file)
    df = load_json_to_df(file_path)
    
    # Check if the DataFrame is empty
    if not df.empty:
        dfs_old.append(df)
    else:
        print(f"Skipped empty DataFrame for file: {json_file}")

# Concatenate all DataFrames from the old directory into a single DataFrame
print(f"Concatenating {len(dfs_old)} DataFrames from the old directory")
df1 = pd.concat(dfs_old, ignore_index=True)


KeyboardInterrupt: 

In [1]:
import os
import pandas as pd
import json

# Function to load a JSON file into a DataFrame
def load_json_to_df(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return pd.json_normalize(data)

# Directory containing the JSON files (New Directory)
json_dir_new = "../data/landing/new/"

# List all JSON files in the new directory
json_files_new = [f for f in os.listdir(json_dir_new) if f.endswith('.json')]

# Initialize list to store DataFrames for the new directory
dfs_new = []

# Loop through each JSON file in the new directory, load it, and append to dfs_new
for json_file in json_files_new:
    file_path = os.path.join(json_dir_new, json_file)
    df = load_json_to_df(file_path)
    
    # Check if the DataFrame is empty
    if not df.empty:
        dfs_new.append(df)
    else:
        print(f"Skipped empty DataFrame for file: {json_file}")

# Concatenate all DataFrames from the new directory into a single DataFrame
print(f"Concatenating {len(dfs_new)} DataFrames from the new directory")
compiled_df = pd.concat(dfs_new, ignore_index=True)




Concatenating 77 DataFrames from the new directory


In [2]:
print(compiled_df.count())

objective                                              319378
propertyTypes                                          319193
status                                                 319378
saleMode                                               319378
channel                                                319378
                                                        ...  
saleDetails.tenantDetails.tenantInfoTermOfLeaseFrom        15
saleDetails.tenantDetails.tenantInfoTermOfLeaseTo          15
saleDetails.tenantDetails.leaseStartDate                    5
saleDetails.tenantDetails.leaseEndDate                      4
devProjectId                                                1
Length: 97, dtype: int64


In [None]:
# Function to recursively convert unhashable types (dict, list) to hashable types (tuple)
def make_hashable(item):
    if isinstance(item, dict):
        return tuple((key, make_hashable(value)) for key, value in sorted(item.items()))
    elif isinstance(item, list):
        return tuple(make_hashable(i) for i in item)
    else:
        return item

# Load the JSON file into a DataFrame
pdf = compiled_df

# Apply the recursive conversion to all elements in the DataFrame
for col in pdf.columns:
    pdf[col] = pdf[col].apply(make_hashable)

# Now you can drop duplicates
pdf_cleaned = pdf.drop_duplicates()

# Display the cleaned DataFrame
print(pdf_cleaned.count())

In [3]:
filtered_df = compiled_df[compiled_df['objective'] == 'rent']


In [4]:
filtered_df.count()

objective                                              149196
propertyTypes                                          149100
status                                                 149196
saleMode                                               149196
channel                                                149196
                                                        ...  
saleDetails.tenantDetails.tenantInfoTermOfLeaseFrom         0
saleDetails.tenantDetails.tenantInfoTermOfLeaseTo           0
saleDetails.tenantDetails.leaseStartDate                    0
saleDetails.tenantDetails.leaseEndDate                      0
devProjectId                                                0
Length: 97, dtype: int64

In [None]:
filtered_df.count()

Filtered for only apartments and removed carparks as they skew the data

In [None]:
price_filtered = filtered_df[filtered_df['priceDetails.price'] <= 200]

# Select relevant columns for inspection
columns_of_interest = ['priceDetails.price', 'propertyTypes', 'headline', 'description', 'addressParts.displayAddress', 'dateListed']

# Display the relevant information for these listings
print(price_filtered.count())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# Plotting
plt.figure(figsize=(10, 6))
sns.histplot(filtered_df['priceDetails.price'], kde=False)  # Let seaborn choose the number of bins automatically
# Set x-axis limits
plt.xlim(-500, 1400)
plt.title('Distribution of Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Step 1: Remove the dollar sign and commas
filtered_df['priceDetails.displayPrice'] = filtered_df['priceDetails.displayPrice'].str.replace('$', '', regex=False)  # Remove dollar sign
filtered_df['priceDetails.displayPrice'] = filtered_df['priceDetails.displayPrice'].str.replace(',', '', regex=False)  # Remove commas

# Step 2: Convert the cleaned strings to floats
filtered_df['priceDetails.displayPrice'] = pd.to_numeric(filtered_df['priceDetails.displayPrice'], errors='coerce')  # Convert to float

In [None]:
# Check for missing values in the 'priceDetails.price' column
missing_values = filtered_df['priceDetails.displayPrice'].isna().sum()
print(f"Number of missing values: {missing_values}")

In [5]:
df = filtered_df

# Convert 'dateListed' to datetime
df['dateListed'] = pd.to_datetime(df['dateListed'])

# Extract year from 'dateListed'
df['year'] = df['dateListed'].dt.year

# Count number of listings per year
listings_per_year = df.groupby('year').size().reset_index(name='number_of_listings')

# Print results
print(listings_per_year)

    year  number_of_listings
0   2004                  51
1   2005                 877
2   2006                1275
3   2007                2065
4   2008                2226
5   2009                2815
6   2010                3303
7   2011                4565
8   2012                7433
9   2013                8272
10  2014                6813
11  2015                6735
12  2016                7760
13  2017                8511
14  2018                8034
15  2019               10151
16  2020               10566
17  2021                9705
18  2022               11254
19  2023               13118
20  2024               10518


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dateListed'] = pd.to_datetime(df['dateListed'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['dateListed'].dt.year


In [5]:
filtered_df = filtered_df[filtered_df['priceDetails.price'] >= 200]
filtered_df.count()

objective                                              49282
propertyTypes                                          49279
status                                                 49282
saleMode                                               49282
channel                                                49282
                                                       ...  
saleDetails.tenantDetails.tenantInfoTermOfLeaseFrom        0
saleDetails.tenantDetails.tenantInfoTermOfLeaseTo          0
saleDetails.tenantDetails.leaseStartDate                   0
saleDetails.tenantDetails.leaseEndDate                     0
devProjectId                                               0
Length: 97, dtype: int64

In [6]:
# Step 3: Filter out unrealistic numbers of bathrooms and bedrooms
filtered_df = filtered_df[(filtered_df['bathrooms'] >= 1) & (filtered_df['bathrooms'] <= 10)]
filtered_df = filtered_df[(filtered_df['bedrooms'] >= 1) & (filtered_df['bedrooms'] <= 10)]

In [8]:
filtered_df.count()

objective                                      48230
propertyTypes                                  48226
status                                         48230
saleMode                                       48230
channel                                        48230
                                               ...  
saleDetails.tenantDetails.leaseStartDate           0
saleDetails.tenantDetails.leaseEndDate             0
advertiserIdentifiers.conjunctionContactIds        0
advertiserIdentifiers.conjunctionAgentIds          0
year                                           48230
Length: 99, dtype: int64

In [9]:
df = filtered_df

# Convert 'dateListed' to datetime
df['dateListed'] = pd.to_datetime(df['dateListed'])

# Extract year from 'dateListed'
df['year'] = df['dateListed'].dt.year

# Count number of listings per year
listings_per_year = df.groupby('year').size().reset_index(name='number_of_listings')

# Print results
print(listings_per_year)

    year  number_of_listings
0   2004                  20
1   2005                 437
2   2006                 787
3   2007                1377
4   2008                1618
5   2009                1827
6   2010                 780
7   2011                1399
8   2012                3269
9   2013                3867
10  2014                2472
11  2015                2687
12  2016                2630
13  2017                1845
14  2018                1297
15  2019                5140
16  2020                4001
17  2021                2907
18  2022                3278
19  2023                3630
20  2024                2962


In [7]:
filtered_df.to_parquet('../data/raw/filtered_rental_listings_new.parquet')