In [1]:
import pandas as pd

# Load the population data
population_df = pd.read_csv('../data/curated/forecasted_populations.csv')

# Extract only the relevant columns for 2025, 2026, and 2027
population_df = population_df[['Code', '2025_Forecast', '2026_Forecast', '2027_Forecast']]

# Rename columns for easier merging
population_df.columns = ['SA2_CODE21', 'pop_2025', 'pop_2026', 'pop_2027']

In [2]:
# Load the population data
crime_df = pd.read_parquet('../data/curated/crime_data_with_predictions.parquet')

# Extract only the relevant columns for 2025, 2026, and 2027
crime_df = crime_df[['SA2_ID', '2025', '2026', '2027']]

# Rename columns for easier merging
crime_df.columns = ['SA2_CODE21', 'crime_2025', 'crime_2026', 'crime_2027']

crime_df['SA2_CODE21'] = crime_df['SA2_CODE21'].astype('float')


In [3]:
crime_df.head()

Unnamed: 0,SA2_CODE21,crime_2025,crime_2026,crime_2027
0,206011106.0,820.581478,843.299306,866.763845
1,206011107.0,832.314597,832.694978,833.828636
2,206011109.0,587.676228,587.717975,588.461329
3,206011495.0,751.259716,751.603053,752.626311
4,206011496.0,757.494707,757.840894,758.872643


In [4]:
population_df.head()

Unnamed: 0,SA2_CODE21,pop_2025,pop_2026,pop_2027
0,206011106,14477,14871,15264
1,206011107,14684,14684,14684
2,206011109,10368,10364,10363
3,206011495,13254,13254,13254
4,206011496,13364,13364,13364


In [5]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load your training data
df = pd.read_parquet('../data/curated/model_data.parquet')  # Replace with your actual data loading method

# Calculate average distances and other relevant statistics for each SA2 code
sa2_stats = df.groupby('SA2_CODE21').agg({
    'distance_to_bus_stop': 'mean',
    'distance_to_train_stop': 'mean',
    'distance_to_tram_stop': 'mean',
    'distance_to_hospital': 'mean',
    'distance_to_mall': 'mean',
    'distance_to_park': 'mean',
    'distance_to_supermarket': 'mean',
    'distance_to_CBD': 'mean',
    'distance_to_school': 'mean',
    'distance_to_uni': 'mean',
}).reset_index()

# Define realistic feature combinations
# Each tuple represents (bedrooms, bathrooms, carspaces)
valid_combinations = [
    (1, 1, 1), (1, 1, 0),
    (2, 1, 1), (2, 1, 0), (2, 2, 1), (2, 2, 0),
    (3, 1, 1), (3, 1, 0), (3, 2, 1), (3, 2, 0)
]

# Years for prediction
years = [2025, 2026, 2027]

# Create an empty DataFrame to store the test data
test_data = pd.DataFrame()

# Define the reference date
reference_date = pd.Timestamp('2011-01-01')

# Define probabilities for is_furnished
furnished_probs = [0.5, 0.5]

crime_df['SA2_CODE21'] = crime_df['SA2_CODE21'].astype('int')
sa2_stats['SA2_CODE21'] = sa2_stats['SA2_CODE21'].astype('int')
population_df['SA2_CODE21'] = pd.to_numeric(population_df['SA2_CODE21'], errors='coerce')

# Create the test data
for sa2_code in sa2_stats['SA2_CODE21']:
    for bedrooms, bathrooms, carspaces in valid_combinations:
        for year in years:
            # Get the average distances for the current SA2 code
            avg_distances = sa2_stats[sa2_stats['SA2_CODE21'] == sa2_code].iloc[0]

            # Define the middle of the year (July 1)
            middle_of_year = datetime(year, 7, 1)

            # Calculate the number of days since the reference date
            date_numeric = (middle_of_year - reference_date).days

            # Generate is_furnished with 50/50 chance
            is_furnished = 0

            # Create bed_bath_interaction feature
            bed_bath_interaction = bedrooms * bathrooms
            

            # Create a DataFrame for each combination
            temp_df = pd.DataFrame({
                'SA2_CODE21': [sa2_code],
                'bedrooms': [bedrooms],
                'bathrooms': [bathrooms],
                'year': [year],
                'distance_to_bus_stop': [avg_distances['distance_to_bus_stop']],
                'distance_to_train_stop': [avg_distances['distance_to_train_stop']],
                'distance_to_tram_stop': [avg_distances['distance_to_tram_stop']],
                'distance_to_hospital': [avg_distances['distance_to_hospital']],
                'distance_to_mall': [avg_distances['distance_to_mall']],
                'distance_to_park': [avg_distances['distance_to_park']],
                'distance_to_supermarket': [avg_distances['distance_to_supermarket']],
                'distance_to_CBD': [avg_distances['distance_to_CBD']],
                'distance_to_school': [avg_distances['distance_to_school']],
                'distance_to_uni': [avg_distances['distance_to_uni']],
                'population': [population_df.loc[population_df['SA2_CODE21'] == sa2_code, f'pop_{year}'].values[0]],
                'date_numeric': [date_numeric],
                'bed_bath_interaction': [bed_bath_interaction],
                'carspaces': [carspaces],
                'is_furnished': [is_furnished],
                'crime': [crime_df.loc[crime_df['SA2_CODE21'] == sa2_code, f'crime_{year}'].values[0]],
            })

            # Append to the main DataFrame
            test_data = pd.concat([test_data, temp_df], ignore_index=True)

# Save to CSV or use directly
test_data.to_csv('../data/curated/test_data.csv', index=False)

print("Test data generation complete.")
print(test_data.head())


Test data generation complete.
   SA2_CODE21  bedrooms  bathrooms  year  distance_to_bus_stop  \
0   201011006         1          1  2025              0.741864   
1   201011006         1          1  2026              0.741864   
2   201011006         1          1  2027              0.741864   
3   201011006         1          1  2025              0.741864   
4   201011006         1          1  2026              0.741864   

   distance_to_train_stop  distance_to_tram_stop  distance_to_hospital  \
0                 0.91511               1.091632              0.036647   
1                 0.91511               1.091632              0.036647   
2                 0.91511               1.091632              0.036647   
3                 0.91511               1.091632              0.036647   
4                 0.91511               1.091632              0.036647   

   distance_to_mall  distance_to_park  distance_to_supermarket  \
0          0.007196           0.00017                 0.00823

In [6]:
print(crime_df.dtypes)  # Check data types in crime_df
print(type(sa2_code))   # Check the type of sa2_code
print(population_df.dtypes) 

SA2_CODE21      int64
crime_2025    float64
crime_2026    float64
crime_2027    float64
dtype: object
<class 'int'>
SA2_CODE21    float64
pop_2025        int64
pop_2026        int64
pop_2027        int64
dtype: object


In [7]:



# Print the number of unique SA2_CODE21 values in each DataFrame
print(f"Number of unique SA2_CODE21 in sa2_stats: {sa2_stats['SA2_CODE21'].nunique()}")
print(f"Number of unique SA2_CODE21 in crime_df: {crime_df['SA2_CODE21'].nunique()}")

# Check the intersection of SA2_CODE21 values
matching_sa2 = set(sa2_stats['SA2_CODE21']).intersection(set(crime_df['SA2_CODE21']))
print(f"Number of matching SA2_CODE21 values: {len(matching_sa2)}")

# Check the SA2_CODE21 values that are in sa2_stats but not in crime_df
missing_in_crime = set(sa2_stats['SA2_CODE21']).difference(set(crime_df['SA2_CODE21']))
print(f"Number of SA2_CODE21 values in sa2_stats but not in crime_df: {len(missing_in_crime)}")

# Check the SA2_CODE21 values that are in crime_df but not in sa2_stats
missing_in_sa2_stats = set(crime_df['SA2_CODE21']).difference(set(sa2_stats['SA2_CODE21']))
print(f"Number of SA2_CODE21 values in crime_df but not in sa2_stats: {len(missing_in_sa2_stats)}")

# Optionally, print the actual missing values if needed
print("SA2_CODE21 values in sa2_stats but not in crime_df:", missing_in_crime)
print("SA2_CODE21 values in crime_df but not in sa2_stats:", missing_in_sa2_stats)

Number of unique SA2_CODE21 in sa2_stats: 384
Number of unique SA2_CODE21 in crime_df: 603
Number of matching SA2_CODE21 values: 384
Number of SA2_CODE21 values in sa2_stats but not in crime_df: 0
Number of SA2_CODE21 values in crime_df but not in sa2_stats: 219
SA2_CODE21 values in sa2_stats but not in crime_df: set()
SA2_CODE21 values in crime_df but not in sa2_stats: {204031491, 204031492, 21001, 21002, 21003, 21004, 21005, 203031049, 20501, 20502, 20503, 20504, 20505, 209031210, 204011054, 204011055, 204011056, 204011057, 204011058, 204011059, 204011060, 204011061, 204011062, 215011386, 215011388, 215011389, 215011390, 215011391, 215011392, 215011393, 215011394, 217041477, 217041478, 217041479, 217041480, 201021009, 201021010, 201021011, 201021012, 210011227, 204031068, 204031069, 204031070, 204031071, 204031072, 21601, 21602, 204031075, 21603, 204031073, 215031400, 215031401, 215031402, 215031403, 215031404, 21101, 21102, 21103, 21104, 21105, 215031405, 20601, 20602, 20603, 20604,

In [8]:
# Print the number of unique SA2_CODE21 values in each DataFrame
print(f"Number of unique SA2_CODE21 in sa2_stats: {sa2_stats['SA2_CODE21'].nunique()}")
print(f"Number of unique SA2_CODE21 in population_df: {population_df['SA2_CODE21'].nunique()}")

# Check the intersection of SA2_CODE21 values
matching_sa2 = set(sa2_stats['SA2_CODE21']).intersection(set(population_df['SA2_CODE21']))
print(f"Number of matching SA2_CODE21 values: {len(matching_sa2)}")

# Check the SA2_CODE21 values that are in sa2_stats but not in population_df
missing_in_population = set(sa2_stats['SA2_CODE21']).difference(set(population_df['SA2_CODE21']))
print(f"Number of SA2_CODE21 values in sa2_stats but not in population_df: {len(missing_in_population)}")

# Check the SA2_CODE21 values that are in population_df but not in sa2_stats
missing_in_sa2_stats = set(population_df['SA2_CODE21']).difference(set(sa2_stats['SA2_CODE21']))
print(f"Number of SA2_CODE21 values in population_df but not in sa2_stats: {len(missing_in_sa2_stats)}")

# Optionally, print the actual missing values if needed
print("SA2_CODE21 values in sa2_stats but not in population_df:", missing_in_population)
print("SA2_CODE21 values in population_df but not in sa2_stats:", missing_in_sa2_stats)


Number of unique SA2_CODE21 in sa2_stats: 384
Number of unique SA2_CODE21 in population_df: 606
Number of matching SA2_CODE21 values: 384
Number of SA2_CODE21 values in sa2_stats but not in population_df: 0
Number of SA2_CODE21 values in population_df but not in sa2_stats: 224
SA2_CODE21 values in sa2_stats but not in population_df: set()
SA2_CODE21 values in population_df but not in sa2_stats: {2.0, 204031491.0, 204031492.0, 21001.0, 21002.0, 21003.0, 21004.0, 21005.0, 203031049.0, 20501.0, 20502.0, 20503.0, 20504.0, 20505.0, 209031210.0, 204011054.0, 204011055.0, 204011056.0, 204011057.0, 204011058.0, 204011059.0, 204011060.0, 204011061.0, 204011062.0, 215011386.0, 215011388.0, 215011389.0, 215011390.0, 215011391.0, 215011392.0, 215011393.0, 215011394.0, 217041477.0, 217041478.0, 217041479.0, 217041480.0, 201021009.0, 201021010.0, 201021011.0, 201021012.0, 210011227.0, 204031068.0, 204031069.0, 204031070.0, 204031071.0, 204031072.0, 21601.0, 21602.0, 204031075.0, 21603.0, 204031073.0

In [9]:
test_data.count()

SA2_CODE21                 11520
bedrooms                   11520
bathrooms                  11520
year                       11520
distance_to_bus_stop       11520
distance_to_train_stop     11520
distance_to_tram_stop      11520
distance_to_hospital       11520
distance_to_mall           11520
distance_to_park           11520
distance_to_supermarket    11520
distance_to_CBD            11520
distance_to_school         11520
distance_to_uni            11520
population                 11520
date_numeric               11520
bed_bath_interaction       11520
carspaces                  11520
is_furnished               11520
crime                      11520
dtype: int64

In [10]:
test_data.to_csv('../data/curated/test_data.csv', index=False)

In [11]:
print(test_data.head(20))

    SA2_CODE21  bedrooms  bathrooms  year  distance_to_bus_stop  \
0    201011006         1          1  2025              0.741864   
1    201011006         1          1  2026              0.741864   
2    201011006         1          1  2027              0.741864   
3    201011006         1          1  2025              0.741864   
4    201011006         1          1  2026              0.741864   
5    201011006         1          1  2027              0.741864   
6    201011006         2          1  2025              0.741864   
7    201011006         2          1  2026              0.741864   
8    201011006         2          1  2027              0.741864   
9    201011006         2          1  2025              0.741864   
10   201011006         2          1  2026              0.741864   
11   201011006         2          1  2027              0.741864   
12   201011006         2          2  2025              0.741864   
13   201011006         2          2  2026              0.74186