In [72]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [73]:
# Read the three cleaned tables this notebook works from.
subjects_cleaned_df = pd.read_csv('datasets/data/subjects_cleaned.csv')
comps_cleaned_df = pd.read_csv('datasets/data/comps_cleaned.csv')
candidates_cleaned_df = pd.read_csv('datasets/data/candidates_cleaned.csv')

# Report the row count of each table as a quick sanity check.
print(
    "✅ Data loaded!",
    f"Subjects:   {len(subjects_cleaned_df)}",
    f"Comps:      {len(comps_cleaned_df)}",
    f"Candidates: {len(candidates_cleaned_df)}",
    sep="\n",
)

✅ Data loaded!
Subjects:   88
Comps:      264
Candidates: 9820


import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [74]:
def gla_diff(subject_gla, property_gla):
    """Absolute GLA gap between the subject and another property.

    Returns None when either value is missing according to ``pd.isna``.
    """
    either_missing = pd.isna(subject_gla) or pd.isna(property_gla)
    return None if either_missing else abs(subject_gla - property_gla)

In [75]:
def lot_size_diff(subject_lot, property_lot):
    """Absolute lot-size gap between the subject and another property.

    Returns None when either value is missing according to ``pd.isna``.
    """
    if pd.notna(subject_lot) and pd.notna(property_lot):
        return abs(subject_lot - property_lot)
    return None

In [76]:
def bathroom_diff(subject_baths, property_baths):
    """Absolute bathroom-count gap between the subject and another property.

    Returns None when either value is missing according to ``pd.isna``.
    """
    for bath_count in (subject_baths, property_baths):
        if pd.isna(bath_count):
            return None
    return abs(subject_baths - property_baths)

In [77]:
def bedroom_diff(subject_beds, property_beds):
    """Absolute bedroom-count gap between the subject and another property.

    Returns None when either value is missing according to ``pd.isna``.
    """
    if pd.isna(subject_beds):
        return None
    if pd.isna(property_beds):
        return None
    gap = subject_beds - property_beds
    return abs(gap)

In [78]:
def room_count_diff(subject_rooms, property_rooms):
    """Absolute room-count gap between the subject and another property.

    Returns None when either value is missing according to ``pd.isna``.
    """
    any_missing = pd.isna(subject_rooms) or pd.isna(property_rooms)
    if not any_missing:
        return abs(subject_rooms - property_rooms)
    return None

In [79]:
def sold_recently(sale_date, reference_date, days=90):
    """
    Check if sold within specified days of a reference date.

    Parameters:
    - sale_date: Date property was sold (string or datetime-like)
    - reference_date: Date to compare against (string or datetime-like)
    - days: Number of days threshold (default 90, can be 180, etc.)

    Returns:
    - 1 if (reference_date - sale_date) is at most `days` days; this includes
      sales dated after reference_date, whose difference is negative
    - 0 if the sale is more than `days` days before reference_date
    - None if either date is missing, or is a string that parses to NaT
    """
    # A missing date on either side makes the comparison meaningless. Without
    # the reference_date check a None reference raised TypeError and a NaN/NaT
    # reference silently produced 0.
    if pd.isna(sale_date) or pd.isna(reference_date):
        return None

    # Convert to datetime if string
    if isinstance(sale_date, str):
        sale_date = pd.to_datetime(sale_date)
    if isinstance(reference_date, str):
        reference_date = pd.to_datetime(reference_date)

    # A string can parse to NaT; treat that as missing too.
    if pd.isna(sale_date) or pd.isna(reference_date):
        return None

    # Whole days elapsed from the sale to the reference date
    days_diff = (reference_date - sale_date).days

    # Return 1 or 0
    return 1 if days_diff <= days else 0

In [80]:
def same_structure_type(subject_type, property_type):
    """
    Flag whether two structure types match.

    Both values are converted with str(), stripped of surrounding whitespace
    and lower-cased before being compared.

    Returns:
    - 1 if the normalised values are equal
    - 0 if they differ
    - None if either value is missing
    """
    if pd.isna(subject_type) or pd.isna(property_type):
        return None

    subject_key, property_key = (
        str(value).strip().lower() for value in (subject_type, property_type)
    )
    if subject_key == property_key:
        return 1
    return 0

In [81]:
def same_storey_type(subject_storey, property_storey):
    """
    Flag whether two storey descriptions match.

    The comparison is textual: both values go through str(), strip() and
    lower() first, so numeric inputs such as 2 and 2.0 are treated as
    different ('2' vs '2.0').

    Returns:
    - 1 if the normalised values are equal
    - 0 if they differ
    - None if either value is missing
    """
    if pd.isna(subject_storey) or pd.isna(property_storey):
        return None

    subject_key = str(subject_storey).strip().lower()
    property_key = str(property_storey).strip().lower()
    return int(subject_key == property_key)


In [82]:
def test_all_functions():
    """Print the result of each pairwise feature function on simple examples.

    This is a visual smoke test: nothing is asserted, the values are only
    printed so they can be compared with the labels by eye.
    """

    print("Testing difference functions:")
    print(f"GLA diff (2000, 1800) = {gla_diff(2000, 1800)}")
    print(f"GLA diff (2000, None) = {gla_diff(2000, None)}")

    # The label previously claimed (5000, 4500) while the call used 100000;
    # it now states the arguments that are actually passed.
    print(f"\nLot diff (5000, 100000) = {lot_size_diff(5000, 100000)}")
    print(f"Bathroom diff (2.5, 2) = {bathroom_diff(2.5, 2)}")
    print(f"Bedroom diff (3, 4) = {bedroom_diff(3, 4)}")
    print(f"Room diff (8, 7) = {room_count_diff(8, 7)}")

    print("\nTesting sold recently:")
    ref_date = datetime(2025, 5, 29)
    sale_date1 = datetime(2025, 4, 1)  # 58 days before ref_date
    sale_date2 = datetime(2025, 1, 1)  # 148 days before ref_date

    print(f"Sold 58 days ago (90 day check) = {sold_recently(sale_date1, ref_date, 90)}")
    print(f"Sold 148 days ago (90 day check) = {sold_recently(sale_date2, ref_date, 90)}")
    print(f"Sold 148 days ago (180 day check) = {sold_recently(sale_date2, ref_date, 180)}")

    print("\nTesting same type functions:")
    print(f"Same structure ('Detached', 'Detached') = {same_structure_type('Detached', 'Detached')}")
    print(f"Same structure ('Detached', 'Townhouse') = {same_structure_type('Detached', 'Townhouse')}")
    print(f"Same structure ('Detached', None) = {same_structure_type('Detached', None)}")

    print(f"\nSame storey ('2 Storey', '2 Storey') = {same_storey_type('2 Storey', '2 Storey')}")
    print(f"Same storey ('2 Storey', 'Bungalow') = {same_storey_type('2 Storey', 'Bungalow')}")

test_all_functions()

Testing difference functions:
GLA diff (2000, 1800) = 200
GLA diff (2000, None) = None

Lot diff (5000, 4500) = 95000
Bathroom diff (2.5, 2) = 0.5
Bedroom diff (3, 4) = 1
Room diff (8, 7) = 1

Testing sold recently:
Sold 58 days ago (90 day check) = 1
Sold 148 days ago (90 day check) = 0
Sold 148 days ago (180 day check) = 1

Testing same type functions:
Same structure ('Detached', 'Detached') = 1
Same structure ('Detached', 'Townhouse') = 0
Same structure ('Detached', None) = None

Same storey ('2 Storey', '2 Storey') = 1
Same storey ('2 Storey', 'Bungalow') = 0


In [83]:
def get_subject_val(orderID, col):
    """Return column `col` of the first subject row whose orderID matches.

    Reads the module-level `subjects_cleaned_df`. If no subject row has this
    orderID the selection is empty and indexing its first element raises
    IndexError.
    """
    is_match = subjects_cleaned_df['orderID'] == orderID
    matching_values = subjects_cleaned_df.loc[is_match, col].values
    return matching_values[0]

In [84]:
comps_pairs_df = comps_cleaned_df.copy()

# Each entry: (new feature column, pairwise function, column name that the
# subject table and the comps table share). Columns are added in this order.
for feature_name, pair_fn, shared_col in [
    ('gla_diff', gla_diff, 'gla_clean'),
    ('lot_size_diff', lot_size_diff, 'lot_size_clean'),
    ('bedroom_diff', bedroom_diff, 'bedrooms_clean'),
    ('bathroom_diff', bathroom_diff, 'bathrooms_clean'),
    ('room_count_diff', room_count_diff, 'room_count_clean'),
    ('same_property_type', same_structure_type, 'property_type_clean'),
    ('same_storey_type', same_storey_type, 'stories_clean'),
]:
    # apply() runs immediately, so the lambda sees this iteration's values.
    comps_pairs_df[feature_name] = comps_pairs_df.apply(
        lambda row: pair_fn(get_subject_val(row['orderID'], shared_col), row[shared_col]),
        axis=1
    )

# Recency flags: comp sale date measured against the subject's effective date.
for window_days in (90, 180):
    comps_pairs_df[f'sold_recently_{window_days}'] = comps_pairs_df.apply(
        lambda row: sold_recently(
            row['sale_date_clean'],
            get_subject_val(row['orderID'], 'effective_date_clean'),
            days=window_days,
        ),
        axis=1
    )


In [85]:
candidates_pairs_df = candidates_cleaned_df.copy()

# Each entry: (new feature column, pairwise function, column name that the
# subject table and the candidates table share). Columns are added in this order.
for feature_name, pair_fn, shared_col in [
    ('gla_diff', gla_diff, 'gla_clean'),
    ('lot_size_diff', lot_size_diff, 'lot_size_clean'),
    ('bedroom_diff', bedroom_diff, 'bedrooms_clean'),
    ('bathroom_diff', bathroom_diff, 'bathrooms_clean'),
    ('room_count_diff', room_count_diff, 'room_count_clean'),
    ('same_property_type', same_structure_type, 'property_type_clean'),
    ('same_storey_type', same_storey_type, 'stories_clean'),
]:
    # apply() runs immediately, so the lambda sees this iteration's values.
    candidates_pairs_df[feature_name] = candidates_pairs_df.apply(
        lambda row: pair_fn(get_subject_val(row['orderID'], shared_col), row[shared_col]),
        axis=1
    )

# Recency flags: candidate close date measured against the subject's effective date.
for window_days in (90, 180):
    candidates_pairs_df[f'sold_recently_{window_days}'] = candidates_pairs_df.apply(
        lambda row: sold_recently(
            row['close_date_clean'],
            get_subject_val(row['orderID'], 'effective_date_clean'),
            days=window_days,
        ),
        axis=1
    )


In [86]:
def save_df_to_csv(df, filename):
    """
    Save a DataFrame to a CSV file, creating the parent directory if needed.

    Parameters:
    - df: DataFrame to save
    - filename: Destination file path (e.g., 'my_data.csv')

    The file is written without the index column and a confirmation line is
    printed afterwards.
    """
    import os  # local import so this cell does not depend on the imports cell

    # to_csv fails if the target directory does not exist, so create it first.
    # Only path-like destinations are touched; anything else (for example an
    # open buffer) is passed straight through to to_csv as before.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(filename, index=False)
    print(f"✅ DataFrame saved to: {filename}")

# Persist both pairwise feature tables alongside the cleaned input files.
save_df_to_csv(df=comps_pairs_df, filename='datasets/data/comps_pairs_features.csv')
save_df_to_csv(df=candidates_pairs_df, filename='datasets/data/candidates_pairs_features.csv')


✅ DataFrame saved to: datasets/data/comps_pairs_features.csv
✅ DataFrame saved to: datasets/data/candidates_pairs_features.csv


In [91]:
def make_subjects_merge_df(subjects_cleaned_df):
    """
    Build the subject-side table used for merging onto pairwise data.

    Keeps 'orderID' plus the eight cleaned feature columns and renames each
    feature column with a 'subj_' prefix. The input frame is not modified;
    a renamed copy is returned.
    """
    # Source column -> merged name. 'orderID' is the join key and keeps its name.
    renamed_features = {
        'effective_date_clean': 'subj_effective_date',
        'lot_size_clean': 'subj_lot_size',
        'gla_clean': 'subj_gla',
        'room_count_clean': 'subj_room_count',
        'bedrooms_clean': 'subj_bedrooms',
        'bathrooms_clean': 'subj_bathrooms',
        'property_type_clean': 'subj_property_type',
        'stories_clean': 'subj_stories',
    }
    wanted_cols = ['orderID', *renamed_features]
    subject_subset = subjects_cleaned_df[wanted_cols].copy()
    return subject_subset.rename(columns=renamed_features)


In [100]:
def merge_subjects(subjects_for_merge, comps_pairs_df):
    """
    Attach subject columns to each row of a pairwise table via 'orderID'.

    This is a left join from `comps_pairs_df`, so every pairwise row is kept
    and rows without a matching subject get missing values in the subject
    columns.
    """
    return pd.merge(comps_pairs_df, subjects_for_merge, on='orderID', how='left')

# Subject-side lookup table (orderID plus subj_* columns) reused by the merges.
subjects_for_merge = make_subjects_merge_df(subjects_cleaned_df=subjects_cleaned_df)


In [110]:
def reorder_columns(df):
    """
    Reorder columns to: subject, comp, diff, for each feature group.

    Columns named in the preferred order come first (only those present in
    `df`), followed by every remaining column in its original order. The date
    group lists both 'sale_date_clean' and 'close_date_clean', so a frame that
    carries either one keeps its date next to the sold_recently flags instead
    of having 'close_date_clean' fall into the trailing columns.
    """
    # Define the columns for each group
    preferred_order = [
        'orderID',
        'subj_gla', 'gla_clean', 'gla_diff',
        'subj_lot_size', 'lot_size_clean', 'lot_size_diff',
        'subj_bedrooms', 'bedrooms_clean', 'bedroom_diff',
        'subj_bathrooms', 'bathrooms_clean', 'bathroom_diff',
        'subj_room_count', 'room_count_clean', 'room_count_diff',
        'subj_property_type', 'property_type_clean', 'same_property_type',
        'subj_stories', 'stories_clean', 'same_storey_type',
        'subj_effective_date', 'sale_date_clean', 'close_date_clean',
        'sold_recently_90', 'sold_recently_180',
    ]
    # Only keep columns that actually exist
    leading = [col for col in preferred_order if col in df.columns]
    # Add any others at the end, preserving their original relative order
    trailing = [col for col in df.columns if col not in leading]
    return df[leading + trailing]

In [121]:
# Attach subject columns to the comps pairs, then group related columns together.
final_comps_df = merge_subjects(subjects_for_merge, comps_pairs_df).pipe(reorder_columns)
#final_comps_df.head()


In [122]:
# Attach subject columns to the candidate pairs, then group related columns together.
final_candidates_df = merge_subjects(subjects_for_merge, candidates_pairs_df).pipe(reorder_columns)
#final_candidates_df.head()

In [None]:
# Columns kept in the model-ready comps table, one group per feature:
# subject value, comp value, pairwise feature. 'subj_effective_date' is not
# listed, so it is dropped here.
keep_cols = (
    ['orderID']
    + ['subj_gla', 'gla_clean', 'gla_diff']
    + ['subj_lot_size', 'lot_size_clean', 'lot_size_diff']
    + ['subj_bedrooms', 'bedrooms_clean', 'bedroom_diff']
    + ['subj_bathrooms', 'bathrooms_clean', 'bathroom_diff']
    + ['subj_room_count', 'room_count_clean', 'room_count_diff']
    + ['subj_property_type', 'property_type_clean', 'same_property_type']
    + ['subj_stories', 'stories_clean', 'same_storey_type']
    + ['sale_date_clean', 'sold_recently_90', 'sold_recently_180']
)
# Rebind final_comps_df to just these columns, in this order
final_comps_df = final_comps_df.loc[:, keep_cols]


In [116]:
# Columns kept in the model-ready candidates table, one group per feature:
# subject value, candidate value, pairwise feature. The date column here is
# 'close_date_clean', and 'subj_effective_date' is not listed, so it is dropped.
keep_cols_can = (
    ['orderID']
    + ['subj_gla', 'gla_clean', 'gla_diff']
    + ['subj_lot_size', 'lot_size_clean', 'lot_size_diff']
    + ['subj_bedrooms', 'bedrooms_clean', 'bedroom_diff']
    + ['subj_bathrooms', 'bathrooms_clean', 'bathroom_diff']
    + ['subj_room_count', 'room_count_clean', 'room_count_diff']
    + ['subj_property_type', 'property_type_clean', 'same_property_type']
    + ['subj_stories', 'stories_clean', 'same_storey_type']
    + ['close_date_clean', 'sold_recently_90', 'sold_recently_180']
)
# Rebind final_candidates_df to just these columns, in this order
final_candidates_df = final_candidates_df.loc[:, keep_cols_can]

In [None]:
# Write the model-ready tables.
# NOTE(review): every other path in this notebook is under 'datasets/data/';
# confirm that 'data/model_ready/' is the intended (and existing) directory.
save_df_to_csv(df=final_comps_df, filename='data/model_ready/comp_pair_model_ready.csv')
save_df_to_csv(df=final_candidates_df, filename='data/model_ready/candidates_pair_model_ready.csv')