In [1]:
import os
import re
import csv
import joblib
import datetime
from pathlib import Path
from dataclasses import dataclass

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from geopy.distance import geodesic

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import shuffle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.inspection import permutation_importance

from scipy.stats import randint, uniform

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks, utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
df = pd.read_csv("CommercialTransaction20250917124317.csv")
df.head()

Unnamed: 0,Project Name,Street Name,Property Type,Transacted Price ($),Area (SQFT),Unit Price ($ PSF),Sale Date,Type of Area,Area (SQM),Unit Price ($ PSM),Tenure,Postal District,Floor Level
0,N.A.,LORONG 25A GEYLANG,Shop House,1400000.0,1033.34,1355,Sept-25,Strata,96.0,14583,Freehold,14,-
1,KAMPONG GLAM CONSERVATION AREA,BUSSORAH STREET,Shop House,6350000.0,1558.63,4074,Sept-25,Land,144.8,43854,99 yrs lease commencing from 2003,7,-
2,N.A.,TOH AVENUE,Shop House,2000000.0,1703.94,1174,Aug-25,Land,158.3,12634,Freehold,17,-
3,N.A.,CEYLON ROAD,Shop House,3850000.0,1371.33,2807,Aug-25,Land,127.4,30220,Freehold,15,-
4,N.A.,CLOVER WAY,Shop House,4200000.0,1767.45,2376,Aug-25,Land,164.2,25579,Freehold,20,-


In [3]:
import pandas as pd
import numpy as np
import re

def clean_transaction_data(df):
    # Create a copy to avoid modifying the original
    df_clean = df.copy()
    
    # 1. Clean 'Transacted Price ($)' - remove commas and convert to float
    df_clean['Transacted Price ($)'] = (
        df_clean['Transacted Price ($)']
        .str.replace(',', '')
        .astype(float)
    )
    
    # 2. Clean 'Area (SQFT)' and 'Area (SQM)' - ensure they're numeric
    df_clean['Area (SQFT)'] = pd.to_numeric(df_clean['Area (SQFT)'], errors='coerce')
    df_clean['Area (SQM)'] = pd.to_numeric(df_clean['Area (SQM)'], errors='coerce')
    
    # 3. Clean 'Unit Price ($ PSF)' and 'Unit Price ($ PSM)' - remove commas, convert to float
    df_clean['Unit Price ($ PSF)'] = (
        df_clean['Unit Price ($ PSF)']
        .astype(str).str.replace(',', '')
        .astype(float)
    )
    
    df_clean['Unit Price ($ PSM)'] = (
        df_clean['Unit Price ($ PSM)']
        .astype(str).str.replace(',', '')
        .astype(float)
    )
    
    # 4. Convert 'Sale Date' to datetime (handling 'Sept-25' format)
    df_clean['Sale Date'] = pd.to_datetime(df_clean['Sale Date'], format='%b-%y', errors='coerce')
    

    
    return df_clean

# Apply the cleaning function
df = clean_transaction_data(df)



# Removing null values

In [5]:
df['Project Name']=df['Project Name'].replace('N.A.', np.nan)

In [6]:
##remove rows with more than 3 missing values 

missing_counts = df.isnull().sum(axis=1)
print("Missing counts per row:\n", missing_counts)

missing_counts = df.isna().sum(axis=1)  # Count missing values per row
df.drop(df[missing_counts >= 3].index, inplace=True)

Missing counts per row:
 0       3
1       2
2       2
3       2
4       2
       ..
2949    0
2950    0
2951    0
2952    1
2953    0
Length: 2954, dtype: int64


## Target column by column:

In [8]:
##project name

df['Project Name'].unique()
most_frequent_value =df['Project Name'].mode()[0]
# Fill NaN values in the 'category' column with the most frequent value
df['Project Name'].fillna(most_frequent_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Project Name'].fillna(most_frequent_value, inplace=True)


In [9]:
##area SQFT

df['Area (SQFT)'].unique()
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
df[['Area (SQFT)']] = imputer.fit_transform(df[['Area (SQFT)']])


In [10]:
##area SQM

df['Area (SQM)'].unique()
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
df[['Area (SQM)']] = imputer.fit_transform(df[['Area (SQM)']])

In [11]:
## sale date

df['Sale Date'].unique()

df['sale_date_missing'] = df['Sale Date'].isna().astype(int)

df['sale_year'] = df['Sale Date'].dt.year
df['sale_month'] = df['Sale Date'].dt.month
df['sale_quarter'] = df['Sale Date'].dt.quarter
df['sale_dayofweek'] = df['Sale Date'].dt.dayofweek   # 0=Mon, 6=Sun

# continuous trend feature
df['days_since_first_sale'] = (df['Sale Date'] - df['Sale Date'].min()).dt.days

df.drop(columns=['Sale Date'], inplace=True)

In [12]:
## Tenure
df = df.dropna(subset=['Tenure'])

## Floor level according to property type: shop hse

In [14]:
df['Floor Level'].value_counts().get('-',0)

591

In [15]:
df['Floor Level'] =df['Floor Level'].replace('-', np.nan)

In [16]:
null_col1_rows = df[df['Floor Level'].isnull() & (df['Property Type']=='Shop House')]
print("Rows where 'col1' is null:")
null_col1_rows

Rows where 'col1' is null:


Unnamed: 0,Project Name,Street Name,Property Type,Transacted Price ($),Area (SQFT),Unit Price ($ PSF),Type of Area,Area (SQM),Unit Price ($ PSM),Tenure,Postal District,Floor Level,sale_date_missing,sale_year,sale_month,sale_quarter,sale_dayofweek,days_since_first_sale
1,KAMPONG GLAM CONSERVATION AREA,BUSSORAH STREET,Shop House,6350000.0,496.618271,4074.0,Land,144.8,43854.0,99 yrs lease commencing from 2003,7,,1,,,,,
2,LITTLE INDIA CONSERVATION AREA,TOH AVENUE,Shop House,2000000.0,496.618271,1174.0,Land,158.3,12634.0,Freehold,17,,0,2025.0,8.0,3.0,4.0,1400.0
3,LITTLE INDIA CONSERVATION AREA,CEYLON ROAD,Shop House,3850000.0,496.618271,2807.0,Land,127.4,30220.0,Freehold,15,,0,2025.0,8.0,3.0,4.0,1400.0
4,LITTLE INDIA CONSERVATION AREA,CLOVER WAY,Shop House,4200000.0,496.618271,2376.0,Land,164.2,25579.0,Freehold,20,,0,2025.0,8.0,3.0,4.0,1400.0
5,LITTLE INDIA CONSERVATION AREA,ROWELL ROAD,Shop House,5270000.0,496.618271,5037.0,Land,97.2,54218.0,Freehold,8,,0,2025.0,8.0,3.0,4.0,1400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2518,BOAT QUAY CONSERVATION AREA,CIRCULAR ROAD,Shop House,9500000.0,496.618271,9184.0,Land,96.1,98855.0,999 yrs lease commencing from 1826,1,,0,2021.0,10.0,4.0,4.0,0.0
2519,KRETA AYER CONSERVATION AREA,PAGODA STREET,Shop House,13300000.0,496.618271,10782.0,Land,114.6,116056.0,Freehold,1,,0,2021.0,10.0,4.0,4.0,0.0
2520,TELOK AYER CONSERVATION AREA,TELOK AYER STREET,Shop House,16800000.0,496.618271,11570.0,Land,134.9,124537.0,Freehold,1,,1,,,,,
2521,TELOK AYER CONSERVATION AREA,TELOK AYER STREET,Shop House,8550000.0,769.630000,11109.0,Land,71.5,119580.0,999 yrs lease commencing from 1827,1,,1,,,,,


In [17]:
##shop hse
df.loc[
    df['Floor Level'].isnull() & (df['Property Type'] == 'Shop House'),
    'Floor Level'
] = 0

In [18]:
ull_col1_rows = df[df['Floor Level'].notna() & (df['Property Type']=='Shop House')]
print("Rows where 'col1' is null:")
ull_col1_rows

Rows where 'col1' is null:


Unnamed: 0,Project Name,Street Name,Property Type,Transacted Price ($),Area (SQFT),Unit Price ($ PSF),Type of Area,Area (SQM),Unit Price ($ PSM),Tenure,Postal District,Floor Level,sale_date_missing,sale_year,sale_month,sale_quarter,sale_dayofweek,days_since_first_sale
1,KAMPONG GLAM CONSERVATION AREA,BUSSORAH STREET,Shop House,6350000.0,496.618271,4074.0,Land,144.8,43854.0,99 yrs lease commencing from 2003,7,0,1,,,,,
2,LITTLE INDIA CONSERVATION AREA,TOH AVENUE,Shop House,2000000.0,496.618271,1174.0,Land,158.3,12634.0,Freehold,17,0,0,2025.0,8.0,3.0,4.0,1400.0
3,LITTLE INDIA CONSERVATION AREA,CEYLON ROAD,Shop House,3850000.0,496.618271,2807.0,Land,127.4,30220.0,Freehold,15,0,0,2025.0,8.0,3.0,4.0,1400.0
4,LITTLE INDIA CONSERVATION AREA,CLOVER WAY,Shop House,4200000.0,496.618271,2376.0,Land,164.2,25579.0,Freehold,20,0,0,2025.0,8.0,3.0,4.0,1400.0
5,LITTLE INDIA CONSERVATION AREA,ROWELL ROAD,Shop House,5270000.0,496.618271,5037.0,Land,97.2,54218.0,Freehold,8,0,0,2025.0,8.0,3.0,4.0,1400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2518,BOAT QUAY CONSERVATION AREA,CIRCULAR ROAD,Shop House,9500000.0,496.618271,9184.0,Land,96.1,98855.0,999 yrs lease commencing from 1826,1,0,0,2021.0,10.0,4.0,4.0,0.0
2519,KRETA AYER CONSERVATION AREA,PAGODA STREET,Shop House,13300000.0,496.618271,10782.0,Land,114.6,116056.0,Freehold,1,0,0,2021.0,10.0,4.0,4.0,0.0
2520,TELOK AYER CONSERVATION AREA,TELOK AYER STREET,Shop House,16800000.0,496.618271,11570.0,Land,134.9,124537.0,Freehold,1,0,1,,,,,
2521,TELOK AYER CONSERVATION AREA,TELOK AYER STREET,Shop House,8550000.0,769.630000,11109.0,Land,71.5,119580.0,999 yrs lease commencing from 1827,1,0,1,,,,,


## fill in null values for floor level using imputation based off other features

In [20]:
def smart_floor_imputation(row, df):
    if pd.notna(row['Floor Level']):
        return row['Floor Level']
    
    # Try different levels of similarity
    similar_criteria = [
        # Level 1: Same Property Type + Postal District + Area Bin
        {'prop_type': row['Property Type'], 'postal_district': row['Postal District'], 'area_bin': row.get('Area_Bin', 'Unknown')},
        # Level 2: Same Property Type + Postal District
        {'prop_type': row['Property Type'], 'postal_district': row['Postal District']},
        # Level 3: Same Property Type only
        {'prop_type': row['Property Type']},
        # Level 4: Global most common
        {'global': True}
    ]
    
    for criteria in similar_criteria:
        if 'global' in criteria:
            # Global most common floor level
            most_common = df[df['Floor Level'].notna()]['Floor Level'].mode()
            if not most_common.empty:
                return most_common.iloc[0]
        
        elif 'area_bin' in criteria:
            # Check if Area_Bin column exists, if not skip this level
            if 'Area_Bin' not in df.columns:
                continue
            mask = (
                (df['Property Type'] == criteria['prop_type']) &
                (df['Postal District'] == criteria['postal_district']) &
                (df['Area_Bin'] == criteria['area_bin']) &
                (df['Floor Level'].notna())
            )
        elif 'postal_district' in criteria:
            mask = (
                (df['Property Type'] == criteria['prop_type']) &
                (df['Postal District'] == criteria['postal_district']) &
                (df['Floor Level'].notna())
            )
        else:
            # Only property type
            mask = (
                (df['Property Type'] == criteria['prop_type']) &
                (df['Floor Level'].notna())
            )
        
        similar_floors = df[mask]['Floor Level']
        if len(similar_floors) > 0:
            return similar_floors.mode().iloc[0]
    
    # Final fallback
    return '01 to 05'  # Most common default

# Apply smart imputation
df['Floor Level'] = df.apply(
    lambda row: smart_floor_imputation(row, df), axis=1
)

### Binning for floor levels

In [22]:
# Create both numerical and categorical features
def extract_floor_features(floor_str):
    """
    Extract multiple numerical features from floor ranges
    """
    if pd.isna(floor_str) or floor_str == '-' or floor_str == '0':
        return np.nan, np.nan, np.nan, 0, 0  # Added is_ground flag
    
    floor_str = str(floor_str).strip()
    
    # Handle ground floor (0)
    if floor_str == '0':
        return 0, 0, 0, 0, 1  # low, high, midpoint, is_basement, is_ground
    
    numbers = re.findall(r'\d+', floor_str)
    
    if not numbers:
        return np.nan, np.nan, np.nan, 0, 0
    
    is_basement = 1 if floor_str.startswith('B') else 0
    is_ground = 0  # Default not ground floor
    
    if len(numbers) >= 2:
        low = int(numbers[0])
        high = int(numbers[1])
        return low, high, (low + high) / 2, is_basement, is_ground
    else:
        num = int(numbers[0])
        return num, num, num, is_basement, is_ground

# Apply numerical feature extraction
floor_features = df['Floor Level'].apply(extract_floor_features)
df['Floor_Low'] = [x[0] for x in floor_features]
df['Floor_High'] = [x[1] for x in floor_features]
df['Floor_Midpoint'] = [x[2] for x in floor_features]
df['Is_Basement'] = [x[3] for x in floor_features]
df['Is_Ground'] = [x[4] for x in floor_features]  # New column

# Create ML-ready categories (updated to handle ground floor)
def create_ml_categories(floor_str):
    if pd.isna(floor_str) or floor_str == '-':
        return 'unknown'
    
    floor_str = str(floor_str).strip()
    
    # Handle ground floor
    if floor_str == '0':
        return 'shop_house'
    
    # Handle basement
    if floor_str.startswith('B'):
        return 'basement'
    
    numbers = re.findall(r'\d+', floor_str)
    if len(numbers) >= 2:
        low = int(numbers[0])
        if low <= 5: return 'floors_01_05'
        elif low <= 10: return 'floors_06_10'
        elif low <= 15: return 'floors_11_15'
        elif low <= 20: return 'floors_16_20'
        elif low <= 25: return 'floors_21_25'
        elif low <= 30: return 'floors_26_30'
        elif low <= 35: return 'floors_31_35'
        elif low <= 40: return 'floors_36_40'
        else: return 'floors_41_45'
    
    # Handle single numbers (if any)
    if len(numbers) == 1:
        num = int(numbers[0])
        if num == 0:
            return 'ground_floor'
        elif num <= 5: return 'floors_01_05'
        elif num <= 10: return 'floors_06_10'
        elif num <= 15: return 'floors_11_15'
        elif num <= 20: return 'floors_16_20'
        elif num <= 25: return 'floors_21_25'
        elif num <= 30: return 'floors_26_30'
        elif num <= 35: return 'floors_31_35'
        elif num <= 40: return 'floors_36_40'
        else: return 'floors_41_45'
    
    return 'unknown'

df['Floor_Category_ML'] = df['Floor Level'].apply(create_ml_categories)

print("Final Feature Set for Modeling:")
print(df[['Floor Level', 'Floor_Low', 'Floor_High', 'Floor_Midpoint', 'Is_Basement', 'Is_Ground', 'Floor_Category_ML']].head(10))

# Check the distribution of the new categories
print("\nFloor Category Distribution:")
print(df['Floor_Category_ML'].value_counts().sort_index())

Final Feature Set for Modeling:
   Floor Level  Floor_Low  Floor_High  Floor_Midpoint  Is_Basement  Is_Ground  \
1            0          0           0             0.0            0          1   
2            0          0           0             0.0            0          1   
3            0          0           0             0.0            0          1   
4            0          0           0             0.0            0          1   
5            0          0           0             0.0            0          1   
6            0          0           0             0.0            0          1   
7            0          0           0             0.0            0          1   
8            0          0           0             0.0            0          1   
9            0          0           0             0.0            0          1   
10           0          0           0             0.0            0          1   

   Floor_Category_ML  
1         shop_house  
2         shop_house  
3      

In [23]:
del df['Floor Level']

## load geo features

In [25]:
# Load your main transaction dataset
transactions_df = df
# Load geographic datasets
postal_codes = pd.read_csv('SG_postal.csv')
city_coordinates = pd.read_csv('singapore_city_coordinates_improved.csv')
street_coordinates = pd.read_csv('street_coordinates.csv')
train_stations = pd.read_csv('mrt_lrt_data.csv')
postal_district_mapping = pd.read_csv('sg_postal_districts.csv')

print("Dataset Overview:")
print(f"Main transactions: {transactions_df.shape}")
print(f"Postal codes: {postal_codes.shape}")
print(f"City coordinates: {city_coordinates.shape}")
print(f"Street coordinates: {street_coordinates.shape}")
print(f"Train stations: {train_stations.shape}")
print(f"Postal district mapping: {postal_district_mapping.shape}")

Dataset Overview:
Main transactions: (2924, 23)
Postal codes: (121154, 4)
City coordinates: (332, 9)
Street coordinates: (589, 3)
Train stations: (209, 4)
Postal district mapping: (28, 3)


In [26]:
missing_counts = city_coordinates.isnull().sum(axis=1)
print("Missing counts per row:\n", missing_counts)

missing_counts = city_coordinates.isna().sum(axis=1)  # Count missing values per row
city_coordinates.drop(city_coordinates[missing_counts >= 4].index, inplace=True)

Missing counts per row:
 0      0
1      0
2      0
3      0
4      0
      ..
327    0
328    0
329    0
330    6
331    0
Length: 332, dtype: int64


In [27]:
def add_geographic_features(main_df, postal_df, street_df, city_df, stations_df, district_df):
    """Enrich industrial data with geographic and proximity features - FIXED VERSION"""
    import pandas as pd
    import numpy as np
    from scipy.spatial.distance import cdist
    from rapidfuzz import process

    df_enriched = main_df.copy()
    original_count = len(df_enriched)
    
    print(f"Starting geographic feature engineering for {original_count} properties...")

    # --- 1. MULTI-STRATEGY COORDINATE MATCHING ---
    
    # Strategy 1A: Direct street name matching (your current approach)
    if 'Street Name' in df_enriched.columns:
        street_df['street_name_clean'] = street_df['street_name'].str.upper().str.strip()
        df_enriched['Street_Name_Clean'] = df_enriched['Street Name'].str.upper().str.strip()

        street_choices = street_df['street_name_clean'].unique().tolist()

        def match_street(name):
            if pd.isna(name): return None
            match = process.extractOne(name, street_choices, score_cutoff=80)  # Lowered threshold
            return match[0] if match else None

        df_enriched['Matched_Street'] = df_enriched['Street_Name_Clean'].apply(match_street)

        df_enriched = df_enriched.merge(
            street_df[['street_name_clean', 'latitude', 'longitude']],
            left_on='Matched_Street', right_on='street_name_clean', how='left'
        )
        street_matches = df_enriched['latitude'].notna().sum()
        print(f" Strategy 1A: Street name matching → {street_matches} properties")

    # Strategy 1B: Postal code matching for missing coordinates
    if 'Postal District' in df_enriched.columns and 'postal_code' in postal_df.columns:
        # Convert postal codes to district (first 2 digits for Singapore)
        postal_df['Postal_District'] = postal_df['postal_code'].astype(str).str[:2]
        
        # Get centroid coordinates for each postal district
        district_coords = postal_df.groupby('Postal_District').agg({
            'lat': 'mean',
            'lon': 'mean'
        }).reset_index()
        
        # Merge district centroids for properties missing coordinates
        missing_coords_mask = df_enriched['latitude'].isna()
        if missing_coords_mask.any():
            df_enriched.loc[missing_coords_mask, 'Postal_District_Str'] = df_enriched.loc[missing_coords_mask, 'Postal District'].apply(
                lambda x: str(int(x)) if pd.notna(x) else np.nan
            )
            
            temp_merge = df_enriched[missing_coords_mask].merge(
                district_coords.rename(columns={'lat': 'lat_district', 'lon': 'lon_district'}),
                left_on='Postal_District_Str', right_on='Postal_District', how='left'
            )
            
            # Fill missing coordinates with district centroids
            district_fill_mask = df_enriched['latitude'].isna() & df_enriched['Postal_District_Str'].notna()
            df_enriched.loc[district_fill_mask, 'latitude'] = temp_merge.set_index(df_enriched[district_fill_mask].index)['lat_district']
            df_enriched.loc[district_fill_mask, 'longitude'] = temp_merge.set_index(df_enriched[district_fill_mask].index)['lon_district']
            
            district_matches = district_fill_mask.sum()
            print(f"Strategy 1B: Postal district centroids → {district_matches} properties")

    # Strategy 1C: Planning Area matching from city coordinates
    if 'Planning Area' in df_enriched.columns and 'Place' in city_df.columns:
        missing_coords_mask = df_enriched['latitude'].isna()
        if missing_coords_mask.any():
            city_df['Place_Clean'] = city_df['Place'].str.upper().str.strip()
            df_enriched['Planning_Area_Clean'] = df_enriched['Planning Area'].str.upper().str.strip()
            
            temp_merge = df_enriched[missing_coords_mask].merge(
                city_df[['Place_Clean', 'latitude', 'longitude']].rename(
                    columns={'latitude': 'lat_city', 'longitude': 'lon_city'}
                ),
                left_on='Planning_Area_Clean', right_on='Place_Clean', how='left'
            )
            
            # Fill missing coordinates with city coordinates
            city_fill_mask = df_enriched['latitude'].isna() & df_enriched['Planning_Area_Clean'].notna()
            df_enriched.loc[city_fill_mask, 'latitude'] = temp_merge.set_index(df_enriched[city_fill_mask].index)['lat_city']
            df_enriched.loc[city_fill_mask, 'longitude'] = temp_merge.set_index(df_enriched[city_fill_mask].index)['lon_city']
            
            city_matches = city_fill_mask.sum()
            print(f" Strategy 1C: Planning Area matching → {city_matches} properties")

    # Strategy 1D: Region-based fallback coordinates
    if 'Region' in df_enriched.columns:
        missing_coords_mask = df_enriched['latitude'].isna()
        if missing_coords_mask.any():
            # Define approximate coordinates for major regions
            region_coords = {
                'CENTRAL REGION': (1.2923, 103.8536),  # Singapore central
                'EAST REGION': (1.3443, 103.9645),     # East area
                'WEST REGION': (1.3526, 103.7584),     # West area
                'NORTH REGION': (1.4180, 103.8200),    # North area
                'NORTH-EAST REGION': (1.3691, 103.8975) # North-East
            }
            
            def get_region_coords(region):
                if pd.isna(region): return (np.nan, np.nan)
                region_upper = str(region).upper()
                for key, coords in region_coords.items():
                    if key in region_upper:
                        return coords
                return (np.nan, np.nan)
            
            region_coords_df = df_enriched[missing_coords_mask]['Region'].apply(get_region_coords)
            region_fill_mask = df_enriched['latitude'].isna() & df_enriched['Region'].notna()
            
            df_enriched.loc[region_fill_mask, 'latitude'] = region_coords_df.apply(lambda x: x[0])
            df_enriched.loc[region_fill_mask, 'longitude'] = region_coords_df.apply(lambda x: x[1])
            
            region_matches = region_fill_mask.sum()
            print(f" Strategy 1D: Region-based fallback → {region_matches} properties")

    # Final coordinate coverage report
    final_coverage = df_enriched['latitude'].notna().sum()
    print(f"FINAL COORDINATE COVERAGE: {final_coverage}/{original_count} properties ({final_coverage/original_count*100:.1f}%)")

    # --- 2. ENHANCED POSTAL DISTRICT FEATURES ---
    
    if 'Postal District' in df_enriched.columns:
        # Create proper postal district string (handle NaNs)
        df_enriched['Postal_District_Str'] = df_enriched['Postal District'].apply(
            lambda x: str(int(x)) if pd.notna(x) else 'Unknown'
        )
        
        district_df['Postal District'] = district_df['Postal District'].astype(str)
        
        df_enriched = df_enriched.merge(
            district_df[['Postal District', 'General Location']].rename(columns={'General Location': 'General_Location'}),
            left_on='Postal_District_Str', right_on='Postal District', how='left'
        )
        print(f" Added general location for {df_enriched['General_Location'].notna().sum()} properties")

    # --- 3. FIXED MRT DISTANCE CALCULATION ---
    
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Calculate Haversine distance between two points in kilometers"""
        R = 6371  # Earth radius in kilometers
        
        lat1_rad = np.radians(lat1)
        lon1_rad = np.radians(lon1)
        lat2_rad = np.radians(lat2)
        lon2_rad = np.radians(lon2)
        
        dlat = lat2_rad - lat1_rad
        dlon = lon2_rad - lon1_rad
        
        a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        
        return R * c

    if all(col in df_enriched.columns for col in ['latitude', 'longitude']):
        valid_coords = df_enriched[['latitude', 'longitude']].dropna()
        station_coords = stations_df[['Latitude', 'Longitude']].dropna()
        
        if len(valid_coords) > 0 and len(station_coords) > 0:
            print(f"Calculating MRT distances for {len(valid_coords)} properties...")
            
            # Vectorized distance calculation
            min_dists = []
            for idx, prop_row in valid_coords.iterrows():
                prop_lat, prop_lon = prop_row['latitude'], prop_row['longitude']
                
                # Calculate distances to all stations
                distances = []
                for _, station_row in station_coords.iterrows():
                    dist = haversine_distance(
                        prop_lat, prop_lon,
                        station_row['Latitude'], station_row['Longitude']
                    )
                    distances.append(dist)
                
                min_dists.append(min(distances) if distances else np.nan)
            
            # Assign distances back to dataframe
            df_enriched.loc[valid_coords.index, 'Distance_to_MRT_km'] = min_dists
            print(f" Added MRT distances for {len([x for x in min_dists if not np.isnan(x)])} properties")
            
            # For properties without coordinates, use district average
            missing_mrt_mask = df_enriched['Distance_to_MRT_km'].isna() & df_enriched['Postal_District_Str'].notna()
            if missing_mrt_mask.any():
                district_mrt_avg = df_enriched.groupby('Postal_District_Str')['Distance_to_MRT_km'].mean()
                df_enriched.loc[missing_mrt_mask, 'Distance_to_MRT_km'] = df_enriched.loc[missing_mrt_mask, 'Postal_District_Str'].map(district_mrt_avg)
                print(f" Added district-average MRT distances for {missing_mrt_mask.sum()} properties")

    # --- 4. ENHANCED REGION CLASSIFICATION ---
    
    def classify_region(d):
        if pd.isna(d) or d == 'Unknown': return 'Unknown'
        try:
            d_int = int(d)
            if d_int <= 9: return 'Central Core'
            elif d_int <= 16: return 'Rest Central'
            elif d_int <= 21: return 'City Fringe'
            elif d_int <= 28: return 'Outside Central'
            else: return 'Unknown'
        except:
            return 'Unknown'

    df_enriched['Region_Classification'] = df_enriched['Postal_District_Str'].apply(classify_region)
    region_coverage = (df_enriched['Region_Classification'] != 'Unknown').sum()
    print(f" Added region classification for {region_coverage} properties")

    # --- 5. ADDITIONAL GEOGRAPHIC FEATURES ---
    
    # CBD proximity (distance to Raffles Place)
    cbd_coords = (1.2833, 103.8515)  # Raffles Place
    if all(col in df_enriched.columns for col in ['latitude', 'longitude']):
        valid_coords = df_enriched[['latitude', 'longitude']].dropna()
        if len(valid_coords) > 0:
            print("Calculating CBD distances...")
            cbd_distances = []
            for idx, row in valid_coords.iterrows():
                dist = haversine_distance(
                    row['latitude'], row['longitude'],
                    cbd_coords[0], cbd_coords[1]
                )
                cbd_distances.append(dist)
            
            df_enriched.loc[valid_coords.index, 'Distance_to_CBD_km'] = cbd_distances
            print(f" Added CBD distances for {len(cbd_distances)} properties")

    # Urban vs Suburban classification
    def classify_urban_rural(distance_to_cbd):
        if pd.isna(distance_to_cbd): return 'Unknown'
        if distance_to_cbd <= 5: return 'CBD'
        elif distance_to_cbd <= 10: return 'Urban'
        elif distance_to_cbd <= 20: return 'Suburban'
        else: return 'Rural'
    
    if 'Distance_to_CBD_km' in df_enriched.columns:
        df_enriched['Urban_Classification'] = df_enriched['Distance_to_CBD_km'].apply(classify_urban_rural)
        print(f" Added urban classification")

    # --- FINAL REPORT ---
    print("\n" + "="*60)
    print(" GEOGRAPHIC FEATURE ENGINEERING COMPLETE")
    print("="*60)
    print(f"FINAL COVERAGE REPORT:")
    print(f"   • Coordinates: {df_enriched['latitude'].notna().sum()}/{original_count}")
    print(f"   • MRT Distances: {df_enriched['Distance_to_MRT_km'].notna().sum()}/{original_count}")
    print(f"   • Region Classification: {(df_enriched['Region_Classification'] != 'Unknown').sum()}/{original_count}")
    print(f"   • General Location: {df_enriched['General_Location'].notna().sum()}/{original_count}")
    
    if 'Distance_to_CBD_km' in df_enriched.columns:
        print(f"   • CBD Distances: {df_enriched['Distance_to_CBD_km'].notna().sum()}/{original_count}")
    
    # Clean up temporary columns
    cols_to_drop = [col for col in df_enriched.columns if col in ['Matched_Street', 'street_name_clean', 'Postal_District_Str', 'Postal District_y']]
    df_enriched = df_enriched.drop(columns=cols_to_drop, errors='ignore')
    
    return df_enriched

# Call the improved function
print("\nAdding geographic features with enhanced coverage...")
commercial_enriched = add_geographic_features(
    transactions_df, 
    postal_codes, 
    street_coordinates, 
    city_coordinates, 
    train_stations, 
    postal_district_mapping
)


Adding geographic features with enhanced coverage...
Starting geographic feature engineering for 2924 properties...
 Strategy 1A: Street name matching → 1196 properties
Strategy 1B: Postal district centroids → 1728 properties
FINAL COORDINATE COVERAGE: 1655/2924 properties (56.6%)
 Added general location for 2924 properties
Calculating MRT distances for 1655 properties...
 Added MRT distances for 1655 properties
 Added district-average MRT distances for 1269 properties
 Added region classification for 2924 properties
Calculating CBD distances...
 Added CBD distances for 1655 properties
 Added urban classification

 GEOGRAPHIC FEATURE ENGINEERING COMPLETE
FINAL COVERAGE REPORT:
   • Coordinates: 1655/2924
   • MRT Distances: 2924/2924
   • Region Classification: 2924/2924
   • General Location: 2924/2924
   • CBD Distances: 1655/2924


In [28]:
commercial_enriched.isna().sum()

Project Name                0
Street Name                 0
Property Type               0
Transacted Price ($)        0
Area (SQFT)                 0
Unit Price ($ PSF)          0
Type of Area                0
Area (SQM)                  0
Unit Price ($ PSM)          0
Tenure                      0
Postal District_x           0
sale_date_missing           0
sale_year                 210
sale_month                210
sale_quarter              210
sale_dayofweek            210
days_since_first_sale     210
Floor_Low                   0
Floor_High                  0
Floor_Midpoint              0
Is_Basement                 0
Is_Ground                   0
Floor_Category_ML           0
Street_Name_Clean           0
latitude                 1269
longitude                1269
General_Location            0
Distance_to_MRT_km          0
Region_Classification       0
Distance_to_CBD_km       1269
Urban_Classification        0
dtype: int64

In [29]:
target_column = 'Unit Price ($ PSF)'

exclude_columns = ['Transacted Price ($)', 'Unit Price ($ PSM)',
    'Unit Price ($ PSF)',
    'monthly_rental_price_yield',
    'rental_rate_psm_yield',
    'monthly_rental_price_tenure',
    'rental_rate_psm_tenure',
    'monthly_rental_price_market',
    'rental_rate_psm_avg',
    'monthly_rental_price_avg',
    'market_rent_rate_psm',
    'annual_rental_income_yield','implied_yield_market']

print(" CREATING PROPERTY-TYPE SPECIFIC MODELS")
print("="*50)

property_type_models = {}

for prop_type in commercial_enriched['Property Type'].unique():
    print(f"\n PROCESSING {prop_type.upper()}...")
    
    # Filter data for this property type
    type_mask = commercial_enriched['Property Type'] == prop_type
    type_data = commercial_enriched[type_mask]
    
    # Skip if not enough samples
    if len(type_data) < 30:
        print(f"   ⚠ Skipped - only {len(type_data)} samples (need at least 30)")
        continue
    
    print(f"   Samples: {len(type_data)}")
    
    # Use YOUR EXACT feature selection logic
    feature_columns = [col for col in type_data.columns 
                      if col not in exclude_columns and col != target_column]
    
    # Use YOUR EXACT preprocessing
    categorical_columns = type_data[feature_columns].select_dtypes(include=['object', 'category']).columns
    numerical_columns = type_data[feature_columns].select_dtypes(include=[np.number]).columns
    
    print(f"   Features: {len(feature_columns)} total, {len(categorical_columns)} categorical, {len(numerical_columns)} numerical")
    
    # One-hot encode using YOUR method
    X_encoded = pd.get_dummies(type_data[feature_columns], columns=categorical_columns, drop_first=True)
    y = type_data[target_column]
    
    # Split the data
    X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42, shuffle=True
    )

 CREATING PROPERTY-TYPE SPECIFIC MODELS

 PROCESSING SHOP HOUSE...
   Samples: 526
   Features: 28 total, 10 categorical, 18 numerical

 PROCESSING RETAIL...
   Samples: 1050
   Features: 28 total, 10 categorical, 18 numerical

 PROCESSING OFFICE...
   Samples: 1348
   Features: 28 total, 10 categorical, 18 numerical


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, shuffle=True
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

# Create a simple baseline (predict mean)
baseline_pred = np.full_like(y_test, y_train.mean())
baseline_mae = mean_absolute_error(y_test, baseline_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))

print(f"\nBaseline Model (Mean Prediction):")
print(f"MAE: ${baseline_mae:,.2f}")
print(f"RMSE: ${baseline_rmse:,.2f}")

Training set: (1078, 447)
Testing set: (270, 447)

Baseline Model (Mean Prediction):
MAE: $884.26
RMSE: $1,495.46


In [31]:
# Check for missing values
print("Missing values in features:")
print(X_encoded.isnull().sum().sum())
print("Missing values in target:")
print(y.isnull().sum())

# Handle missing values - FIXED VERSION
def handle_missing_values(X, strategy='mean'):
    """Handle missing values in the feature matrix"""
    imputer = SimpleImputer(strategy=strategy)
    X_imputed = imputer.fit_transform(X)
    return pd.DataFrame(X_imputed, columns=X.columns, index=X.index), imputer

# Apply to training and test data
X_train_imputed, imputer = handle_missing_values(X_train)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

print(f"Missing values after imputation - Train: {X_train_imputed.isnull().sum().sum()}")
print(f"Missing values after imputation - Test: {X_test_imputed.isnull().sum().sum()}")
print(f"Training set: {X_train_imputed.shape}")
print(f"Test set: {X_test_imputed.shape}")

Missing values in features:
2393
Missing values in target:
0
Missing values after imputation - Train: 0
Missing values after imputation - Test: 0
Training set: (1078, 447)
Test set: (270, 447)


In [32]:
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, KFold



# Updated models with proper data handling
models = {

    'XGBoost': XGBRegressor(
        n_estimators=100, 
        random_state=42,
        enable_categorical=False  # Ensure this is False for one-hot encoded data
    )
}

# Train and evaluate models with proper data splits
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    try:
            
        if name == 'Hist Gradient Boosting':
            # Use imputed data for HistGradientBoosting
            model.fit(X_train_imputed, y_train)
            y_pred = model.predict(X_test_imputed)
            y_test_compare = y_test
            
        else:
            # Use imputed data for other models
            model.fit(X_train_imputed, y_train)
            y_pred = model.predict(X_test_imputed)
            y_test_compare = y_test
        
        # Calculate metrics
        mae = mean_absolute_error(y_test_compare, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test_compare, y_pred))
        r2 = r2_score(y_test_compare, y_pred)
        
        # Store results
        results[name] = {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': y_pred,
            'y_test': y_test_compare
        }
        
        print(f"{name} Results:")
        print(f"  MAE: ${mae:,.2f}")
        print(f"  RMSE: ${rmse:,.2f}")
        print(f"  R²: {r2:.4f}")
        print(f"  Improvement over baseline: {((baseline_mae - mae) / baseline_mae * 100):.1f}%")
        
    except Exception as e:
        print(f"Error training {name}: {e}")
        import traceback
        traceback.print_exc()
        continue

# Find best model
if results:
    best_model_name = min(results.keys(), key=lambda x: results[x]['mae'])
    best_model = results[best_model_name]['model']
    print(f"\n Best Model: {best_model_name}")


Training XGBoost...
XGBoost Results:
  MAE: $324.66
  RMSE: $1,075.44
  R²: 0.4816
  Improvement over baseline: 63.3%

 Best Model: XGBoost


In [33]:
def train_enhanced_models(X_train_imputed, y_train, X_test_imputed, y_test, baseline_mae=None):
    """Enhanced model training with better validation and ensemble options"""
    
    print("Training enhanced models with improved configurations...")
    print("=" * 60)
    
    # Enhanced models with optimized hyperparameters
    models = {
        'Hist Gradient Boosting': HistGradientBoostingRegressor(
            max_iter=200,
            max_depth=8,
            learning_rate=0.1,
            min_samples_leaf=20,
            l2_regularization=0.1,
            random_state=42
        ),
        'XGBoost': XGBRegressor(
            n_estimators=200,
            max_depth=8,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingRegressor(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            random_state=42
        )
    }
    
    # Add CatBoost if you have categorical features
    try:
        models['CatBoost'] = CatBoostRegressor(
            iterations=200,
            depth=8,
            learning_rate=0.1,
            random_state=42,
            verbose=False
        )
    except:
        print("CatBoost not available, skipping...")
    
    results = {}
    
    for name, model in models.items():
        print(f"\n--- Training {name} ---")
        
        try:
            # Train model
            model.fit(X_train_imputed, y_train)
            y_pred = model.predict(X_test_imputed)
            
            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            
            # Cross-validation for robustness
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(model, X_train_imputed, y_train, 
                                      cv=kf, scoring='neg_mean_absolute_error')
            cv_mae = -cv_scores.mean()
            cv_std = cv_scores.std()
            
            # Store results
            results[name] = {
                'model': model,
                'mae': mae,
                'rmse': rmse,
                'r2': r2,
                'cv_mae': cv_mae,
                'cv_std': cv_std,
                'predictions': y_pred
            }
            
            print(f"Test MAE: ${mae:,.2f}")
            print(f"Test RMSE: ${rmse:,.2f}")
            print(f"Test R²: {r2:.4f}")
            print(f"CV MAE: ${cv_mae:,.2f} ± ${cv_std:,.2f}")
            
            if baseline_mae:
                improvement = ((baseline_mae - mae) / baseline_mae * 100)
                print(f"Improvement over baseline: {improvement:+.1f}%")
            
        except Exception as e:
            print(f"Error training {name}: {e}")
            continue
    
    return results

def create_smart_ensemble(results, X_train_imputed, y_train, X_test_imputed, y_test):
    """Create intelligent ensemble from best performing models"""
    
    # Get top 3 models by MAE
    valid_models = {k: v for k, v in results.items() if 'mae' in v}
    if len(valid_models) < 2:
        print("Not enough models for ensemble")
        return None, None
    
    top_models = sorted(valid_models.items(), key=lambda x: x[1]['mae'])[:3]
    
    print(f"\nCreating ensemble from: {[name for name, _ in top_models]}")
    
    # Create weighted ensemble based on performance
    ensemble_models = []
    weights = []
    
    for name, result in top_models:
        ensemble_models.append((name, result['model']))
        # Weight inversely proportional to MAE (better models get higher weight)
        weight = 1.0 / result['mae']
        weights.append(weight)
    
    # Normalize weights
    total_weight = sum(weights)
    normalized_weights = [w/total_weight for w in weights]
    
    print("Ensemble weights:")
    for (name, _), weight in zip(ensemble_models, normalized_weights):
        print(f"  {name}: {weight:.3f}")
    
    # Create voting regressor with weights
    ensemble = VotingRegressor(
        estimators=ensemble_models,
        weights=normalized_weights
    )
    
    # Train ensemble
    ensemble.fit(X_train_imputed, y_train)
    y_pred_ensemble = ensemble.predict(X_test_imputed)
    
    # Evaluate ensemble
    mae_ens = mean_absolute_error(y_test, y_pred_ensemble)
    rmse_ens = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))
    r2_ens = r2_score(y_test, y_pred_ensemble)
    
    print(f"\nEnsemble Results:")
    print(f"MAE: ${mae_ens:,.2f}")
    print(f"RMSE: ${rmse_ens:,.2f}")
    print(f"R²: {r2_ens:.4f}")
    
    # Compare with best single model
    best_single_mae = top_models[0][1]['mae']
    improvement = ((best_single_mae - mae_ens) / best_single_mae) * 100
    
    print(f"Ensemble vs Best Single Model: {improvement:+.1f}% improvement")
    
    return ensemble, mae_ens

def analyze_model_performance(results, X_test_imputed, y_test, feature_names=None):
    """Comprehensive model performance analysis"""
    
    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE ANALYSIS")
    print("=" * 60)
    
    # Sort models by MAE
    sorted_results = sorted(results.items(), key=lambda x: x[1]['mae'])
    
    print("\nModel Ranking (by MAE):")
    for i, (name, result) in enumerate(sorted_results, 1):
        print(f"{i:2d}. {name:25} | MAE: ${result['mae']:,.2f} | R²: {result['r2']:.4f} | CV: ${result['cv_mae']:,.2f} ± ${result['cv_std']:,.2f}")
    
    best_model_name, best_result = sorted_results[0]
    
    # Error analysis for best model
    print(f"\n--- Error Analysis for {best_model_name} ---")
    y_pred_best = best_result['predictions']
    errors = np.abs(y_test - y_pred_best)
    
    print(f"Mean Absolute Error: ${errors.mean():,.2f}")
    print(f"Median Absolute Error: ${np.median(errors):,.2f}")
    print(f"Max Error: ${errors.max():,.2f}")
    print(f"Error Std: ${errors.std():,.2f}")
    
    # Error distribution
    error_pct = (errors / y_test) * 100
    print(f"\nError Percentage Stats:")
    print(f"  Mean: {error_pct.mean():.1f}%")
    print(f"  Median: {error_pct.median():.1f}%")
    print(f"  95th percentile: {np.percentile(error_pct, 95):.1f}%")
    
    # Worst predictions
    worst_indices = np.argsort(errors)[-5:]
    print(f"\nTop 5 Largest Errors:")
    for idx in worst_indices:
        actual = y_test.iloc[idx] if hasattr(y_test, 'iloc') else y_test[idx]
        predicted = y_pred_best[idx]
        error = abs(actual - predicted)
        error_pct = (error / actual) * 100
        print(f"  Actual: ${actual:,.0f} | Predicted: ${predicted:,.0f} | Error: {error_pct:.1f}%")
    
    return best_model_name, best_result

# Main execution function
def run_enhanced_training(X_train_imputed, y_train, X_test_imputed, y_test, baseline_mae=None):
    """Complete enhanced training pipeline"""
    
    print("Starting Enhanced Model Training Pipeline")
    print("=" * 60)
    
    # 1. Train individual models
    results = train_enhanced_models(X_train_imputed, y_train, X_test_imputed, y_test, baseline_mae)
    
    if not results:
        print("No models trained successfully!")
        return None, None
    
    # 2. Analyze performance
    best_model_name, best_result = analyze_model_performance(results, X_test_imputed, y_test)
    
    # 3. Create ensemble
    ensemble_model, ensemble_mae = create_smart_ensemble(results, X_train_imputed, y_train, X_test_imputed, y_test)
    
    # 4. Final recommendation
    print("\n" + "=" * 60)
    print("FINAL RECOMMENDATION")
    print("=" * 60)
    
    if ensemble_model and ensemble_mae < best_result['mae']:
        print(" RECOMMENDATION: USE ENSEMBLE MODEL")
        print(f"   Ensemble MAE: ${ensemble_mae:,.2f}")
        print(f"   Best Single MAE: ${best_result['mae']:,.2f}")
        print(f"   Improvement: {((best_result['mae'] - ensemble_mae) / best_result['mae'] * 100):+.1f}%")
        final_model = ensemble_model
    else:
        print(f" RECOMMENDATION: USE {best_model_name}")
        print(f"   MAE: ${best_result['mae']:,.2f}")
        print(f"   R²: {best_result['r2']:.4f}")
        print(f"   CV Consistency: ${best_result['cv_mae']:,.2f} ± ${best_result['cv_std']:,.2f}")
        final_model = best_result['model']
    
    return final_model, results

# Usage - replace your existing training code with this:
print("Running enhanced training pipeline...")
final_model, all_results = run_enhanced_training(
    X_train_imputed, y_train, X_test_imputed, y_test, baseline_mae
)

# You can then use final_model for predictions
if final_model:
    #print("\nTraining completed successfully!")
    print(f"Final model type: {type(final_model).__name__}")

Running enhanced training pipeline...
Starting Enhanced Model Training Pipeline
Training enhanced models with improved configurations...

--- Training Hist Gradient Boosting ---
Test MAE: $621.84
Test RMSE: $1,294.54
Test R²: 0.2488
CV MAE: $754.46 ± $146.40
Improvement over baseline: +29.7%

--- Training XGBoost ---
Test MAE: $304.62
Test RMSE: $882.43
Test R²: 0.6510
CV MAE: $625.45 ± $176.00
Improvement over baseline: +65.6%

--- Training Gradient Boosting ---
Test MAE: $338.74
Test RMSE: $985.08
Test R²: 0.5650
CV MAE: $574.69 ± $201.13
Improvement over baseline: +61.7%

--- Training CatBoost ---
Test MAE: $353.12
Test RMSE: $854.73
Test R²: 0.6725
CV MAE: $565.67 ± $158.91
Improvement over baseline: +60.1%

MODEL PERFORMANCE ANALYSIS

Model Ranking (by MAE):
 1. XGBoost                   | MAE: $304.62 | R²: 0.6510 | CV: $625.45 ± $176.00
 2. Gradient Boosting         | MAE: $338.74 | R²: 0.5650 | CV: $574.69 ± $201.13
 3. CatBoost                  | MAE: $353.12 | R²: 0.6725 | CV

In [34]:
def enhance_commercial_features_targeted(commercial_enriched):
    """
    Add features specifically for problematic property types
    """
    df = commercial_enriched.copy()
    
    # 1. Conservation area indicator
    conservation_keywords = ['conservation', 'heritage', 'shophouse', 'shop house']
    df['is_conservation'] = df['Project Name'].str.lower().str.contains(
        '|'.join(conservation_keywords), na=False
    ).astype(int)
    
    # 2. Size-based features for large properties
    if 'Area (SQM)' in df.columns:
        df['is_large_property'] = (df['Area (SQM)'] > 150).astype(int)
        df['is_very_large'] = (df['Area (SQM)'] > 300).astype(int)
        
        # Size-price interaction
        df['area_price_interaction'] = df['Area (SQM)'] * df.get('Unit Price ($ PSM)', 1)
    
    # 3. Premium building indicator
    premium_buildings = ['peninsula', 'marina', 'orchard', 'raffles', 'capital', 'suntec']
    df['is_premium_building'] = df['Project Name'].str.lower().str.contains(
        '|'.join(premium_buildings), na=False
    ).astype(int)
    
    # 4. Property type interactions
    property_dummies = pd.get_dummies(df['Property Type'], prefix='type')
    df = pd.concat([df, property_dummies], axis=1)
    
    # 5. Location clusters based on performance
    high_error_locations = ['peninsula', 'little india', 'telok ayer', 'oxley']
    df['is_high_variance_location'] = df['Project Name'].str.lower().str.contains(
        '|'.join(high_error_locations), na=False
    ).astype(int)
    
    print(f"Added targeted features for problem areas")
    print(f"   Conservation areas: {df['is_conservation'].sum()} properties")
    print(f"   Large properties: {df['is_large_property'].sum()} properties")
    print(f"   Premium buildings: {df['is_premium_building'].sum()} properties")
    
    return df

# Apply targeted feature engineering
commercial_enriched = enhance_commercial_features_targeted(commercial_enriched)

Added targeted features for problem areas
   Conservation areas: 522 properties
   Large properties: 432 properties
   Premium buildings: 165 properties


In [35]:
def create_property_type_models(commercial_enriched):
    """
    Create and train separate models for each property type
    """
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    import numpy as np
    import pandas as pd
    
    target_column = 'Unit Price ($ PSF)'
    
    exclude_columns = ['Transacted Price ($)', 'Unit Price ($ PSM)',
        'Unit Price ($ PSF)',
        'monthly_rental_price_yield',
        'rental_rate_psm_yield',
        'monthly_rental_price_tenure',
        'rental_rate_psm_tenure',
        'monthly_rental_price_market',
        'rental_rate_psm_avg',
        'monthly_rental_price_avg',
        'market_rent_rate_psm',
        'annual_rental_income_yield','implied_yield_market']
    
    print("CREATING PROPERTY-TYPE SPECIFIC MODELS")
    print("="*50)
    
    property_type_models = {}
    
    def handle_missing_values(X, strategy='mean'):
        """Handle missing values in the feature matrix"""
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        return pd.DataFrame(X_imputed, columns=X.columns, index=X.index), imputer
    
    for prop_type in commercial_enriched['Property Type'].unique():
        print(f"\n PROCESSING {prop_type.upper()}...")
        
        # Filter data for this property type
        type_mask = commercial_enriched['Property Type'] == prop_type
        type_data = commercial_enriched[type_mask]
        
        # Skip if not enough samples
        if len(type_data) < 30:
            print(f"   ⚠  Skipped - only {len(type_data)} samples (need at least 30)")
            continue
        
        print(f"   Samples: {len(type_data)}")
        
        # Use YOUR EXACT feature selection logic
        feature_columns = [col for col in type_data.columns 
                          if col not in exclude_columns and col != target_column]
        
        # Use YOUR EXACT preprocessing
        categorical_columns = type_data[feature_columns].select_dtypes(include=['object', 'category']).columns
        numerical_columns = type_data[feature_columns].select_dtypes(include=[np.number]).columns
        
        print(f"   Features: {len(feature_columns)} total, {len(categorical_columns)} categorical, {len(numerical_columns)} numerical")
        
        # One-hot encode using YOUR method
        X_encoded = pd.get_dummies(type_data[feature_columns], columns=categorical_columns, drop_first=True)
        y = type_data[target_column]
        
        # Split the data
        X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
            X_encoded, y, test_size=0.2, random_state=42, shuffle=True
        )
        
        # Handle missing values using FIXED function
        X_train_imputed_type, imputer = handle_missing_values(X_train_type)
        X_test_imputed_type = pd.DataFrame(imputer.transform(X_test_type), 
                                         columns=X_test_type.columns, index=X_test_type.index)
        
        print(f"   Training set: {X_train_imputed_type.shape}")
        print(f"   Test set: {X_test_imputed_type.shape}")
        
        # Train model (using your preferred algorithm)
        model_type = RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            min_samples_split=10,
            random_state=42,
            n_jobs=-1
        )
        
        model_type.fit(X_train_imputed_type, y_train_type)
        
        # Evaluate
        y_pred_type = model_type.predict(X_test_imputed_type)
        mae_type = mean_absolute_error(y_test_type, y_pred_type)
        rmse_type = np.sqrt(mean_squared_error(y_test_type, y_pred_type))
        r2_type = r2_score(y_test_type, y_pred_type)
        
        print(f"   Performance - MAE: ${mae_type:,.2f}, RMSE: ${rmse_type:,.2f}, R²: {r2_type:.4f}")
        
        # Store the model and its preprocessing information
        property_type_models[prop_type] = {
            'model': model_type,
            'feature_columns': feature_columns,
            'categorical_columns': categorical_columns.tolist(),
            'imputer': imputer,
            'performance': {
                'mae': mae_type,
                'rmse': rmse_type, 
                'r2': r2_type
            },
            'training_samples': len(X_train_imputed_type),
            'feature_names_after_encoding': X_train_imputed_type.columns.tolist()
        }
    
    print(f"\n CREATED {len(property_type_models)} PROPERTY-TYPE MODELS:")
    for prop_type, info in property_type_models.items():
        perf = info['performance']
        print(f"   {prop_type:15} | MAE: ${perf['mae']:,.2f} | R²: {perf['r2']:.4f} | Samples: {info['training_samples']}")
    
    return property_type_models

# Usage:
property_type_models = create_property_type_models(commercial_enriched)

CREATING PROPERTY-TYPE SPECIFIC MODELS

 PROCESSING SHOP HOUSE...
   Samples: 526
   Features: 37 total, 10 categorical, 24 numerical
   Training set: (420, 454)
   Test set: (106, 454)
   Performance - MAE: $498.70, RMSE: $985.59, R²: 0.9189

 PROCESSING RETAIL...
   Samples: 1050
   Features: 37 total, 10 categorical, 24 numerical
   Training set: (840, 626)
   Test set: (210, 626)
   Performance - MAE: $239.25, RMSE: $433.08, R²: 0.8980

 PROCESSING OFFICE...
   Samples: 1348
   Features: 37 total, 10 categorical, 24 numerical
   Training set: (1078, 456)
   Test set: (270, 456)
   Performance - MAE: $251.75, RMSE: $688.73, R²: 0.7874

🎉 CREATED 3 PROPERTY-TYPE MODELS:
   Shop House      | MAE: $498.70 | R²: 0.9189 | Samples: 420
   Retail          | MAE: $239.25 | R²: 0.8980 | Samples: 840
   Office          | MAE: $251.75 | R²: 0.7874 | Samples: 1078


In [36]:
import pickle
import datetime

# IMPORTANT: Save the property-type-specific models from Cell 34
# These models have much better performance (R²: 0.79-0.92) than the combined model
# Each property type has its own model, feature set, and imputer

# Check if property_type_models exists (from Cell 34)
if 'property_type_models' not in globals() or not property_type_models:
    print(" WARNING: property_type_models not found. Please run Cell 34 first to create property-type-specific models.")
    print("   Falling back to combined model (less accurate)...")
    
    # Fallback: Use combined model if property-type models don't exist
    print("Running enhanced training pipeline...")
    final_model, all_results = run_enhanced_training(
        X_train_imputed, y_train, X_test_imputed, y_test, baseline_mae
    )
    
    if final_model:
        sorted_results = sorted(all_results.items(), key=lambda x: x[1]['mae'])
        best_model_name, best_result = sorted_results[0]
        deployment_package = {
            'model': final_model,  # Single combined model
            'feature_names': list(X_train_imputed.columns),
            'performance': best_result,
            'timestamp': datetime.datetime.now(),
            'model_type': 'single_model',
            'is_property_type_specific': False
        }
    else:
        print(" Failed to create model")
        deployment_package = None
else:
    # Use property-type-specific models (preferred - better accuracy)
    print(" Saving property-type-specific models...")
    print(f"   Models for: {list(property_type_models.keys())}")
    
    # Create deployment package with property-type-specific models
    deployment_package = {
        'model': property_type_models,  # Dictionary of property-type-specific models
        'timestamp': datetime.datetime.now(),
        'model_type': 'property_type_specific',
        'is_property_type_specific': True,
        'property_types': list(property_type_models.keys()),
        'model_info': {}
    }
    
    # Add metadata for each property type model
    for prop_type, model_info in property_type_models.items():
        deployment_package['model_info'][prop_type] = {
            'feature_names_after_encoding': model_info['feature_names_after_encoding'],
            'feature_columns': model_info['feature_columns'],
            'categorical_columns': model_info['categorical_columns'],
            'performance': model_info['performance'],
            'training_samples': model_info['training_samples'],
            'n_features': len(model_info['feature_names_after_encoding'])
        }
        print(f"   {prop_type}: {len(model_info['feature_names_after_encoding'])} features, R²={model_info['performance']['r2']:.4f}")

# Save to PKL file
if deployment_package:
    filename = 'commercial_real_estate_model_final.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(deployment_package, f)
    print(f"\n Model saved to: {filename}")
    print(f"   Model type: {deployment_package.get('model_type', 'unknown')}")
    print(f"   Property-type-specific: {deployment_package.get('is_property_type_specific', False)}")
else:
    print(" Failed to create deployment package")


 Saving property-type-specific models...
   Models for: ['Shop House', 'Retail', 'Office']
   Shop House: 454 features, R²=0.9189
   Retail: 626 features, R²=0.8980
   Office: 456 features, R²=0.7874

 Model saved to: commercial_real_estate_model_final.pkl
   Model type: property_type_specific
   Property-type-specific: True
