In [11]:
# Phase 3 – Baseline modelling
# ================================================================
# Simple > complex.  Everything here is pure pandas / scikit-learn.
# lots of #comments so anyone can tweak ranges or models.

import pandas as pd
import numpy as np
from pathlib import Path

# ML
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.impute import SimpleImputer # Import SimpleImputer

# 1. Load
clean_path = Path("../data/processed/clean_hdb.csv")  # adjust if notebook lives elsewhere
df = pd.read_csv(clean_path, parse_dates=["sale_date"])

In [12]:
# Convert 'sale_date' to datetime if it's not already (important for sorting if using date directly)
df['sale_date'] = pd.to_datetime(df['sale_date'])

# --- Feature Engineering / Preprocessing based on your df.info() ---

# Identify columns to drop (identifiers, redundant string columns, highly cardinal strings)
cols_to_drop_pre_encoding = [
    '_id',        # Identifier
    'month',      # Redundant with 'sale_month' and is a string
    'town',       # Redundant if town_X columns are already present and used
    'flat_type',  # Redundant if flat_type_X columns are already present and used
    'block',      # Highly cardinal string, not suitable for direct encoding in Linear Regression
    'street_name' # Highly cardinal string, not suitable for direct encoding in Linear Regression
]
# Drop these columns if they exist in the DataFrame
df = df.drop(columns=[col for col in cols_to_drop_pre_encoding if col in df.columns])

# Identify remaining object Dtype columns that need one-hot encoding
# Based on your df.info(), these are likely 'storey_range' and 'flat_model'
categorical_cols_to_encode = [
    c for c in df.columns if df[c].dtype == 'object'
]

# Apply one-hot encoding to the remaining specified categorical columns
if categorical_cols_to_encode:
    df = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=True)


In [13]:
# --- Handle Missing Values (NaN) ---
# Check for NaNs before imputation (optional, for verification)
# print("Columns with NaNs before imputation:")
# print(df.isnull().sum()[df.isnull().sum() > 0])

# Impute missing values for numerical columns.
# Based on your df.info(), these are 'lease_remaining_years' and 'flat_age'.
# We will use median imputation as it's robust to outliers.
numerical_cols_with_nans = ['lease_remaining_years', 'flat_age']

# Create an imputer
# You can use strategy='mean', 'median', or 'most_frequent'
imputer = SimpleImputer(strategy='median')

# Apply imputation. It's best to apply this to the full DataFrame before splitting
# to ensure consistent imputation across train and test sets.
# SimpleImputer returns a numpy array, so we convert it back to a DataFrame
# and preserve column names.
df[numerical_cols_with_nans] = imputer.fit_transform(df[numerical_cols_with_nans])

# Verify no more NaNs in these columns (optional)
# print("\nColumns with NaNs after imputation:")
# print(df.isnull().sum()[df.isnull().sum() > 0])


In [14]:
# ----------------------------------------------------------------
# 3 .1  Time-based train / test split
# ----------------------------------------------------------------

df = df.sort_values("sale_year")          # 3 .1 .1 – chronological order

train = df[df.sale_year <= 2023]          # 3 .1 .2 – up to 2023 inclusive
test  = df[df.sale_year >= 2024]          # last two years = hold-out

# drop obvious target leakage columns
y_col = "resale_price"
X_cols = [c for c in df.columns
          if c not in [y_col, "price_per_sqm", "sale_date"]]

X_train, y_train = train[X_cols], train[y_col]   # 3 .1 .3
X_test , y_test  = test [X_cols], test [y_col]

In [22]:
# ================================================================
# QA Plan - Implementing ChatGPT's checks
# ================================================================

print("\n--- QA Checks ---")

# 1. Column parity (Mis-aligned train/test columns after encoding)
missing_in_train = set(X_test.columns) - set(X_train.columns)
missing_in_test = set(X_train.columns) - set(X_test.columns)

if missing_in_train or missing_in_test:
    print(f"WARNING: Column mismatch between X_train and X_test!")
    if missing_in_train:
        print(f"  Columns in X_test but not in X_train: {missing_in_train}")
    if missing_in_test:
        print(f"  Columns in X_train but not in X_test: {missing_in_test}")
    # For a critical error, you might want to assert False or raise an error here
    # assert not missing_in_train and not missing_in_test, "Column mismatch detected!"
else:
    print("✅ Column parity: X_train and X_test have identical columns.")


# 2. No NaNs in model input
train_nans = X_train.isna().sum().sum()
test_nans = X_test.isna().sum().sum()
if train_nans == 0 and test_nans == 0:
    print("✅ No NaNs found in X_train or X_test.")
else:
    print(f"❌ NaNs found! X_train has {train_nans} NaNs, X_test has {test_nans} NaNs.")


# 3. Check test-set size
test_share = len(test) / len(df)
print(f"✅ Test set share: {test_share:.2%}")
if test_share < 0.05: # Using ChatGPT's 5% threshold
    print("   WARNING: Test set is less than 5% of total data, metrics might be noisy.")
elif test_share > 0.30: # Arbitrary upper limit, adjust as needed
    print("   Note: Test set is quite large, reducing training data significantly.")


# 4. Condition number for multicollinearity
try:
    from numpy.linalg import cond
    # Convert to float to avoid potential integer overflow for large numbers
    # Ensure all columns are numeric after all processing
    X_train_numeric = X_train.select_dtypes(include=np.number).to_numpy()
    if X_train_numeric.size > 0: # Check if there are numeric columns to compute cond on
        condition_number = cond(X_train_numeric)
        print(f"✅ Condition number for X_train: {condition_number:.2e}")
        if condition_number > 1e8:
            print("   WARNING: High condition number indicates severe multicollinearity.")
            print("            This affects coefficient interpretability but usually not prediction accuracy.")
    else:
        print("ℹ️ No numeric columns found in X_train to compute condition number.")

except ImportError:
    print("ℹ️ numpy.linalg.cond not available. Skipping multicollinearity check.")
except Exception as e:
    print(f"❌ Error computing condition number: {e}")


# 5. Re-run metrics with log-target for curiosity (Optional, but good for exploration)
# This isn't a "bug check" but a recommended exploration
try:
    lr_log = LinearRegression()
    # Apply log1p transformation to y_train and y_test
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)

    lr_log.fit(X_train, y_train_log)
    preds_log = lr_log.predict(X_test)

    # Inverse transform predictions for MAE and RMSE calculation
    preds_original_scale = np.expm1(preds_log)

    mae_log_transformed = mean_absolute_error(y_test, preds_original_scale)
    rmse_log_transformed = np.sqrt(mean_squared_error(y_test, preds_original_scale))

    print(f"\n--- Log-transformed target metrics (for comparison) ---")
    print(f"MAE (log-transformed): {mae_log_transformed:.1f}")
    print(f"RMSE (log-transformed): {rmse_log_transformed:.1f}")
    if mae_log_transformed < mae:
        print("   Note: Log-transformation slightly improved MAE.")
    else:
        print("   Note: Log-transformation did not improve MAE for this baseline.")

except Exception as e:
    print(f"\n❌ Error during log-target metric calculation: {e}")


print("--- QA Checks Complete ---\n")


--- QA Checks ---
✅ Column parity: X_train and X_test have identical columns.
✅ No NaNs found in X_train or X_test.
✅ Test set share: 4.50%
✅ Condition number for X_train: 3.85e+04

--- Log-transformed target metrics (for comparison) ---
MAE (log-transformed): 81783.3
RMSE (log-transformed): 109381.3
   Note: Log-transformation slightly improved MAE.
--- QA Checks Complete ---



In [23]:
# ----------------------------------------------------------------
# 3 .2  Baseline Linear Regression
# ----------------------------------------------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)

preds = lr.predict(X_test)
mae  = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

results = pd.DataFrame({
    "model": ["LinearRegression"],
    "MAE_SGD": [round(mae, 1)],
    "RMSE_SGD": [round(rmse, 1)],
})
report_path = Path("../reports/baseline_metrics.csv")
results.to_csv(report_path, index=False)
print(results)


              model  MAE_SGD  RMSE_SGD
0  LinearRegression  83733.9  119830.1


In [21]:
# ----------------------------------------------------------------
# 3 .3  Quick Time-series cross-validation on the train set
#     (5 splits, expanding window)
# ----------------------------------------------------------------
tscv = TimeSeriesSplit(n_splits=5)
cv_mae = -cross_val_score(lr, X_train, y_train,
                          cv=tscv,
                          scoring="neg_mean_absolute_error")
print(f"CV MAE (mean ± std): {cv_mae.mean():.0f} ± {cv_mae.std():.0f} SGD")

CV MAE (mean ± std): 73752 ± 17389 SGD


In [None]:
'''
What’s happening:

    •   Chronological Split: We establish a time-based train/test split. The model trains on data up to 2023 inclusive and predicts prices for 2024 and onwards. This setup accurately mimics real-world deployment scenarios, ensuring no future data "leaks" into the training process.

    •   Feature Engineering & Selection:
        •   The target variable, 'resale_price', is excluded.
        •   'price_per_sqm' is also excluded to prevent target leakage, as it's directly derived from 'resale_price'.
        •   Highly cardinal string identifiers like '_id', 'block', and 'street_name' are dropped, as they are generally not suitable for direct use in linear regression without complex feature engineering.
        •   Raw date strings ('month', 'sale_date') are excluded, relying instead on numerical year and month features.
        •   Existing one-hot encoded columns (like 'town_X', 'flat_type_X') are utilized, and any remaining categorical strings ('storey_range', 'flat_model') are one-hot encoded to convert them into a numerical format suitable for the model.
        •   Missing numerical values in 'lease_remaining_years' and 'flat_age' are imputed using the median strategy for robustness.

    •   Baseline Model - Linear Regression: We use a straightforward Linear Regression model. This choice is deliberate due to its simplicity and lack of hyperparameters, providing a clear and fast baseline MAE (Mean Absolute Error) and RMSE (Root Mean Squared Error) to evaluate against more complex models (like tree-based methods) in subsequent phases.

    •   Metrics Explained:
        •   MAE (Mean Absolute Error): Represents the average absolute difference between predicted and actual prices, offering an easily interpretable measure of typical prediction error in dollar terms.
        •   RMSE (Root Mean Squared Error): Penalizes larger prediction errors more heavily than smaller ones. It's a commonly used metric for regression that provides a good comparative measure across models.
'''
