In [None]:
# Phase 3 – I'm building a baseline model.
# ================================================================
# I'm keeping it simple here, just using pandas and scikit-learn.
# I'll add a lot of comments so it's easy to see what I'm doing.

import pandas as pd
import numpy as np
from pathlib import Path

# I'll need these for the machine learning part.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.impute import SimpleImputer # I need this to handle missing values.

# 1. Read the processed dataset into `df`, parsing 'sale_date' as dates on load.
# The location is relative to the notebook, so it has to be updated if the
# notebook is moved.
clean_path = Path("../data/processed/clean_hdb.csv")
df = pd.read_csv(
    clean_path,
    parse_dates=["sale_date"],
)

In [None]:
# Re-apply the datetime conversion so 'sale_date' is datetime-typed before
# any date-based sorting.
df['sale_date'] = pd.to_datetime(df['sale_date'])

# --- Feature engineering / preprocessing (column choices follow df.info()). ---

# Columns removed before encoding, each with the reason for removing it.
cols_to_drop_pre_encoding = [
    '_id',          # plain identifier
    'month',        # redundant: 'sale_month' is already available
    'town',         # redundant when the town_X columns are already present
    'flat_type',    # same reasoning as 'town'
    'block',        # too many unique values for linear regression
    'street_name',  # same reasoning as 'block'
]
# errors="ignore" skips any listed column that is not present in df.
df = df.drop(columns=cols_to_drop_pre_encoding, errors="ignore")

# Whatever is still object-typed gets one-hot encoded
# (presumably only 'storey_range' and 'flat_model').
categorical_cols_to_encode = [
    name for name, column_dtype in df.dtypes.items() if column_dtype == 'object'
]

# One-hot encode those columns, dropping the first level of each.
if len(categorical_cols_to_encode) > 0:
    df = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=True)


In [None]:
# --- Handle missing values (NaNs) with median imputation. ---
# The median is used because it is robust to outliers. It is learned from the
# training period only (sale_year <= 2023, the same cutoff as the train/test
# split in section 3.1) so that statistics from the hold-out years do not leak
# into the preprocessing of the training data.
numerical_cols_with_nans = ['lease_remaining_years', 'flat_age']
imputation_fit_mask = df['sale_year'] <= 2023  # keep in sync with the split cutoff

imputer = SimpleImputer(strategy='median')
imputer.fit(df.loc[imputation_fit_mask, numerical_cols_with_nans])
# transform() returns a NumPy array; assigning it back into the column subset
# keeps df's index and column labels.
df[numerical_cols_with_nans] = imputer.transform(df[numerical_cols_with_nans])

# Verify the imputation actually removed every NaN from these columns.
remaining_nans = df[numerical_cols_with_nans].isna().sum()
assert remaining_nans.sum() == 0, f"NaNs remain after imputation:\n{remaining_nans}"

In [None]:
# ----------------------------------------------------------------
# 3.1  Time-based train/test split.
# ----------------------------------------------------------------

# 3.1.1 – Put the rows in true chronological order. Ordering by 'sale_year'
# alone leaves the rows inside each year in arbitrary order, yet the
# TimeSeriesSplit in section 3.3 cuts its folds by row position, so
# 'sale_date' is used as a tie-breaker within each year.
df = df.sort_values(["sale_year", "sale_date"])

# 3.1.2 – Train on every sale up to and including 2023; hold out 2024 onwards.
train = df[df.sale_year <= 2023]
test  = df[df.sale_year >= 2024]

# An empty side would otherwise only surface later as an opaque fitting or
# scoring error, so fail here with a clear message.
assert len(train) > 0 and len(test) > 0, \
    "Time-based split produced an empty train or test set."

# Features: everything except the target, 'price_per_sqm' (derived from the
# target, so it would leak it) and the raw 'sale_date' column.
y_col = "resale_price"
X_cols = [c for c in df.columns
          if c not in [y_col, "price_per_sqm", "sale_date"]]

# 3.1.3 – Feature matrices and target vectors for both sides of the split.
X_train, y_train = train[X_cols], train[y_col]
X_test , y_test  = test [X_cols], test [y_col]

In [None]:
# ================================================================
# QA plan – sanity checks on the model inputs, plus one exploratory
# comparison against a log-transformed target.
# Everything here only prints; nothing is raised on a failed check.
# ================================================================

print("\n--- QA Checks ---")

# 1. Column parity: X_train and X_test must expose the same feature columns.
missing_in_train = set(X_test.columns) - set(X_train.columns)
missing_in_test = set(X_train.columns) - set(X_test.columns)

if missing_in_train or missing_in_test:
    print("WARNING: Column mismatch between X_train and X_test!")
    if missing_in_train:
        print(f"  Columns in X_test but not in X_train: {missing_in_train}")
    if missing_in_test:
        print(f"  Columns in X_train but not in X_test: {missing_in_test}")
    # NOTE(review): this is a critical condition; consider asserting instead of warning.
else:
    print("✅ Column parity: X_train and X_test have identical columns.")


# 2. No NaNs may reach the model.
train_nans = X_train.isna().sum().sum()
test_nans = X_test.isna().sum().sum()
if train_nans == 0 and test_nans == 0:
    print("✅ No NaNs found in X_train or X_test.")
else:
    print(f"❌ NaNs found! X_train has {train_nans} NaNs, X_test has {test_nans} NaNs.")


# 3. Test-set size relative to the whole dataset.
test_share = len(test) / len(df)
print(f"✅ Test set share: {test_share:.2%}")
if test_share < 0.05:  # below 5% the hold-out metrics are likely to be noisy
    print("   WARNING: Test set is less than 5% of total data, metrics might be noisy.")
elif test_share > 0.30:  # arbitrary upper limit; adjust as needed
    print("   Note: Test set is quite large, reducing training data significantly.")


# 4. Condition number as a multicollinearity indicator.
try:
    # Select boolean columns as well as numeric ones and cast everything to
    # float: np.number alone skips bool columns, which is the dtype recent
    # pandas versions give the one-hot columns produced by get_dummies.
    X_train_numeric = (
        X_train.select_dtypes(include=[np.number, "bool"]).to_numpy(dtype=float)
    )
    if X_train_numeric.size > 0:  # only meaningful when there is something to measure
        condition_number = np.linalg.cond(X_train_numeric)
        print(f"✅ Condition number for X_train: {condition_number:.2e}")
        if condition_number > 1e8:
            print("   WARNING: High condition number indicates severe multicollinearity.")
            print("            This affects coefficient interpretability but usually not prediction accuracy.")
    else:
        print("ℹ️ No numeric columns found in X_train to compute condition number.")
except Exception as e:
    # Best-effort diagnostic: report the problem and carry on with the other checks.
    print(f"❌ Error computing condition number: {e}")


# 5. Exploration (not a bug check): do the metrics improve with a log1p target?
try:
    # Reference MAE on the raw target, fitted right here. The baseline cell
    # that defines `mae` comes later in the notebook, so relying on it would
    # raise NameError on a fresh Restart & Run All.
    baseline_mae = mean_absolute_error(
        y_test, LinearRegression().fit(X_train, y_train).predict(X_test)
    )

    lr_log = LinearRegression()
    y_train_log = np.log1p(y_train)
    y_test_log = np.log1p(y_test)  # log-scale test target; not used by the metrics below

    lr_log.fit(X_train, y_train_log)
    preds_log = lr_log.predict(X_test)

    # Undo the log1p so MAE/RMSE are on the original price scale.
    preds_original_scale = np.expm1(preds_log)

    mae_log_transformed = mean_absolute_error(y_test, preds_original_scale)
    rmse_log_transformed = np.sqrt(mean_squared_error(y_test, preds_original_scale))

    print("\n--- Log-transformed target metrics (for comparison) ---")
    print(f"MAE (log-transformed): {mae_log_transformed:.1f}")
    print(f"RMSE (log-transformed): {rmse_log_transformed:.1f}")
    if mae_log_transformed < baseline_mae:
        print("   Note: Log-transformation slightly improved MAE.")
    else:
        print("   Note: Log-transformation did not improve MAE for this baseline.")

except Exception as e:
    # Exploratory step: a failure here should not stop the notebook.
    print(f"\n❌ Error during log-target metric calculation: {e}")


print("--- QA Checks Complete ---\n")

In [None]:
# ----------------------------------------------------------------
# 3.2  Baseline Linear Regression.
# ----------------------------------------------------------------
# Fit on the training period and score on the hold-out period.
lr = LinearRegression()
lr.fit(X_train, y_train)

preds = lr.predict(X_test)
mae  = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)  # RMSE is on the same scale as the target

# One-row summary table; metrics are rounded to one decimal place.
results = pd.DataFrame({
    "model": ["LinearRegression"],
    "MAE_SGD": [round(mae, 1)],
    "RMSE_SGD": [round(rmse, 1)],
})
report_path = Path("../reports/baseline_metrics.csv")
# to_csv does not create missing folders, so make sure the reports directory exists.
report_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(report_path, index=False)
print(results)


In [None]:
# ----------------------------------------------------------------
# 3.3  I'll do a quick time-series cross-validation on the train set.
#      (5 splits, with an expanding window)
# ----------------------------------------------------------------
tscv = TimeSeriesSplit(n_splits=5)
cv_mae = -cross_val_score(lr, X_train, y_train,
                          cv=tscv,
                          scoring="neg_mean_absolute_error")
print(f"CV MAE (mean ± std): {cv_mae.mean():.0f} ± {cv_mae.std():.0f} SGD")

In [None]:
'''
Here's what I'm doing:

    •   Chronological Split: I'm setting up a time-based train/test split. I'll train the model on data up to 2023 and predict prices for 2024 and later. This is how it would work in the real world, so I'm not accidentally leaking future data into my training.

    •   Feature Engineering & Selection:
        •   I'm excluding the target variable, 'resale_price'.
        •   I'm also excluding 'price_per_sqm' to avoid target leakage, since it's derived from 'resale_price'.
        •   I'm dropping the '_id' identifier and high-cardinality string columns like 'block' and 'street_name' because they aren't useful for linear regression without a lot of extra encoding work.
        •   I'm excluding the raw 'sale_date' column and the redundant 'month' column, and using the numerical year and month features ('sale_year', 'sale_month') instead.
        •   I'm using the one-hot encoded columns I already have and encoding any other categorical strings.
        •   I'm imputing missing numerical values with the median.

    •   Baseline Model - Linear Regression: I'm using a simple Linear Regression model. I chose it because it's simple and doesn't have any hyperparameters, so it gives me a clear baseline to compare against more complex models.

    •   My metrics:
        •   MAE (Mean Absolute Error): This is the average absolute difference between my predicted and actual prices. It's easy to understand.
        •   RMSE (Root Mean Squared Error): This penalizes larger errors more heavily. It's a good way to compare different models.
'''
