In [None]:
import itertools
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import os
import math

INPUT_FILE = "suburb_info.xlsx"
FEATURES = ["number_of_houses", "number_of_units", "population", "aus_born_perc", "median_income"]
TARGET = "median_house_price"
LAMBDAS = [-1.0, -0.5, 0.0, 0.5, 1.0]
SCALERS = ["minmax", "robust", "zscore"]
CV_FOLDS = 3
RANDOM_STATE = 0

# Load
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"Could not find {INPUT_FILE} in the working directory. Make sure the file is present.")

df = pd.read_excel(INPUT_FILE)

# Convert 'aus_born_perc' from '67%' → 67.0 (float)
df["aus_born_perc"] = df["aus_born_perc"].astype(str).str.replace('%', '', regex=False)
df["aus_born_perc"] = pd.to_numeric(df["aus_born_perc"], errors="coerce")

# Convert 'median_income' from '$1,583' → 1583 (int)
df["median_income"] = (
    df["median_income"]
    .astype(str)
    .str.replace(r"[\$,]", "", regex=True)
    .astype(float)
)

# Convert 'median_house_price' from '$1,148,100' → 1148100 (int)
df["median_house_price"] = (
    df["median_house_price"]
    .astype(str)
    .str.replace(r"[\$,]", "", regex=True)
    .astype(float)
)

# Check positives (Box-Cox requires > 0)
for col in FEATURES + [TARGET]:
    if (df[col] <= 0).any():
        raise ValueError(f"Column {col} contains non-positive values; Box-Cox requires strictly positive values.")

X_orig = df[FEATURES].astype(float).copy()
y = df[TARGET].astype(float).copy()

results = []
kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
model = LinearRegression()

lambda_combos = list(itertools.product(LAMBDAS, repeat=len(FEATURES)))
total_runs = len(SCALERS) * len(lambda_combos)
print(f"Total runs to evaluate: {total_runs}")

for scaler_name in SCALERS:
    for lambdas in lambda_combos:
        # Box-Cox transform each feature with its lambda
        X_trans = np.zeros_like(X_orig.values, dtype=float)
        for i, col in enumerate(FEATURES):
            lam = lambdas[i]
            X_trans[:, i] = stats.boxcox(X_orig.iloc[:, i].values, lmbda=lam)

        # Scale
        if scaler_name == "minmax":
            scaler = MinMaxScaler(feature_range=(0, 1))
        elif scaler_name == "robust":
            scaler = RobustScaler()
        else:
            scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_trans)

        # CV
        r2_scores = cross_val_score(model, X_scaled, y.values, cv=kf, scoring="r2")
        neg_mse_scores = cross_val_score(model, X_scaled, y.values, cv=kf, scoring="neg_mean_squared_error")
        mse_scores = -neg_mse_scores
        rmse_scores = np.sqrt(mse_scores)

        results.append({
            "scaler": scaler_name,
            "lambdas": tuple(lambdas),
            "mean_R2": float(np.mean(r2_scores)),
            "std_R2": float(np.std(r2_scores)),
            "mean_RMSE": float(np.mean(rmse_scores)),
            "std_RMSE": float(np.std(rmse_scores))
        })

res_df = pd.DataFrame(results).sort_values(by="mean_R2", ascending=False).reset_index(drop=True)
out_path = "boxcox_scaling_results.csv"
res_df.to_csv(out_path, index=False)

print("Best result (by mean R^2):")
best = res_df.iloc[0]
print(f"  scaler: {best['scaler']}")
print(f"  lambdas (per feature order {FEATURES}): {best['lambdas']}")
print(f"  mean R^2: {best['mean_R2']:.6f} (std {best['std_R2']:.6f})")
print(f"  mean RMSE: {best['mean_RMSE']:.6f} (std {best['std_RMSE']:.6f})")
print(f"All results saved to: {out_path}")


{'number_of_houses': [np.float64(0.0), np.float64(0.5), np.float64(-0.5)], 'number_of_units': [np.float64(0.0), np.float64(0.5), np.float64(-0.5)], 'population': [np.float64(0.5), np.float64(1.0), np.float64(1.5)], 'aus_born_perc': [np.float64(3.0), np.float64(2.5), np.float64(3.5)], 'median_income': [np.float64(0.5), np.float64(1.0), np.float64(0.0)]}
Total runs to evaluate: 729
Best result (by mean R^2):
  scaler: minmax
  lambdas (per feature order ['number_of_houses', 'number_of_units', 'population', 'aus_born_perc', 'median_income']): (np.float64(0.5), np.float64(0.0), np.float64(1.5), np.float64(2.5), np.float64(1.0))
  mean R^2: 0.666431 (std 0.078910)
  mean RMSE: 270254.737312 (std 37885.187524)
All results saved to: boxcox_scaling_results.csv


In [17]:
print(df)

               suburb  number_of_houses  number_of_units   municipality  \
0          ABBOTSFORD              2304             4706          Yarra   
1          ABERFELDIE              1410              453  Moonee Valley   
2           ALBANVALE              1897              138       Brimbank   
3              ALBION              1389             1392       Brimbank   
4          ALPHINGTON              1729             1099        Darebin   
..                ...               ...              ...            ...   
197  WILLIAMS LANDING              2735              173        Wyndham   
198           WINDSOR              2201             4448    Stonnington   
199           WOLLERT              6516              259     Whittlesea   
200         YALLAMBIE              1286               81        Banyule   
201        YARRAVILLE              5855             2072    Maribyrnong   

     aus_born_perc  median_income  median_house_price  population  
0               68         1797

I have a data file, suburb_info.xlsx, which contain the following non-zero positive numeric columns: number_of_houses, number_of_units, population, aus_born_perc, median_income, median_house_price. I want to make a linear model to predict median_house_price using the other 5 attributes.  for this, i want the features to be on the same scale, and to have as much linear relationship with median_house_price as possible.  

to do this, i want to create the following framework:  
each individual attribute should be transformed with box-cox power transformation with varying values of lambda (the 5 common values, -1, -0.5, 0, 0.5, 1 will do).  
then, all 5 attributes should be scaled (with the same scale) of either:   
1. min-max scaling to [0, 1] = $$x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$$   
2. robust scaling: $$x_{scaled} = \frac{x - x_{median}}{IQR(x)}$$   

the framework should run a simple linear model on every permutation of picking one scaling method for all 5 attributes, and then varying lambda values for box-cox transformation for each of the 5 attributes.  
save the RMSE and R^2 metrics for each permutation, and denote which scaling and box-cox lambda parameters yields the best R^2 value.  
to optimize performance, use only 3-fold CV.