In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    OneHotEncoder, 
    RobustScaler, 
    FunctionTransformer, 
    OrdinalEncoder
)
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
import optuna



df = pd.read_csv('data/train_rm_OL.csv')

# Load Data (assuming df is already loaded)
target = "SalePrice"

# Split X and y
y = np.log1p(df[target])  # Log-transforming the target
X = df.drop(columns=[target])

# Define categorical & numerical columns
ordinal_columns = [
    "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", 
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", 
    "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"
]
ordinal_columns = [col for col in ordinal_columns if col in X.columns]  # Ensure they exist in data

nominal_columns = [
    col for col in X.select_dtypes(include=["object", "category"]).columns 
    if col not in ordinal_columns
]
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


ordinal_mappings = {
    "ExterQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "ExterCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "BsmtQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "BsmtCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "HeatingQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "KitchenQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "FireplaceQu": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "GarageQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "GarageCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "PoolQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "OverallQual": {i: i for i in range(1, 11)},
    "OverallCond": {i: i for i in range(1, 11)},
    "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
    "BsmtFinType1": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
    "BsmtFinType2": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0}
}

ordinal_transformers = []
for col in ordinal_columns:
    if col in ordinal_mappings:
        ordinal_transformers.append(
            (f'ord_{col}', 
             OrdinalEncoder(
                 categories=[list(ordinal_mappings[col].keys())],
                 handle_unknown='use_encoded_value',
                 unknown_value=-1
             ), 
             [col])
        )


preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([('log', FunctionTransformer(np.log1p, validate=True)),
            ("scaler", RobustScaler())  # Scale numerical data after log
        ]), numerical_columns),
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_columns),
    ] + ordinal_transformers
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
])

X_transformed = pipeline.fit_transform(X, y)

print(f"Final shape of X: {X_transformed.shape}")


Final shape of X: (1430, 342)


In [2]:

def objective(trial):
    params = {
        "hidden_layer_sizes": (
            trial.suggest_int("layer1", 10, 32),
            trial.suggest_int("layer2", 10, 32),
        ),
        "alpha": trial.suggest_float("alpha", 1e-4, 1e1, log=True),
        "learning_rate_init": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
    }
    model = MLPRegressor(**params, max_iter=1000)
    return sklearn.model_selection.cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()


# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)


[I 2025-03-10 10:27:17,644] A new study created in memory with name: no-name-7127d6fd-fca7-4db7-aa7b-fd4ff65e4486
[I 2025-03-10 10:27:29,480] Trial 0 finished with value: 0.4565712885122841 and parameters: {'layer1': 29, 'layer2': 25, 'alpha': 0.0005166807702824, 'learning_rate': 0.00011851319198179293}. Best is trial 0 with value: 0.4565712885122841.
[I 2025-03-10 10:27:44,480] Trial 1 finished with value: 0.35071006850768854 and parameters: {'layer1': 30, 'layer2': 18, 'alpha': 0.00036281372001573207, 'learning_rate': 0.00011203209541024333}. Best is trial 0 with value: 0.4565712885122841.
[I 2025-03-10 10:27:48,351] Trial 2 finished with value: 0.6979103035718482 and parameters: {'layer1': 25, 'layer2': 13, 'alpha': 0.00918036749079353, 'learning_rate': 0.006261915071886201}. Best is trial 2 with value: 0.6979103035718482.
[I 2025-03-10 10:27:54,941] Trial 3 finished with value: 0.5401662399524081 and parameters: {'layer1': 22, 'layer2': 10, 'alpha': 0.0011258738806291812, 'learning

Best R2: 0.8444195287496612
Best hyperparameters: {'layer1': 17, 'layer2': 14, 'alpha': 0.8418062607346753, 'learning_rate': 0.0008047954534664058}


In [3]:


# Ensure the features are scaled
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

def objective(trial):
    # Choose model type
    model_type = trial.suggest_categorical("model_type", ["Ridge", "Lasso", "ElasticNet"])
    
    # Choose regularization strength (alpha)
    alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
    
    # Define model based on choice
    if model_type == "Ridge":
        model = Ridge(alpha=alpha)
    elif model_type == "Lasso":
        model = Lasso(alpha=alpha)
    else:  # ElasticNet
        l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)  # Mix between Lasso (1.0) & Ridge (0.0)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

    # Pipeline to standardize data (important for regularization)
    # pipeline = make_pipeline(StandardScaler(), model)

    # Perform cross-validation
    score = cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()
    
    return score  # Optuna maximizes R²

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=-1)

print("Best model and parameters:", study.best_params)


[I 2025-03-10 10:32:27,141] A new study created in memory with name: no-name-aa07cc15-0c2a-4764-bbe8-8e69d1538c3f
[I 2025-03-10 10:32:27,240] Trial 5 finished with value: -0.002365061359100018 and parameters: {'model_type': 'ElasticNet', 'alpha': 4.3072561920707, 'l1_ratio': 0.20304078127425607}. Best is trial 5 with value: -0.002365061359100018.
[I 2025-03-10 10:32:27,264] Trial 2 finished with value: 0.6683902643810744 and parameters: {'model_type': 'Lasso', 'alpha': 0.11328020555609132}. Best is trial 2 with value: 0.6683902643810744.
[I 2025-03-10 10:32:27,269] Trial 6 finished with value: -0.002365061359100018 and parameters: {'model_type': 'ElasticNet', 'alpha': 6.58915800819121, 'l1_ratio': 0.6526597553629435}. Best is trial 2 with value: 0.6683902643810744.
[I 2025-03-10 10:32:27,328] Trial 8 finished with value: 0.6823498312081453 and parameters: {'model_type': 'ElasticNet', 'alpha': 0.21045462836364512, 'l1_ratio': 0.47475329573773084}. Best is trial 8 with value: 0.682349831

Best model and parameters: {'model_type': 'Lasso', 'alpha': 0.0005069943346946719}


In [4]:
import sklearn.ensemble

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 2, 20)
    max_depth = trial.suggest_int("max_depth", 1, 32)
    
    reg = sklearn.ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)

    return sklearn.model_selection.cross_val_score(
        reg, X_transformed, y, n_jobs=-1, cv=5, scoring="r2"
    ).mean()

# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)


[I 2025-03-10 10:32:33,775] A new study created in memory with name: no-name-6f89e738-5988-43a9-92b7-614289888a10
[I 2025-03-10 10:32:35,891] Trial 0 finished with value: 0.8440124694472709 and parameters: {'n_estimators': 4, 'max_depth': 17}. Best is trial 0 with value: 0.8440124694472709.
[I 2025-03-10 10:32:36,939] Trial 1 finished with value: 0.7323836064835201 and parameters: {'n_estimators': 6, 'max_depth': 3}. Best is trial 0 with value: 0.8440124694472709.
[I 2025-03-10 10:32:38,789] Trial 2 finished with value: 0.8691640793933285 and parameters: {'n_estimators': 17, 'max_depth': 21}. Best is trial 2 with value: 0.8691640793933285.
[I 2025-03-10 10:32:40,122] Trial 3 finished with value: 0.8695377425589881 and parameters: {'n_estimators': 15, 'max_depth': 13}. Best is trial 3 with value: 0.8695377425589881.
[I 2025-03-10 10:32:40,555] Trial 4 finished with value: 0.8611667006620538 and parameters: {'n_estimators': 7, 'max_depth': 9}. Best is trial 3 with value: 0.86953774255898

Best R2: 0.876338326638529
Best hyperparameters: {'n_estimators': 18, 'max_depth': 24}


In [5]:

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 4),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "num_leaves": trial.suggest_int("num_leaves", 8, 1000),
        # "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),  # L1 regularization
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),  # L2 regularization
        "force_row_wise": True
    }
    model = lgb.LGBMRegressor(**params, verbose=-1) #I suppress the output due to many warnings that optimal split is not found
    return sklearn.model_selection.cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()


# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)



[I 2025-03-10 10:35:33,776] A new study created in memory with name: no-name-079410df-cbd6-400a-8365-feb63ed9d081
[I 2025-03-10 10:35:35,665] Trial 0 finished with value: 0.8967090493091826 and parameters: {'n_estimators': 889, 'max_depth': 3, 'learning_rate': 0.006697147540639736, 'num_leaves': 352, 'reg_alpha': 0.0032651857532792483, 'reg_lambda': 1.954222128624492}. Best is trial 0 with value: 0.8967090493091826.
[I 2025-03-10 10:35:36,411] Trial 1 finished with value: 0.8944421168071036 and parameters: {'n_estimators': 878, 'max_depth': 4, 'learning_rate': 0.06608889810583818, 'num_leaves': 33, 'reg_alpha': 1.7944003526723005, 'reg_lambda': 0.0016320663650815491}. Best is trial 0 with value: 0.8967090493091826.
[I 2025-03-10 10:35:37,142] Trial 2 finished with value: 0.9090274268507746 and parameters: {'n_estimators': 415, 'max_depth': 3, 'learning_rate': 0.0926952167056839, 'num_leaves': 241, 'reg_alpha': 0.021743969391415947, 'reg_lambda': 0.009925342539370966}. Best is trial 2 w

Best R2: 0.9092595905524172
Best hyperparameters: {'n_estimators': 492, 'max_depth': 3, 'learning_rate': 0.059766719431178915, 'num_leaves': 322, 'reg_alpha': 0.017729388440876875, 'reg_lambda': 0.8896739716299361}
