In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    OneHotEncoder, 
    StandardScaler, 
    RobustScaler, 
    FunctionTransformer, 
    PowerTransformer,
    OrdinalEncoder
)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb
import optuna
import warnings



df = pd.read_csv('data/train_rm_OL.csv')

# Load Data (assuming df is already loaded)
target = "SalePrice"

# Split X and y
y = np.log1p(df[target])  # Log-transforming the target
X = df.drop(columns=[target])

# Define categorical & numerical columns
ordinal_columns = [
    "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", 
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", 
    "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"
]
ordinal_columns = [col for col in ordinal_columns if col in X.columns]  # Ensure they exist in data

nominal_columns = [
    col for col in X.select_dtypes(include=["object", "category"]).columns 
    if col not in ordinal_columns
]
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# --- Define ordinal mappings ---
ordinal_mappings = {
    "ExterQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "ExterCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "BsmtQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "BsmtCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "HeatingQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "KitchenQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "FireplaceQu": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "GarageQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "GarageCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "PoolQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0},
    "OverallQual": {i: i for i in range(1, 11)},
    "OverallCond": {i: i for i in range(1, 11)},
    "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
    "BsmtFinType1": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
    "BsmtFinType2": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0}
}

# --- Build the transformers ---
ordinal_transformers = []
for col in ordinal_columns:
    if col in ordinal_mappings:
        ordinal_transformers.append(
            (f'ord_{col}', 
             OrdinalEncoder(
                 categories=[list(ordinal_mappings[col].keys())],
                 handle_unknown='use_encoded_value',
                 unknown_value=-1
             ), 
             [col])
        )

# --- Define Preprocessing Pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([('log', FunctionTransformer(np.log1p, validate=True)),
            ("scaler", StandardScaler())  # Scale numerical data
        ]), numerical_columns),
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_columns),
    ] + ordinal_transformers
)

# --- Apply Feature Selection (Optional) ---
feature_selector = SelectKBest(mutual_info_regression, k="all")  # Set k to select the best features

# --- Create Full Pipeline ---
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", feature_selector)  # Apply feature selection
])

# --- Transform Data ---
X_transformed = pipeline.fit_transform(X, y)

print(f"Final shape of X: {X_transformed.shape}")


Final shape of X: (1430, 342)


In [None]:

def objective(trial):
    params = {
        "hidden_layer_sizes": (
            trial.suggest_int("layer1", 10, 32),
            trial.suggest_int("layer2", 10, 32),
        ),
        "alpha": trial.suggest_float("alpha", 1e-4, 1e1, log=True),
        "learning_rate_init": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
    }
    model = MLPRegressor(**params, max_iter=1000)
    return sklearn.model_selection.cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()


# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)


[I 2025-03-09 09:40:59,857] A new study created in memory with name: no-name-88bd3094-c2bc-4755-9f39-9db224f0e366
[I 2025-03-09 09:41:04,474] Trial 0 finished with value: 0.6773729783032356 and parameters: {'layer1': 27, 'layer2': 10, 'alpha': 0.00024395215346022532, 'learning_rate': 0.0010124037894030527}. Best is trial 0 with value: 0.6773729783032356.
[I 2025-03-09 09:41:08,956] Trial 1 finished with value: 0.654057242186399 and parameters: {'layer1': 19, 'layer2': 23, 'alpha': 0.03928514049051563, 'learning_rate': 0.0008230884190237551}. Best is trial 0 with value: 0.6773729783032356.


Best R2: 0.6773729783032356
Best hyperparameters: {'layer1': 27, 'layer2': 10, 'alpha': 0.00024395215346022532, 'learning_rate': 0.0010124037894030527}


In [None]:


# Ensure the features are scaled
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

def objective(trial):
    # Choose model type
    model_type = trial.suggest_categorical("model_type", ["Ridge", "Lasso", "ElasticNet"])
    
    # Choose regularization strength (alpha)
    alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
    
    # Define model based on choice
    if model_type == "Ridge":
        model = Ridge(alpha=alpha)
    elif model_type == "Lasso":
        model = Lasso(alpha=alpha)
    else:  # ElasticNet
        l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)  # Mix between Lasso (1.0) & Ridge (0.0)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

    # Pipeline to standardize data (important for regularization)
    # pipeline = make_pipeline(StandardScaler(), model)

    # Perform cross-validation
    score = cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()
    
    return score  # Optuna maximizes R²

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best model and parameters:", study.best_params)


[I 2025-03-09 09:42:14,081] A new study created in memory with name: no-name-5698b0c9-af49-4713-9bde-e22827c9e6ef
[I 2025-03-09 09:42:14,134] Trial 0 finished with value: 0.0024921741715003163 and parameters: {'model_type': 'Lasso', 'alpha': 0.4317597154560786}. Best is trial 0 with value: 0.0024921741715003163.
[I 2025-03-09 09:42:14,689] Trial 1 finished with value: 0.8945613503558167 and parameters: {'model_type': 'ElasticNet', 'alpha': 0.007551768601153703, 'l1_ratio': 0.8570836676954822}. Best is trial 1 with value: 0.8945613503558167.
[I 2025-03-09 09:42:14,719] Trial 2 finished with value: -0.002365061359100018 and parameters: {'model_type': 'ElasticNet', 'alpha': 2.0036859127463535, 'l1_ratio': 0.24757964658062737}. Best is trial 1 with value: 0.8945613503558167.
[I 2025-03-09 09:42:14,988] Trial 3 finished with value: 0.8766249095849554 and parameters: {'model_type': 'Lasso', 'alpha': 0.0187476123323215}. Best is trial 1 with value: 0.8945613503558167.
[I 2025-03-09 09:42:15,0

Best model and parameters: {'model_type': 'Lasso', 'alpha': 0.0005086421868919714}


In [17]:
import sklearn.ensemble

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 2, 20)
    max_depth = trial.suggest_int("max_depth", 1, 32)
    
    reg = sklearn.ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)

    return sklearn.model_selection.cross_val_score(
        reg, X_transformed, y, n_jobs=-1, cv=5, scoring="r2"
    ).mean()

# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)


[I 2025-03-09 09:44:04,812] A new study created in memory with name: no-name-e5bb2567-a7c6-42c4-8685-0072b7d51e16
[I 2025-03-09 09:44:08,166] Trial 0 finished with value: 0.8691228527060115 and parameters: {'n_estimators': 12, 'max_depth': 24}. Best is trial 0 with value: 0.8691228527060115.
[I 2025-03-09 09:44:10,828] Trial 1 finished with value: 0.8697147340322701 and parameters: {'n_estimators': 16, 'max_depth': 18}. Best is trial 1 with value: 0.8697147340322701.
[I 2025-03-09 09:44:12,305] Trial 2 finished with value: 0.8697821273426902 and parameters: {'n_estimators': 16, 'max_depth': 13}. Best is trial 2 with value: 0.8697821273426902.
[I 2025-03-09 09:44:13,304] Trial 3 finished with value: 0.8662342482888269 and parameters: {'n_estimators': 9, 'max_depth': 21}. Best is trial 2 with value: 0.8697821273426902.
[I 2025-03-09 09:44:13,418] Trial 4 finished with value: 0.45988464311734656 and parameters: {'n_estimators': 17, 'max_depth': 1}. Best is trial 2 with value: 0.8697821273

Best R2: 0.8776539578248416
Best hyperparameters: {'n_estimators': 19, 'max_depth': 15}


In [3]:

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "num_leaves": trial.suggest_int("num_leaves", 8, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),  # L1 regularization
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),  # L2 regularization
        "force_row_wise": True
    }
    model = lgb.LGBMRegressor(**params, verbose=-1) #I suppress the output due to many warinnings that optimal split is not found
    return sklearn.model_selection.cross_val_score(model, X_transformed, y, cv=5, scoring="r2").mean()


# Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print best trial
trial = study.best_trial
print("Best R2:", trial.value)
print("Best hyperparameters:", trial.params)



[I 2025-03-09 10:10:22,930] A new study created in memory with name: no-name-c97f759b-949d-411d-adc0-316c896851b4
[I 2025-03-09 10:10:25,700] Trial 0 finished with value: 0.9021801050378315 and parameters: {'n_estimators': 641, 'max_depth': 9, 'learning_rate': 0.024193992549588234, 'num_leaves': 744, 'subsample': 0.9278269035683757, 'colsample_bytree': 0.6625742551633016, 'reg_alpha': 0.0018244884978586735, 'reg_lambda': 0.0012301239974014367}. Best is trial 0 with value: 0.9021801050378315.
[I 2025-03-09 10:10:26,150] Trial 1 finished with value: 0.9054959549120374 and parameters: {'n_estimators': 819, 'max_depth': 1, 'learning_rate': 0.07298296458867502, 'num_leaves': 96, 'subsample': 0.5379432263364072, 'colsample_bytree': 0.7150709992617499, 'reg_alpha': 0.13517314859898205, 'reg_lambda': 0.10565125323729006}. Best is trial 1 with value: 0.9054959549120374.
[I 2025-03-09 10:10:27,323] Trial 2 finished with value: 0.8980536507440794 and parameters: {'n_estimators': 227, 'max_depth':

Best R2: 0.9100067984226552
Best hyperparameters: {'n_estimators': 852, 'max_depth': 3, 'learning_rate': 0.029034608070974337, 'num_leaves': 307, 'subsample': 0.8401342545925692, 'colsample_bytree': 0.6105922374334778, 'reg_alpha': 0.004436315380798706, 'reg_lambda': 0.0010040614367059146}
