In [118]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

# Single seed shared by the KFold shuffle, the gradient-boosting regressor and
# the randomized hyperparameter search further down in this notebook.
RANDOM_STATE = 42


In [119]:
# Define project directories.
# Paths are resolved relative to the parent of the current working directory.
# NOTE(review): this presumes the notebook is run from a sub-folder of the
# project (e.g. notebooks/) — confirm.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Create directories if they don't exist.
# REPORTS_DIR is created here as well: a later cell writes a CSV into it, and
# DataFrame.to_csv raises OSError when the parent directory is missing.
DATA_RAW.mkdir(parents=True, exist_ok=True)
SPLITS_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

In [120]:
# Load the raw training table; the CSV is expected to already exist in DATA_RAW.
df = pd.read_csv(DATA_RAW / "house_prices_train.csv")

In [121]:
# Column the model is trained to predict.
TARGET_COL = "SalePrice"

# Feature matrix: every column of df except the target.
X = df.drop(columns=TARGET_COL)
# Target vector.
y = df[TARGET_COL]

In [122]:
# Load the persisted split indices (train first, then test).
train_idx, test_idx = (
    np.load(SPLITS_DIR / split_file)
    for split_file in ("train_indices.npy", "test_indices.npy")
)

# .loc selects rows by index *label*.
# NOTE(review): presumably the saved arrays hold labels of df's index (which
# coincide with row positions for a default RangeIndex) — confirm against the
# notebook that wrote the split files.
X_train, y_train = X.loc[train_idx].copy(), y.loc[train_idx].copy()
X_test, y_test = X.loc[test_idx].copy(), y.loc[test_idx].copy()

In [123]:
# Split the training features into numeric and categorical column lists.
# 'Id' is excluded from the numeric features.
numeric_cols = [
    column
    for column in X_train.select_dtypes(include=["int64", "float64"]).columns
    if column != "Id"
]
categorical_cols = list(
    X_train.select_dtypes(include=["object", "category"]).columns
)

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [124]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category, then
# map categories to integer codes. Categories not seen during fit are encoded
# as -1 instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# Route each column group through its own branch. Only the columns named in
# numeric_cols / categorical_cols are handed to a transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [125]:
# 5-fold cross-validation splitter; rows are shuffled with the shared seed
# before being divided into folds.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [126]:
# Base estimator: the shared preprocessor followed by a histogram-based
# gradient-boosting regressor. learning_rate, max_leaf_nodes, min_samples_leaf
# and max_iter are starting values only; the search below overrides them.
hgb_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            HistGradientBoostingRegressor(
                learning_rate=0.1,
                max_iter=100,
                max_leaf_nodes=31,
                max_depth=None,
                min_samples_leaf=20,
                early_stopping=True,
                validation_fraction=0.1,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

# Fit on log1p(target) and map predictions back with expm1.
log_target_model = TransformedTargetRegressor(
    regressor=hgb_model,
    func=np.log1p,
    inverse_func=np.expm1,
)

# Search space. The "regressor__regressor__" prefix addresses the wrapped
# pipeline (first "regressor") and then its final step (second "regressor").
param_distributions = {
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    "regressor__regressor__learning_rate": uniform(0.01, 0.19),
    # scipy's randint(low, high) excludes the upper bound.
    "regressor__regressor__max_leaf_nodes": randint(15, 50),
    "regressor__regressor__min_samples_leaf": randint(5, 30),
    "regressor__regressor__max_iter": randint(100, 600),
}

# 40 random candidates, each scored with RMSE and MAE on the cv folds; the
# candidate with the best RMSE is refit on the full training data.
random_search = RandomizedSearchCV(
    log_target_model,
    param_distributions,
    n_iter=40,
    scoring={
        metric: metric
        for metric in ("neg_root_mean_squared_error", "neg_mean_absolute_error")
    },
    refit="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

random_search.fit(X_train, y_train)

# best_score_ is the negated RMSE of the refit metric; flip the sign so the
# reported value is a plain RMSE.
best_params, best_score = random_search.best_params_, -random_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'regressor__regressor__learning_rate': np.float64(0.1254335218612733), 'regressor__regressor__max_iter': 120, 'regressor__regressor__max_leaf_nodes': 23, 'regressor__regressor__min_samples_leaf': 11}
Best Score: 27743.102384165988


In [127]:
# Tabulate the search results. The two mean "neg_*" score columns are negated
# so they hold positive errors, and the same values are exposed again under the
# shorter names mean_test_rmse / mean_test_mae.
# NOTE(review): after this step the "mean_test_neg_*" columns contain positive
# values despite their names; that is also what ends up in the CSV below.
cv_results = pd.DataFrame(random_search.cv_results_).assign(
    mean_test_neg_root_mean_squared_error=lambda frame: (
        -frame["mean_test_neg_root_mean_squared_error"]
    ),
    mean_test_neg_mean_absolute_error=lambda frame: (
        -frame["mean_test_neg_mean_absolute_error"]
    ),
    mean_test_rmse=lambda frame: frame["mean_test_neg_root_mean_squared_error"],
    mean_test_mae=lambda frame: frame["mean_test_neg_mean_absolute_error"],
)

# Lowest (now positive) RMSE first.
cv_results_sorted = cv_results.sort_values(
    "mean_test_neg_root_mean_squared_error",
    ascending=True,
)

# Persist the full leaderboard; REPORTS_DIR must already exist on disk.
cv_results_sorted.to_csv(REPORTS_DIR / "cv_results_leader.csv", index=False)

top10 = cv_results_sorted.head(10)

# Columns shown in the printed summary: the parameter dict plus mean/std of
# each metric.
cols_to_show = ["params"] + [
    f"{stat}_test_{metric}"
    for metric in ("neg_root_mean_squared_error", "neg_mean_absolute_error")
    for stat in ("mean", "std")
]

print("Top 10 Hyperparameter Combinations:")
print(top10[cols_to_show])

Top 10 Hyperparameter Combinations:
                                               params  \
8   {'regressor__regressor__learning_rate': 0.1254...   
21  {'regressor__regressor__learning_rate': 0.0414...   
0   {'regressor__regressor__learning_rate': 0.0811...   
11  {'regressor__regressor__learning_rate': 0.0557...   
34  {'regressor__regressor__learning_rate': 0.0573...   
37  {'regressor__regressor__learning_rate': 0.1286...   
20  {'regressor__regressor__learning_rate': 0.0633...   
13  {'regressor__regressor__learning_rate': 0.0843...   
38  {'regressor__regressor__learning_rate': 0.0454...   
10  {'regressor__regressor__learning_rate': 0.0832...   

    mean_test_neg_root_mean_squared_error  \
8                            27743.102384   
21                           27743.224825   
0                            27909.672417   
11                           27945.580500   
34                           28039.805519   
37                           28139.791892   
20                   

In [128]:
# Row of cv_results_ belonging to the candidate selected by the refit metric.
best_index = random_search.best_index_

# Means are stored negated by the "neg_*" scorers, so flip their sign; the
# standard deviations are used as stored.
mean_test_rmse, std_test_rmse = (
    -random_search.cv_results_["mean_test_neg_root_mean_squared_error"][best_index],
    random_search.cv_results_["std_test_neg_root_mean_squared_error"][best_index],
)
mean_test_mae, std_test_mae = (
    -random_search.cv_results_["mean_test_neg_mean_absolute_error"][best_index],
    random_search.cv_results_["std_test_neg_mean_absolute_error"][best_index],
)

print(f"RMSE (mean ± std): {mean_test_rmse:.0f} ± {std_test_rmse:.0f}")
print(f"MAE (mean ± std): {mean_test_mae:.0f} ± {std_test_mae:.0f}")

# Report the winning hyperparameters, one per line.
best_params = random_search.best_params_
print(
    "Best hyperparameters:",
    f"Learning rate: {best_params['regressor__regressor__learning_rate']}",
    f"Max leaf nodes: {best_params['regressor__regressor__max_leaf_nodes']}",
    f"Min samples leaf: {best_params['regressor__regressor__min_samples_leaf']}",
    f"Max iterations: {best_params['regressor__regressor__max_iter']}",
    sep="\n",
)

RMSE (mean ± std): 27743 ± 4617
MAE (mean ± std): 17094 ± 1378
Best hyperparameters:
Learning rate: 0.1254335218612733
Max leaf nodes: 23
Min samples leaf: 11
Max iterations: 120
