In [56]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

# One seed reused by the CV splitter, the gradient-boosting regressor and the
# randomized search, so repeated runs draw the same folds and candidates.
RANDOM_STATE = 42


In [57]:
# Anchor every path on the parent of the current working directory, resolved
# to an absolute path so later cells do not depend on relative lookups.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
SPLITS_DIR = PROJECT_ROOT.joinpath("data", "splits")
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")

# Create the raw-data and split folders (including missing parents) when they
# are absent; existing folders are left untouched. REPORTS_DIR is only
# declared here, it is not created by this cell.
for _folder in (DATA_RAW, SPLITS_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

In [58]:
# Load the training table from the raw-data folder into a DataFrame.
df = pd.read_csv(DATA_RAW.joinpath("house_prices_train.csv"))

In [59]:
# Column the model is trained to predict.
TARGET_COL = "SalePrice"

# Features are every column except the target; the target column alone
# becomes the label vector. `df` itself is not modified.
X = df.drop(columns=TARGET_COL)
y = df[TARGET_COL]

In [60]:
# Read the stored train/test index arrays from the splits folder.
train_idx = np.load(SPLITS_DIR.joinpath("train_indices.npy"))
test_idx = np.load(SPLITS_DIR.joinpath("test_indices.npy"))

# Rows are picked by index label (.loc), so the stored values must match the
# DataFrame's index labels. Copies keep the subsets independent of X / y.
X_train, y_train = X.loc[train_idx].copy(), y.loc[train_idx].copy()
X_test, y_test = X.loc[test_idx].copy(), y.loc[test_idx].copy()

In [61]:
# Partition the feature names by dtype: int64/float64 columns are treated as
# numeric (with 'Id' left out of the list), object/category columns as
# categorical.
numeric_cols = [
    col
    for col in X_train.select_dtypes(include=["int64", "float64"]).columns
    if col != 'Id'
]
categorical_cols = list(
    X_train.select_dtypes(include=["object", "category"]).columns
)

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [62]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then map categories to integer codes. Categories not seen during fit are
# encoded as -1 instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

# Send each column group through its own branch. Only the columns named in
# the two lists are handed to a transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [63]:
# Five shuffled folds with a fixed seed, so the fold assignment is reproducible.
cv = KFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)

In [64]:
# Preprocessing followed by a histogram gradient-boosting regressor. The
# values given here are starting points; the search below overrides
# learning_rate, max_leaf_nodes, min_samples_leaf and max_iter per candidate.
hgb_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "regressor",
            HistGradientBoostingRegressor(
                learning_rate=0.1,
                max_iter=100,
                max_leaf_nodes=31,
                max_depth=None,
                min_samples_leaf=20,
                # Hold out 10% of each training fold and stop adding trees
                # once the held-out score stops improving.
                early_stopping=True,
                validation_fraction=0.1,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

# Fit on log1p(target) and map predictions back with expm1, so scoring is
# done on the original target scale.
log_target_model = TransformedTargetRegressor(
    regressor=hgb_model,
    inverse_func=np.expm1,
    func=np.log1p,
)

# Parameter paths read: TransformedTargetRegressor.regressor (the pipeline)
# -> pipeline step "regressor" -> estimator parameter.
# scipy's uniform(loc, scale) samples from [loc, loc + scale]; randint(low, high)
# samples integers from [low, high).
param_distributions = {
    "regressor__regressor__learning_rate": uniform(loc=0.01, scale=0.19),
    "regressor__regressor__max_leaf_nodes": randint(low=15, high=50),
    "regressor__regressor__min_samples_leaf": randint(low=5, high=30),
    "regressor__regressor__max_iter": randint(low=100, high=600),
}

# 40 random candidates, each scored with RMSE and MAE on the shared folds.
# The candidate with the best (least negative) negated RMSE is refit on all
# of the training data.
random_search = RandomizedSearchCV(
    log_target_model,
    param_distributions,
    n_iter=40,
    scoring={
        "neg_root_mean_squared_error": "neg_root_mean_squared_error",
        "neg_mean_absolute_error": "neg_mean_absolute_error",
    },
    refit="neg_root_mean_squared_error",
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

# best_score_ holds the mean CV value of the refit metric, i.e. negated RMSE;
# flipping the sign reports it as a plain (non-negative) RMSE.
best_params, best_score = random_search.best_params_, -random_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'regressor__regressor__learning_rate': np.float64(0.1254335218612733), 'regressor__regressor__max_iter': 120, 'regressor__regressor__max_leaf_nodes': 23, 'regressor__regressor__min_samples_leaf': 11}
Best Score: 27743.102384165988
