In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor


# Single seed reused by the cross-validation splitter and the regressor
# further down, so repeated runs give the same folds and the same model.
RANDOM_STATE = 42


In [2]:
# Project layout, resolved from the notebook's working directory: the project
# root is taken to be its parent folder.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Create every directory this notebook touches if it is missing. REPORTS_DIR
# is included because the metrics CSV is written there later in the notebook.
for directory in (DATA_RAW, SPLITS_DIR, REPORTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the raw training table from the raw-data folder.
raw_train_path = DATA_RAW / "house_prices_train.csv"
df = pd.read_csv(raw_train_path)

In [4]:
TARGET_COL = "SalePrice"

# Features are every column except the target; the target is kept as its own
# Series. Neither statement modifies `df`.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

In [5]:
# Row selection for the training split, loaded from the saved split file.
train_idx = np.load(SPLITS_DIR / "train_indices.npy")

# NOTE(review): .loc selects by index *label*, not position. This matches the
# saved indices only if they were produced from the same index as `df`
# (e.g. the default RangeIndex) - confirm against the notebook that wrote them.
X_train, y_train = (part.loc[train_idx].copy() for part in (X, y))

In [6]:
# Identify numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.to_list()
if 'Id' in numeric_cols:
    numeric_cols.remove('Id')
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.to_list()

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")

Categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'

In [10]:
# Define preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)), 
    ]
)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

In [11]:
# Shuffled 5-fold splitter; the fixed seed keeps fold assignment identical
# from run to run.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [12]:
# Create the full pipeline with preprocessing and model
hgb_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", HistGradientBoostingRegressor(
            max_iter=100,  
            learning_rate=0.1, 
            max_depth=None, 
            random_state=RANDOM_STATE,  
            min_samples_leaf=20,  
            max_leaf_nodes=31,  
            early_stopping=True,  
            validation_fraction=0.1,  
        )),
    ]
)

# Cross-validate RF pipeline
cv_scores = cross_validate(
    hgb_model,
    X_train,
    y_train,
    cv=cv,
    scoring=["neg_root_mean_squared_error", "neg_mean_absolute_error"],
    n_jobs=-1,
)

# Convert negative scores to positive errors
rmse_values = -cv_scores["test_neg_root_mean_squared_error"]
mae_values = -cv_scores["test_neg_mean_absolute_error"]

# Prepare a single row with RF metrics
hgb_values = {
    "model": "HistGradientBoostingRegressor_base",
    "rmse_mean": rmse_values.mean(),
    "rmse_std": rmse_values.std(),
    "mae_mean": mae_values.mean(),
    "mae_std": mae_values.std(),
}

# Load the baseline results CSV into a DataFrame
df_baseline = pd.read_csv(REPORTS_DIR / "metrics_baseline.csv")

# Use 'model' as an index to update or create the RF row
df_baseline = df_baseline.set_index("model")

# Update or create the RF row
df_baseline.loc["HistGradientBoostingRegressor_base", ["rmse_mean", "rmse_std", "mae_mean", "mae_std"]] = [
    hgb_values["rmse_mean"],
    hgb_values["rmse_std"],
    hgb_values["mae_mean"],
    hgb_values["mae_std"],
]

# Reset index back to a normal column and save to CSV
df_baseline = df_baseline.reset_index()
df_baseline.to_csv(REPORTS_DIR / "metrics_baseline.csv", index=False)

# Display the updated DataFrame
print(df_baseline)


                                model     rmse_mean     rmse_std  \
0                          Dummy_mean  77160.108457  3673.252798   
1                   Linear_Regression  36397.273713  7544.717946   
2                     Ridge_alpha_1.0  32814.497631  7332.537866   
3                     Lasso_alpha_0.1  36039.211360  7401.016075   
4               RandomForestRegressor  30187.811059  4925.198978   
5                   Lasso_alpha_100.0  31597.031137  7633.433043   
6                    Ridge_alpha_10.0  31556.174539  7216.615113   
7  HistGradientBoostingRegressor_base  28748.832334  4848.514218   

       mae_mean      mae_std  
0  56318.232277  2612.148500  
1  19621.981805   806.121901  
2  18676.071378  1116.810159  
3  19444.763807   899.418995  
4  18172.212787  1123.688346  
5  20612.670160   857.574200  
6  18001.482651   581.320602  
7  17057.678687   995.424645  
