In [1]:
# Using HistGradientBoostingRegressor

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
def prepare_dataset(df, one_hot=False):
    """Turn text columns into pandas categoricals and optionally one-hot encode them.

    Every ``object``-dtype column is converted to ``category`` dtype and its
    missing values are replaced by the explicit category ``"Missing"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is processed and returned.
    one_hot : bool, default False
        When True, every ``category``-dtype column is replaced by the columns
        produced by ``OneHotEncoder``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    for key in df.select_dtypes(include=['object']).columns:
        column = df[key].astype('category')
        # add_categories raises ValueError if the label is already a category,
        # which happens when the data itself contains the string "Missing".
        if "Missing" not in column.cat.categories:
            column = column.cat.add_categories("Missing")
        df[key] = column.fillna("Missing")
    if not one_hot:
        return df

    categorical_columns = df.select_dtypes(include=['category']).columns
    if len(categorical_columns) == 0:
        # Nothing to encode; the encoder cannot be fitted on zero columns.
        return df

    # NOTE: the encoder is fitted on whatever frame is passed in, so two frames
    # with different category values yield different output columns.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns.to_list()),
        # Reuse the source index so the column-wise concat lines rows up even
        # when df does not carry a default 0..n-1 index.
        index=df.index,
    )
    return pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

In [3]:
# Load the training data, drop the row identifier, convert the text columns to
# categoricals (no one-hot encoding), then split the target off the features.
target_column = 'SalePrice'
train_file_path = "data/house-prices/train.csv"
dataset_df = prepare_dataset(
    pd.read_csv(train_file_path).drop('Id', axis=1),
    False,
)
# pop() removes the target from dataset_df, leaving only feature columns behind.
y = dataset_df.pop(target_column)

In [4]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Initialize the HistGradientBoostingRegressor.
# categorical_features='from_dtype' makes the estimator treat pandas categorical
# columns as categorical features (option available in scikit-learn >= 1.4).
model = HistGradientBoostingRegressor(categorical_features='from_dtype')

# Train the model on the training split only
model.fit(X_train, y_train)

# Score the model on the held-out split so the baseline has a measured error.
holdout_rmse = mean_squared_error(y_test, model.predict(X_test)) ** 0.5
print("Hold-out RMSE:", holdout_rmse)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [7]:
# Predict the Kaggle test set with the fitted model and write a submission file.
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
# Remove the identifier from the features but keep it to label the predictions.
ids = test_data.pop('Id')

test_data = prepare_dataset(test_data, False)
# Pair each prediction with the Id of the row it was computed from, instead of
# assuming a separate template file lists the rows in the same order as test.csv.
sample_submission_df = pd.DataFrame({'Id': ids, 'SalePrice': model.predict(test_data)})
sample_submission_df.to_csv('working/hist.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,126861.641863
1,1462,153242.213146
2,1463,186711.176803
3,1464,199130.572734
4,1465,188666.519241


# Cross Validation

In [12]:
# Hyper-parameter search with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned
model = HistGradientBoostingRegressor(categorical_features='from_dtype')

# Candidate values; every combination is evaluated
param_grid = dict(
    max_depth=[3, 4, 6, 8],
    learning_rate=[0.05, 0.1, 0.3],
    max_iter=[100],
    l2_regularization=[0.1, 1, 2, 3, 5, 10],
)

# cv=None selects the default 5-fold cross validation.
# Candidates are ranked by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=None,
    scoring='neg_mean_squared_error',
    verbose=2,
)

# Run the search on the full training data
grid_search.fit(dataset_df, y)

# Report the winner; the score is a negated MSE, so flip the sign before
# taking the square root to express it as an RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=3, max_iter=100; total time=   1.5s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=3, max_iter=100; total time=   2.8s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=3, max_iter=100; total time=   8.8s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=3, max_iter=100; total time=  11.4s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=3, max_iter=100; total time=   8.8s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=4, max_iter=100; total time=  14.8s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=4, max_iter=100; total time=  16.1s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=4, max_iter=100; total time=  15.6s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_depth=4, max_iter=100; total time=  14.9s
[CV] END l2_regularization=0.1, learning_rate=0.05, max_

In [11]:
# grid_search was built with the default refit=True, so after grid_search.fit
# its best_estimator_ is already the best configuration refitted on all of
# dataset_df / y; reuse it instead of training the same model a second time.
model = grid_search.best_estimator_

test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
# Remove the identifier from the features but keep it to label the predictions.
ids = test_data.pop('Id')

test_data = prepare_dataset(test_data, False)
# Pair each prediction with the Id of the row it was computed from, instead of
# assuming a separate template file lists the rows in the same order as test.csv.
sample_submission_df = pd.DataFrame({'Id': ids, 'SalePrice': model.predict(test_data)})
sample_submission_df.to_csv('working/hist-cv.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,128149.981297
1,1462,157240.70591
2,1463,182599.478059
3,1464,197312.203554
4,1465,199533.499288
