In [1]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [87]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor



In [3]:
# Load the feature table from parquet (path is relative to the notebook's working directory).
FEATURES_PATH = 'data_output/clean_features.parquet'
features = pd.read_parquet(FEATURES_PATH)

In [4]:
# Preview the first five rows to sanity-check the load.
features.head(n=5)

Unnamed: 0,CORE_price_paid,CORE_deed_date,1HE_district,BACKUP_county,ID_combined_address_x,ID_fuzzy_match,CORE_match_confidence,CORE_BUILDING_REFERENCE_NUMBER,1HE_CURRENT_ENERGY_RATING,1HE_POTENTIAL_ENERGY_RATING,...,AI_MAINHEAT_DESCRIPTION,1HE_MAINHEAT_ENERGY_EFF,AI_MAINHEATCONT_DESCRIPTION,1HE_MAINHEATC_ENERGY_EFF,AI_LIGHTING_DESCRIPTION,1HE_LIGHTING_ENERGY_EFF,1HE_MECHANICAL_VENTILATION,BACKUP_CONSTRUCTION_AGE_BAND,1HE_TENURE,ID_combined_address_y
0,582000,2022-06-06,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,49 amersham road nan hp13 5aa,49 amersham road hp13 5aa,92.592593,5175796000.0,D,B,...,"Boiler and radiators, mains gas",Good,"Programmer, room thermostat and TRVs",Good,Low energy lighting in 73% of fixed outlets,Very Good,natural,England and Wales: 1900-1929,Owner Occupied,49 amersham road hp13 5aa
1,582000,2022-06-06,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,49 amersham road nan hp13 5aa,49 amersham road hp13 5aa,92.592593,5175796000.0,D,B,...,"Boiler and radiators, mains gas",Good,"Programmer, room thermostat and TRVs",Good,Low energy lighting in 53% of fixed outlets,Good,natural,England and Wales: 1900-1929,Owner Occupied,49 amersham road hp13 5aa
2,582000,2022-06-06,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,49 amersham road nan hp13 5aa,49 amersham road hp13 5aa,92.592593,5175796000.0,E,E,...,"Boiler and radiators, mains gas",Good,"Programmer, room thermostat and TRVs",Good,No low energy lighting,Very Poor,natural,England and Wales: 1900-1929,Owner Occupied,49 amersham road hp13 5aa
3,582000,2022-06-06,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,49 amersham road nan hp13 5aa,49 amersham road hp13 5aa,92.592593,5175796000.0,D,D,...,"Boiler and radiators, mains gas",Good,"Programmer, room thermostat and TRVs",Good,No low energy lighting,Very Poor,natural,England and Wales: 1900-1929,Owner Occupied,49 amersham road hp13 5aa
4,655000,2024-12-12,BUCKINGHAMSHIRE,BUCKINGHAMSHIRE,67 amersham road nan hp13 5aa,67a amersham road hp13 5aa,90.909091,10005290000.0,D,C,...,"Boiler and radiators, mains gas",Good,"Programmer, room thermostat and TRVs",Good,Low energy lighting in all fixed outlets,Very Good,natural,England and Wales: 1930-1949,Owner Occupied,67a amersham road hp13 5aa


In [None]:
# Group the columns by naming prefix. Only the 1HE_ (one-hot encoded later) and
# CORE_ groups are carried into the modelling frame; the ID_ and AI_ lists are
# collected here but are not part of `final_feature_cols`.
# NOTE: `features` is narrowed in place below, so re-running this cell after it
# has already run leaves `id_cols` and `AI_cols` empty.
encoding_cols = [col for col in features.columns if col.startswith('1HE_')]
core_cols = [col for col in features.columns if col.startswith('CORE_')]
id_cols = [col for col in features.columns if col.startswith('ID_')]
AI_cols = [col for col in features.columns if col.startswith('AI_')]

final_feature_cols = encoding_cols + core_cols
# .copy() makes the narrowed frame independent of the loaded one, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
features = features[final_feature_cols].copy()

# Temporal hold-out: sales before 2025 form the training set, 2025 onwards the test set.
# pd.to_datetime leaves datetime columns unchanged and parses string dates.
deed_year = pd.to_datetime(features['CORE_deed_date']).dt.year
train = features.loc[deed_year < 2025]
test = features.loc[deed_year >= 2025]

# The target and the raw date columns must not be passed to the model as predictors.
non_feature_cols = ['CORE_price_paid', 'CORE_deed_date', 'CORE_INSPECTION_DATE']
X_train = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)
y_train = train['CORE_price_paid']
y_test = test['CORE_price_paid']


In [54]:
# Train/test split, stratified on sale year so each year keeps the same share
# of rows in both partitions (year-month would be a finer alternative).
features['ENRICH_YEAR'] = pd.to_datetime(features['CORE_deed_date']).dt.year
sale_year = features['ENRICH_YEAR']

# Target, raw dates and the helper year column are not predictors.
excluded_cols = ['CORE_price_paid', 'CORE_deed_date', 'CORE_INSPECTION_DATE', 'ENRICH_YEAR']
y = features['CORE_price_paid']
X = features.drop(columns=excluded_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=sale_year
)

# Report the per-year row counts of each partition to confirm the stratification.
print('TRAINING DATA...')
print(sale_year.loc[X_train.index].value_counts().sort_index())

print('TEST DATA...')
print(sale_year.loc[X_test.index].value_counts().sort_index())

TRAINING DATA...
ENRICH_YEAR
2020    2638
2021    5183
2022    3963
2023    2794
2024    3366
2025    2473
Name: count, dtype: int64
TEST DATA...
ENRICH_YEAR
2020    1131
2021    2221
2022    1699
2023    1197
2024    1443
2025    1060
Name: count, dtype: int64


In [55]:
# One-hot encode the 1HE_ columns; every other column is passed through unchanged.
# NOTE(review): with drop='first', a category unseen at fit time is encoded as
# all zeros, which looks the same as the dropped first category - confirm this
# is acceptable for the models that use this preprocessor.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('categorical', categorical_encoder, encoding_cols)],
    remainder='passthrough',
)

In [None]:
# Baseline random-forest model: depth and split size are capped to limit
# overfitting, and n_jobs=-1 uses every available core.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=43,
    n_jobs=-1,
)

# Preprocessing and the regressor are chained so they are fitted and applied together.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_regressor),
])

In [57]:
# Fit the random-forest pipeline on the training split, then echo the fitted pipeline.
print('Training Random Forest...')
pipeline = pipeline.fit(X_train, y_train)  # fit() returns the pipeline itself
print('Training Complete', pipeline, sep='\n')

Training Random Forest...
Training Complete
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('categorical',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['1HE_district',
                                                   '1HE_CURRENT_ENERGY_RATING',
                                                   '1HE_POTENTIAL_ENERGY_RATING',
                                                   '1HE_PROPERTY_TYPE',
                                                   '1HE_BUILT_FORM',
                                                   '1HE_ENERGY_TARIFF',
                                                   '1HE_MAINS_GAS_FLAG',
                                                   '1HE_GLAZED_AREA',
                                             

In [58]:
# Predict on both splits with the fitted random-forest pipeline; the training
# predictions are kept so train and test error can be compared.
y_train_pred, y_test_pred = (pipeline.predict(split) for split in (X_train, X_test))

In [59]:
# Evaluating the random forest on both splits.
# Each metric is computed for (train, test) in that order; MAPE is converted to
# a percentage and rounded to two decimals.
rf_splits = ((y_train, y_train_pred), (y_test, y_test_pred))

train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in rf_splits
)
train_mae, test_mae = (
    mean_absolute_error(actual, predicted) for actual, predicted in rf_splits
)
train_mape, test_mape = (
    round(mean_absolute_percentage_error(actual, predicted) * 100, 2)
    for actual, predicted in rf_splits
)

print(f'Train RMSE: {train_rmse} \nTest RMSE: {test_rmse} \nTrain MAE:{train_mae} \nTest MAE: {test_mae} \nTrain MAPE: {train_mape}% \nTest MAPE: {test_mape}%')


Train RMSE: 129311.41353792151 
Test RMSE: 286634.9016329023 
Train MAE:69905.93181315358 
Test MAE: 139821.78756169832 
Train MAPE: 10.7% 
Test MAPE: 19.69%


In [60]:
# Saving the model
# Persist the fitted random-forest pipeline (preprocessing + regressor) so it
# can be reloaded for inference without retraining.
Path('models').mkdir(parents=True, exist_ok=True)  # joblib.dump does not create missing directories
joblib.dump(pipeline, 'models/RF_Naive_Pipeline.pkl')

# To reuse the saved pipeline later (joblib files are pickles - only load files you trust):
#loaded_pipeline = joblib.load('models/RF_Naive_Pipeline.pkl')
#preds = loaded_pipeline.predict(X_new)

['models/RF_Naive_Pipeline.pkl']

# XGBoost

In [126]:
# XGBoost pipeline.
# The preprocessor is cloned so this pipeline owns its own (unfitted) copy:
# Pipeline does not copy its steps, so reusing the `preprocessor` object
# directly would make fitting this pipeline refit the transformer that the
# random-forest pipeline also holds.
pipeline_XG = Pipeline([
    ('preprocessor', clone(preprocessor)),
    #('feature_selection', SelectFromModel(
    #    XGBRegressor(n_estimators=100, random_state=42),
    #    threshold='0.5*mean' # Test Alternative thresholds.
    #    )), # Validate!
    ('regressor', XGBRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=7,
        random_state=42,
        min_child_weight=3,
        subsample=0.85,
        colsample_bytree=0.8,
        colsample_bylevel=0.85,
        gamma=0.2,
        reg_alpha=0.1,
        reg_lambda=2
    ))
])

In [130]:
# Fitting the XGBoost pipeline on the training split.
print('Training XGBoost...')
pipeline_XG.fit(X_train, y_train)
print('Training Complete')
# Echo the pipeline that was just fitted (previously this printed the
# random-forest `pipeline` by mistake).
print(pipeline_XG)

Training XGBoost...
Training Complete
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('categorical',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['1HE_district',
                                                   '1HE_CURRENT_ENERGY_RATING',
                                                   '1HE_POTENTIAL_ENERGY_RATING',
                                                   '1HE_PROPERTY_TYPE',
                                                   '1HE_BUILT_FORM',
                                                   '1HE_ENERGY_TARIFF',
                                                   '1HE_MAINS_GAS_FLAG',
                                                   '1HE_GLAZED_AREA',
                                                   

In [None]:
# Predict on both splits with the fitted XGBoost pipeline; the training
# predictions are kept so train and test error can be compared.
y_train_pred_XG, y_test_pred_XG = (
    pipeline_XG.predict(split) for split in (X_train, X_test)
)

In [132]:
# Evaluating the XGBoost pipeline on both splits.
# Each metric is computed for (train, test) in that order; MAPE is converted to
# a percentage and rounded to two decimals.
xgb_splits = ((y_train, y_train_pred_XG), (y_test, y_test_pred_XG))

train_rmse_XG, test_rmse_XG = (
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in xgb_splits
)
train_mae_XG, test_mae_XG = (
    mean_absolute_error(actual, predicted) for actual, predicted in xgb_splits
)
train_mape_XG, test_mape_XG = (
    round(mean_absolute_percentage_error(actual, predicted) * 100, 2)
    for actual, predicted in xgb_splits
)

print(f'Train RMSE: {train_rmse_XG} \nTest RMSE: {test_rmse_XG} \nTrain MAE:{train_mae_XG} \nTest MAE: {test_mae_XG} \nTrain MAPE: {train_mape_XG}% \nTest MAPE: {test_mape_XG}%')


Train RMSE: 104164.75988164409 
Test RMSE: 278600.68547851406 
Train MAE:69954.49725412646 
Test MAE: 135636.67883134785 
Train MAPE: 12.21% 
Test MAPE: 18.97%


In [135]:
# Persist the fitted XGBoost pipeline. This must be `pipeline_XG`: dumping
# `pipeline` would store the random-forest model under the XGBoost file name.
joblib.dump(pipeline_XG, 'models/XGBoost_Naive_Pipeline.pkl')

['models/XGBoost_Naive_Pipeline.pkl']

Important: SHAP values should be used to determine which features contribute most to the model's predictions.