In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Loading the dataset
file_path = '../data/feature_engineered_data.csv'
df = pd.read_csv(file_path, index_col = 0)

In [16]:
df.index = pd.to_datetime(df.index)

In [18]:
# Adding temporal variables
df['YEAR'] = df.index.year
df['MONTH'] = df.index.month
df['DAY'] = df.index.day
df['QUARTER'] = df.index.quarter

In [22]:
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Lists to store scores for each fold
mse_scores = []
r2_scores = []
mae_scores = []

# Parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 10, 12, 15],
    'min_child_weight': [1, 3, 5, 7, 10],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.4, 0.5],
    'alpha': [0, 0.1, 0.5, 1.0],
    'lambda': [1, 1.5, 2.0, 3.0],
    'scale_pos_weight': [1, 2, 5],
    'max_delta_step': [0, 1, 5],
}

# Features with all variables (including temporal ones)
X = df[['INT_SQFT', 'N_ROOM', 'AREA', 'SALE_COND', 
        'PARK_FACIL', 'UTILITY_AVAIL', 'MZZONE', 'DIST_MAINROAD', 
        'BUILDTYPE_commercial', 'BUILDTYPE_house' , 'PROPERTY_AGE', 'AREA_INT_SQFT', 
        'AREA_N_ROOM','YEAR', 'MONTH', 'DAY', 'QUARTER']].values

# Target variable (total_price)
y = df['SALES_PRICE'].values

# TimeSeries cross-validation
for train_index, test_index in tscv.split(X):
    
    # Train and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # RandomizedSearchCV for XGBoost with all variables
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_grid,
        n_iter=100,  # Number of parameter combinations to try
        scoring='neg_mean_squared_error',
        cv=tscv,  # Use TimeSeriesSplit for cross-validation
        verbose=3,
        n_jobs=-1,
        random_state=42
    )
    
    # Fit the model
    random_search.fit(X_train, y_train)
    best_xgb = random_search.best_estimator_
    
    # Predict on test set
    y_pred = best_xgb.predict(X_test)
    
    # Calculate scores
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Store the scores for each fold
    mse_scores.append(mse)
    r2_scores.append(r2)
    mae_scores.append(mae)

# Output average scores across all folds
print("Average MSE: ", sum(mse_scores) / len(mse_scores))
print("Average R²: ", sum(r2_scores) / len(r2_scores))
print("Average MAE: ", sum(mae_scores) / len(mae_scores))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Average MSE:  747438169163.9238
Average R²:  0.8975293889626934
Average MAE:  636821.4240272373


# After adding interacting variables:
Average MSE:  747438169163.9238

Average R²:  0.8975293889626934

Average MAE:  636821.4240272373

# Before adding interacting variables:
Average MSE:  896852790569.9255

Average R²:  0.8747397615887887

Average MAE:  677170.7884241246