# ANALYSIS

#### IN THE PREVIOUS NOTEBOOK, WE IDENTIFIED TENSORFLOW AND XGBREGRESSOR AS THE TOP CANDIDATE MODELS.

| ASPECT | TENSORFLOW | XGBREGRESSOR |
| ---- | ---- | ---- |
| Accuracy and MAE | 	Good performance, but higher MAE compared to XGBRegressor |	Best performance with higher accuracy and the lowest MAE |
| Efficiency and Resources |	Slower training and more resource-intensive	|Faster training and more computationally efficient |
| Best Use Case |	Suitable when deep learning experimentation is required |	Ideal when high performance, efficiency, and reliability are priorities |


#### NEXT STEPS
- Perform hyperparameter tuning on the XGBRegressor
- Re-evaluate performance against the TensorFlow model
- Select the best-performing model for final deployment

# IMPORT DATA

In [1]:
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
train_df = pd.read_csv('/content/Drive/MyDrive/House_price_predictor/dataset/featured_data/featured_train_data.csv')
test_df = pd.read_csv('/content/Drive/MyDrive/House_price_predictor/dataset/featured_data/featured_test_data.csv')

In [4]:
X_train = train_df.drop(columns=['price'])
y_train = train_df['price']

X_test = test_df.drop(columns=['price'])
y_test = test_df['price']

# HYPERPARAMETER TUNING WITH OPTUNA

In [5]:
!pip install optuna
import optuna
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


## WITHOUT USING LOG VALUES

In [6]:
def objective(trial):

  # PARAMETER
  n_estimators = trial.suggest_int('n_estimators', 100 , 1000)
  max_depth = trial.suggest_int('max_depth', 1, 10)
  learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3)
  eta = trial.suggest_float('eta', 0.1 , 0.5)
  subsample = trial.suggest_int('subsample', 0 , 1)
  random_state = 42


  # MODEL

  model = XGBRegressor(
      n_estimators=n_estimators,
      max_depth=max_depth,
      learning_rate=learning_rate,
      eta=eta,
      subsample=subsample,
      random_state=random_state
  )

  # TRAIN

  model.fit(X_train , y_train, verbose=False)

  y_pred = model.predict(X_test)

  mae = mean_absolute_error(y_test , y_pred)

  return mae


study = optuna.create_study(direction='minimize')
study.optimize(objective , n_trials=50)

print(study.best_params)

[I 2026-01-06 18:09:29,813] A new study created in memory with name: no-name-ab0a44a4-fa83-46dc-8d19-cddb1f9e9510
[I 2026-01-06 18:09:34,350] Trial 0 finished with value: 239635.89824424568 and parameters: {'n_estimators': 114, 'max_depth': 3, 'learning_rate': 0.028553996703346428, 'eta': 0.3873972160771826, 'subsample': 0}. Best is trial 0 with value: 239635.89824424568.
[I 2026-01-06 18:09:57,337] Trial 1 finished with value: 239635.89824424568 and parameters: {'n_estimators': 963, 'max_depth': 2, 'learning_rate': 0.14919098382603144, 'eta': 0.20469934653115282, 'subsample': 0}. Best is trial 0 with value: 239635.89824424568.
[I 2026-01-06 18:10:03,543] Trial 2 finished with value: 239635.89824424568 and parameters: {'n_estimators': 243, 'max_depth': 9, 'learning_rate': 0.18930580772405017, 'eta': 0.31221896619262623, 'subsample': 0}. Best is trial 0 with value: 239635.89824424568.
[I 2026-01-06 18:10:41,056] Trial 3 finished with value: 41702.340532465365 and parameters: {'n_estimat

{'n_estimators': 777, 'max_depth': 4, 'learning_rate': 0.05020333377045197, 'eta': 0.3794940311797137, 'subsample': 1}


## WITH LOG VALUES

In [8]:
def objective(trial):

  # PARAMETER
  n_estimators = trial.suggest_int('n_estimators', 100 , 1000)
  max_depth = trial.suggest_int('max_depth', 1, 10)
  learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3)
  eta = trial.suggest_float('eta', 0.1 , 0.5)
  subsample = trial.suggest_int('subsample', 0 , 1)
  random_state = 42

  y_train_log = np.log1p(y_train)



  # MODEL

  model = XGBRegressor(
      n_estimators=n_estimators,
      max_depth=max_depth,
      learning_rate=learning_rate,
      eta=eta,
      subsample=subsample,
      random_state=random_state
  )

  # TRAIN

  model.fit(X_train , y_train_log, verbose=False)

  y_pred_log = model.predict(X_test)

  y_pred = np.expm1(y_pred_log)

  mae = mean_absolute_error(y_test , y_pred)

  return mae


study = optuna.create_study(direction='minimize')
study.optimize(objective , n_trials=50)

print(study.best_params)
log_best_param = study.best_params

[I 2026-01-06 18:56:02,309] A new study created in memory with name: no-name-d3fe6d1f-edac-4eb1-abe0-66fc430cc989
[I 2026-01-06 18:56:23,487] Trial 0 finished with value: 267941.87659958634 and parameters: {'n_estimators': 756, 'max_depth': 8, 'learning_rate': 0.13858983855245058, 'eta': 0.4530423604658589, 'subsample': 0}. Best is trial 0 with value: 267941.87659958634.
[I 2026-01-06 18:56:29,247] Trial 1 finished with value: 267941.87659958634 and parameters: {'n_estimators': 222, 'max_depth': 9, 'learning_rate': 0.03432715227819864, 'eta': 0.4809556076363297, 'subsample': 0}. Best is trial 0 with value: 267941.87659958634.
[I 2026-01-06 18:56:42,347] Trial 2 finished with value: 267941.87659958634 and parameters: {'n_estimators': 518, 'max_depth': 3, 'learning_rate': 0.2352118170745256, 'eta': 0.28850696768078476, 'subsample': 0}. Best is trial 0 with value: 267941.87659958634.
[I 2026-01-06 18:56:56,892] Trial 3 finished with value: 53348.03564297713 and parameters: {'n_estimators'

{'n_estimators': 549, 'max_depth': 6, 'learning_rate': 0.05485753874997094, 'eta': 0.21400082394417877, 'subsample': 1}


# MODEL RETEST

In [10]:
without_log_params = {'n_estimators': 777, 'max_depth': 4, 'learning_rate': 0.05020333377045197, 'eta': 0.3794940311797137, 'subsample': 1}
log_params = {'n_estimators': 549, 'max_depth': 6, 'learning_rate': 0.05485753874997094, 'eta': 0.21400082394417877, 'subsample': 1}

In [11]:
from xgboost import XGBRegressor

Model_1 = XGBRegressor(**without_log_params)
Model_2 = XGBRegressor(**log_params)

In [13]:
# DATA
X_train = X_train.copy()
X_test = X_test.copy()
y_train = y_train.copy()
y_train_log = np.log1p(y_train)
y_test = y_test.copy()
y_test_log = np.log1p(y_test)

In [14]:
# MODEL 1 -> WITHOUT LOG

Model_1.fit(X_train , y_train , verbose=False)
y_pred_1 = Model_1.predict(X_test)
mae_1 = mean_absolute_error(y_test , y_pred_1)
print(f'Model 1 MAE: {mae_1}')

# MODEL 2 -> WITH LOG

Model_2.fit(X_train , y_train_log , verbose=False)
y_pred_log_2 = Model_2.predict(X_test)
y_pred_2 = np.log1p(y_pred_log_2)

mae_log = mean_absolute_error(y_test_log , y_pred_log_2)
mae_2 = mean_absolute_error(y_test , y_pred_2)

print(f'Model 2 MAE: {mae_2}')
print(f'Model 2 MAE: {mae_log}')

Model 1 MAE: 40059.55462612282
Model 2 MAE: 492252.36775736965
Model 2 MAE: 0.07635364291911705


# SAVING MODEL 1 AS IT PERFORMED THE BEST

In [15]:
import pickle

MODEL_PATH = '/content/Drive/MyDrive/House_price_predictor/XGB_FINAL.pkl'

with open(MODEL_PATH, 'wb') as file:
    pickle.dump(Model_1, file)