In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tabulate import tabulate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error, mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split

data = pd.read_csv('csv/final_dataset.csv')
print(data.columns)

# Separate features and response variables
X = data.iloc[:, 2:]                                # features
Y = data['temp_measured']                           # response variable: geothermal reservoir measured temperature
print(f'Features of dataset: {X.columns}')
print(f'Number of compenents in features: {X.shape[1]}')
print(Y.head(10))

Index(['well_sample', 'temp_measured', 'pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl',
       'SO4'],
      dtype='object')
Features of dataset: Index(['pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl', 'SO4'], dtype='object')
Number of compenents in features: 7
0    137
1    137
2    137
3    137
4    150
5    116
6    165
7    140
8    115
9    115
Name: temp_measured, dtype: int64


In [10]:
### Desicion tree with GridSearchCV for parameters tuning

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

start_time_dt_gs = time.time()

x_train_dt, x_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_dt = scaler.fit_transform(x_train_dt)
x_test_dt = scaler.transform(x_test_dt)

# Setting up grid search cross validation for parameters tuning
# param_grid_dt: dictionary of parameters to be tested.
# cv: number of split for cross-validation: 5- fold cross-validation (validación cruzada quíntuple).
# scoring or evaluation metrics: mean squeared error. GridSearchCV maximize the scoring metrics, that's why it's called 'neg_mean_squared_error'.
# The actual calculation is: -1 * mean_squared_error

# Parameters tuning
tree_regressor = DecisionTreeRegressor()
param_grid_dt = {'criterion': ['squared_error'],
            'max_depth': [50, 75, 100],
            'min_samples_split': [5, 10, 25],
            'max_leaf_nodes': [25, 50, 75],
            'min_impurity_decrease': [0, 0.01, 0.1]}

grid_search_cv = GridSearchCV(
    estimator=tree_regressor, 
    param_grid=param_grid_dt, 
    cv=5, 
    verbose=1, 
    scoring='neg_mean_squared_error',
    n_jobs=-1                                         # n_jobs=-1 utilize all the cores avalaible,
    )


grid_search_cv.fit(x_train_dt, y_train_dt)
best_model_dt = grid_search_cv.best_estimator_

print('='*75)
print(f'Desicion tree regressor best parameters: \n{best_model_dt}')
print('='*75)

y_pred_test_dt = best_model_dt.predict(x_test_dt)
y_pred_train_dt = best_model_dt.predict(x_train_dt)

end_time_dt_gs = time.time()
training_time_dt_gs = end_time_dt_gs - start_time_dt_gs

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_dt = r2_score(y_test_dt, y_pred_test_dt)
mse_dt = mean_squared_error(y_test_dt, y_pred_test_dt)
mslr_dt = mean_squared_log_error(y_test_dt, y_pred_test_dt)
mae_dt = mean_absolute_error(y_test_dt, y_pred_test_dt)
mrse_dt = mean_relative_squared_error(y_test_dt, y_pred_test_dt)

eval_metrics_dt = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Desicion_tree_GS': [r2_dt, mse_dt, mae_dt, mslr_dt, mrse_dt, training_time_dt_gs]
}

df_metrics_dt = pd.DataFrame(eval_metrics_dt)
df_metrics_dt.to_csv('metrics/metrics_dt_gs.csv', index=False)

print(tabulate(df_metrics_dt.round(4), headers='keys', tablefmt='pretty', showindex=False))

#joblib.dump(tree_regressor, 'dt_model.joblib')
#print("Decision Tree model saved as 'decision_tree_model.joblib'.")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Desicion tree regressor best parameters: 
DecisionTreeRegressor(max_depth=75, max_leaf_nodes=25,
                      min_impurity_decrease=0.01, min_samples_split=5)
+---------------+------------------+
| Eval_metrics  | Desicion_tree_GS |
+---------------+------------------+
|   R2 Score    |      0.831       |
|      MSE      |    1275.1436     |
|      MAE      |     19.4117      |
|     MSLE      |      0.151       |
|     MRSE      |      1.2998      |
| Training time |      0.2086      |
+---------------+------------------+


In [11]:
### Decision tree with RandomizedSearchCV for parameters tuning

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

start_time_dt_rs = time.time()

x_train_dt, x_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.20, random_state=42)

scaler = StandardScaler()
x_train_dt = scaler.fit_transform(x_train_dt)
x_test_dt = scaler.transform(x_test_dt)

# Parameters tuning with distributions for RandomizedSearchCV
tree_regressor = DecisionTreeRegressor()
param_distributions_dt = {
    'criterion': ['squared_error'],
    'max_depth': randint(50, 100),             
    'min_samples_split': randint(5, 25),        
    'max_leaf_nodes': randint(25, 75),          
    'min_impurity_decrease': uniform(0, 0.1)    
}

random_search_cv = RandomizedSearchCV(
    estimator=tree_regressor, 
    param_distributions=param_distributions_dt, 
    n_iter=50,  # Number of parameter settings sampled
    cv=5, 
    verbose=1, 
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

random_search_cv.fit(x_train_dt, y_train_dt)
best_model_dt = random_search_cv.best_estimator_
print('='*75)
print(f'Decision tree regressor best parameters: \n{best_model_dt}')
print('='*75)
y_pred_test_dt = best_model_dt.predict(x_test_dt)
y_pred_train_dt = best_model_dt.predict(x_train_dt)

end_time_dt_rs = time.time()
training_time_dt_rs = end_time_dt_rs - start_time_dt_rs

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_dt = r2_score(y_test_dt, y_pred_test_dt)
mse_dt = mean_squared_error(y_test_dt, y_pred_test_dt)
mslr_dt = mean_squared_log_error(y_test_dt, y_pred_test_dt)
mae_dt = mean_absolute_error(y_test_dt, y_pred_test_dt)
mrse_dt = mean_relative_squared_error(y_test_dt, y_pred_test_dt)

eval_metrics_dt = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Decision_tree_RS': [r2_dt, mse_dt, mae_dt, mslr_dt, mrse_dt, training_time_dt_rs]
}

df_metrics_dt = pd.DataFrame(eval_metrics_dt)
df_metrics_dt.to_csv('metrics/metrics_dt_rs.csv', index=False)

print(tabulate(df_metrics_dt.round(4), headers='keys', tablefmt='pretty', showindex=False))

#joblib.dump(best_model_dt, 'dt_randomized_model.joblib')
#print("Decision Tree model saved as 'dt_randomized_model.joblib'.")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Decision tree regressor best parameters: 
DecisionTreeRegressor(max_depth=75, max_leaf_nodes=68,
                      min_impurity_decrease=0.09394989415641891,
                      min_samples_split=8)
+---------------+------------------+
| Eval_metrics  | Decision_tree_RS |
+---------------+------------------+
|   R2 Score    |      0.8195      |
|      MSE      |    1361.9817     |
|      MAE      |     19.3159      |
|     MSLE      |      0.1506      |
|     MRSE      |      1.3017      |
| Training time |      0.2521      |
+---------------+------------------+


  _data = np.array(data, dtype=dtype, copy=copy,


In [14]:
### Decision Tree with Optuna hyperparameter optimization

import optuna
from sklearn.model_selection import cross_val_score

start_time_dt_op = time.time()

x_train_dt, x_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.20, random_state=42)

scaler = StandardScaler()
x_train_dt = scaler.fit_transform(x_train_dt)
x_test_dt = scaler.transform(x_test_dt)

def objective_dt(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 50, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 25),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 25, 75),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.1)
    }
    model = DecisionTreeRegressor(**params, random_state=42)
    score = cross_val_score(model, x_train_dt, y_train_dt, cv=5, scoring='neg_mean_squared_error').mean()
    return -score

optuna_dt = optuna.create_study(direction='minimize')
optuna_dt.optimize(objective_dt, n_trials=40, show_progress_bar=False)

print('='*125)
print(f"Best Decision Tree params: \n{optuna_dt.best_params}")
print('='*125)

best_dt = DecisionTreeRegressor(**optuna_dt.best_params, random_state=42)
best_dt.fit(x_train_dt, y_train_dt)
y_pred_dt = best_dt.predict(x_test_dt)

final_time_dt_op = time.time()
training_time_dt_op = final_time_dt_op - start_time_dt_op

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_dt = r2_score(y_test_dt, y_pred_dt)
mse_dt = mean_squared_error(y_test_dt, y_pred_dt)
mae_dt = mean_absolute_error(y_test_dt, y_pred_dt)
msle_dt = mean_squared_log_error(y_test_dt, y_pred_dt)
mrse_dt = mean_relative_squared_error(y_test_dt, y_pred_dt)

dt_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Decision_tree_Op': [r2_dt, mse_dt, mae_dt, msle_dt, mrse_dt, training_time_dt_op]
}

dt_df_metrics = pd.DataFrame(dt_eval_metrics)
dt_df_metrics.to_csv('metrics/metrics_dt_op.csv', index=False)

print(tabulate(dt_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))

[I 2025-08-25 11:14:14,435] A new study created in memory with name: no-name-1bff734f-e003-4ea3-98db-8576c19374a3
[I 2025-08-25 11:14:14,451] Trial 0 finished with value: 1887.6178392863762 and parameters: {'max_depth': 62, 'min_samples_split': 5, 'max_leaf_nodes': 26, 'min_impurity_decrease': 0.004687757094446721}. Best is trial 0 with value: 1887.6178392863762.
[I 2025-08-25 11:14:14,469] Trial 1 finished with value: 2315.778616644655 and parameters: {'max_depth': 55, 'min_samples_split': 24, 'max_leaf_nodes': 35, 'min_impurity_decrease': 0.03631932301086473}. Best is trial 0 with value: 1887.6178392863762.
[I 2025-08-25 11:14:14,484] Trial 2 finished with value: 2309.653616644655 and parameters: {'max_depth': 61, 'min_samples_split': 25, 'max_leaf_nodes': 32, 'min_impurity_decrease': 0.061648516557166425}. Best is trial 0 with value: 1887.6178392863762.
[I 2025-08-25 11:14:14,497] Trial 3 finished with value: 2309.653616644655 and parameters: {'max_depth': 53, 'min_samples_split': 2

Best Decision Tree params: 
{'max_depth': 82, 'min_samples_split': 6, 'max_leaf_nodes': 25, 'min_impurity_decrease': 0.00011707759343830646}
+---------------+------------------+
| Eval_metrics  | Decision_tree_Op |
+---------------+------------------+
|   R2 Score    |      0.7276      |
|      MSE      |    2055.0446     |
|      MAE      |     23.7536      |
|     MSLE      |      0.1663      |
|     MRSE      |      1.3511      |
| Training time |      0.9372      |
+---------------+------------------+


In [17]:
### Random Forest with RandomizedSearchCV for hyperparameter optimization

from sklearn.ensemble import RandomForestRegressor
import time

start_time_rf_rs = time.time()

x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X, Y, test_size=0.20, random_state=42)

scaler = StandardScaler()
x_train_rf = scaler.fit_transform(x_train_rf)
x_test_rf = scaler.transform(x_test_rf)

# Define parameter distributions for RandomizedSearchCV
rf_param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# RandomizedSearchCV with Random Forest
rf_regressor = RandomForestRegressor(random_state=42, n_jobs=-1)

random_search_rf = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=rf_param_distributions,
    n_iter=50,  # Number of parameter settings sampled
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1
)

random_search_rf.fit(x_train_rf, y_train_rf)
best_rf = random_search_rf.best_estimator_

print("="*135)
print(f"Best Random Forest params: \n{random_search_rf.best_params_}")
print("="*135)
y_pred_rf = best_rf.predict(x_test_rf)

end_time_rf_rs = time.time()
training_time_rf_rs= end_time_rf_rs - start_time_rf_rs

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_rf = r2_score(y_test_rf, y_pred_rf)
mse_rf = mean_squared_error(y_test_rf, y_pred_rf)
mae_rf = mean_absolute_error(y_test_rf, y_pred_rf)
msle_rf = mean_squared_log_error(y_test_rf, y_pred_rf)
mrse_rf = mean_relative_squared_error(y_test_rf, y_pred_rf)

rf_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Random_forest_RS': [r2_rf, mse_rf, mae_rf, msle_rf, mrse_rf, training_time_rf_rs]
}

rf_df_metrics = pd.DataFrame(rf_eval_metrics)
rf_df_metrics.to_csv('metrics/metrics_rf_rs.csv', index=False)

print(tabulate(rf_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Random Forest params: 
{'bootstrap': False, 'max_depth': 34, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 363}
+---------------+------------------+
| Eval_metrics  | Random_forest_RS |
+---------------+------------------+
|   R2 Score    |      0.7983      |
|      MSE      |    1521.8001     |
|      MAE      |      21.442      |
|     MSLE      |      0.1257      |
|     MRSE      |      0.7566      |
| Training time |     11.3228      |
+---------------+------------------+


In [18]:
### Random Forest with Optuna hyperparameter optimization

start_time_rf_op = time.time()

x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X, Y, test_size=0.20, random_state=42)

scaler = StandardScaler()
x_train_rf = scaler.fit_transform(x_train_rf)
x_test_rf = scaler.transform(x_test_rf)

def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
    score = cross_val_score(model, x_train_rf, y_train_rf, cv=5, scoring='neg_mean_squared_error').mean()
    return -score

study_rf = optuna.create_study(direction='minimize')
study_rf.optimize(objective_rf, n_trials=40)

print("Best Random Forest params:", study_rf.best_params)

best_rf = RandomForestRegressor(**study_rf.best_params, random_state=42, n_jobs=-1)
best_rf.fit(x_train_rf, y_train_rf)
y_pred_rf = best_rf.predict(x_test_rf)

end_time_rf_op = time.time()
training_time_rf_op = end_time_rf_op - start_time_rf_op

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_rf = r2_score(y_test_rf, y_pred_rf)
mse_rf = mean_squared_error(y_test_rf, y_pred_rf)
mae_rf = mean_absolute_error(y_test_rf, y_pred_rf)
msle_rf = mean_squared_log_error(y_test_rf, y_pred_rf)
mrse_rf = mean_relative_squared_error(y_test_rf, y_pred_rf)

rf_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Random_forest_Op': [r2_rf, mse_rf, mae_rf, msle_rf, mrse_rf, training_time_rf_op]
}

rf_df_metrics = pd.DataFrame(rf_eval_metrics)
rf_df_metrics.to_csv('metrics/metrics_rf_op.csv', index=False)

print(tabulate(rf_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))

[I 2025-08-25 11:16:12,461] A new study created in memory with name: no-name-335c961e-0a6a-4899-90ae-13a6abc789f9
[I 2025-08-25 11:16:14,249] Trial 0 finished with value: 1437.9577771351253 and parameters: {'n_estimators': 344, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 1437.9577771351253.
[I 2025-08-25 11:16:16,498] Trial 1 finished with value: 1202.4290593381006 and parameters: {'n_estimators': 442, 'max_depth': 13, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 1202.4290593381006.
[I 2025-08-25 11:16:16,888] Trial 2 finished with value: 1343.8687435196366 and parameters: {'n_estimators': 57, 'max_depth': 33, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 1202.4290593381006.
[I 2025-08-25 11:16:18,203] Trial 3 finished with value: 2355.188657962865 

Best Random Forest params: {'n_estimators': 412, 'max_depth': 11, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}
+---------------+------------------+
| Eval_metrics  | Random_forest_Op |
+---------------+------------------+
|   R2 Score    |      0.8107      |
|      MSE      |    1428.4317     |
|      MAE      |     20.1389      |
|     MSLE      |      0.126       |
|     MRSE      |      0.7838      |
| Training time |     54.5136      |
+---------------+------------------+


In [19]:
### XGBoost implementation with RandomizedSearchCV for faster hyperparametr tuning
'''
Unlike GridSearchCV, which tries all possible combiationn of hyperparameters,
RandomizedSearchCV samples a fixed number of hyperparameters combinations from
the specified space. This is useful when you have a large hyprparameteres space,
as it saves time by exploring only a subset of all possible combinations.
'''

import xgboost as xgb
import json

start_time_xg_rs = time.time()

x_train_xg, x_test_xg, y_train_log_xg, y_test_log_xg = train_test_split(X, np.log(Y), test_size = 0.20, random_state = 45)

scaler = StandardScaler()
x_train_xg = scaler.fit_transform(x_train_xg)
x_test_xg = scaler.transform(x_test_xg)

xgb_regressor = xgb.XGBRegressor() # (tree_method='gpu_hist', predictor='gpu_predictor', use_label_encoder=False)   # Use GPU for training

# Convert data into DMatrix (XGBoost optimized data structure)
dtrain = xgb.DMatrix(x_train_xg, label=y_train_log_xg)
dtest = xgb.DMatrix(x_test_xg, label=y_test_log_xg)

# Define the space of hyperparameters
xgb_param_distributions = {
    'colsample_bytree': uniform(0.3, 0.7),
    'learning_rate': uniform(0.00001, 1),
    'max_depth': randint(10, 100),
    'n_estimators': randint(50, 5000),
    'subsample': uniform(0.1, 0.7),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1)
}

# Define scoring functions
scorers = {
    'r2': make_scorer(r2_score),
    'neg_mse': make_scorer(mean_squared_error, greater_is_better=False)
}

# RandomizedSearchCV with cross-validation to optimize hyperparameter tuning
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_regressor,
    param_distributions=xgb_param_distributions,
    scoring=scorers,
    refit='r2',         # Refit the model using R2 after searching
    cv=10,              # 10-fold cross-validation
    n_iter=100,         # Number of random combinations to try
    verbose=1,
    random_state=42,
    n_jobs=-1           # -1 to use all processors avalaible
)

random_search_xgb.fit(x_train_xg, y_train_log_xg)

xgb_best_params = random_search_xgb.best_params_

print("="*200)
print(f'Best XGBoost hyperparameters: \n{xgb_best_params}')
print("="*200)

#params_filename = 'xgbRS_best_params.json'
#with open(params_filename, 'w') as params_file:
#    json.dump(xgb_best_params, params_file)
#print(f'Best XGBoost hyperparameters saved to: \n{params_filename}')

xgb_regressor = xgb.XGBRegressor(**xgb_best_params, 
                                 eval_metrics='rmse', 
                                 early_stopping_rounds=1500) #, tree_method='gpu_hist', predictor='gpu_predictor',)  ## Use GPU for training

xgb_model = xgb_regressor.fit(x_train_xg, 
                              y_train_log_xg, 
                              verbose=False, 
                              eval_set = [(x_train_xg, y_train_log_xg), (x_test_xg, y_test_log_xg)])

end_time_xg_rs = time.time()
training_time_xgb_rs = end_time_xg_rs - start_time_xg_rs

# Save the trained XGBoost model to a file
# model_filename = 'xgbRS_model.json'
# xgb_model.save_model(model_filename)
# print(f'XGBoost model saved to {model_filename}')

y_pred_log_test_xg = xgb_model.predict(x_test_xg)
y_pred_log_train_xg = xgb_model.predict(x_train_xg)

y_pred_test_xg = np.exp(y_pred_log_test_xg)
y_pred_train_xg = np.exp(y_pred_log_train_xg)
y_train_xg = np.exp(y_train_log_xg)
y_test_xg = np.exp(y_test_log_xg)

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_xgb= r2_score(y_test_xg, y_pred_test_xg)
mse_xgb = mean_squared_error(y_test_xg, y_pred_test_xg)
mae_xgb = mean_absolute_error(y_test_xg, y_pred_test_xg)
mslr_xgb = mean_squared_log_error(y_test_xg, y_pred_test_xg)
mrse_xgb = mean_relative_squared_error(y_test_xg, y_pred_test_xg)

xgb_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'XGBoost_RS': [r2_xgb, mse_xgb, mae_xgb, mslr_xgb, mrse_xgb, training_time_xgb_rs]
}

xgb_df_metrics = pd.DataFrame(xgb_eval_metrics)
xgb_df_metrics.to_csv('metrics/metrics_xgb_rs.csv', index=False)

print(tabulate(xgb_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best XGBoost hyperparameters: 
{'colsample_bytree': 0.7395302726436355, 'learning_rate': 0.0817690319488719, 'max_depth': 72, 'n_estimators': 4193, 'reg_alpha': 0.06107795985486375, 'reg_lambda': 0.2768776481472037, 'subsample': 0.6643408958551429}


Parameters: { "eval_metrics" } are not used.



+---------------+------------+
| Eval_metrics  | XGBoost_RS |
+---------------+------------+
|   R2 Score    |   0.9387   |
|      MSE      |  443.9573  |
|      MAE      |  14.2474   |
|     MSLE      |   0.0278   |
|     MRSE      |   0.0389   |
| Training time |  43.2044   |
+---------------+------------+


In [20]:
### XGBoost with Optuna hyperparameter optimization

start_time_xg_op = time.time()

x_train_xg, x_test_xg, y_train_log_xg, y_test_log_xg = train_test_split(X, np.log(Y), test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_xg = scaler.fit_transform(x_train_xg)
x_test_xg = scaler.transform(x_test_xg)

def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True)
    }


    model = xgb.XGBRegressor(**params, random_state=42)
    score = cross_val_score(model, x_train_xg, y_train_log_xg, 
                          cv=5, scoring='neg_mean_squared_error').mean()
    return -score

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50)

print("="*100)
print(f"Best XGBoost parameters: \n{study_xgb.best_params}")
print("="*100)

#params_filename = 'xgb_optuna_best_params.json'
#with open(params_filename, 'w') as params_file:
#    json.dump(study_xgb.best_params, params_file)
#print(f'Best parameters saved to: {params_filename}')

best_xgb = xgb.XGBRegressor(**study_xgb.best_params, 
                            random_state=42,  
                            eval_metrics='rmse', 
                            early_stopping_rounds=1500) #, tree_method='gpu_hist', predictor='gpu_predictor',)  ## Use GPU for training)

best_xgb.fit(x_train_xg, 
             y_train_log_xg,
             verbose = True,
             eval_set = [(x_train_xg, y_train_log_xg), (x_test_xg, y_test_log_xg)]
             )

end_time_xg_op = time.time()
training_time_xg_op = end_time_xg_op - start_time_xg_op

#model_filename = 'xgb_optuna_model.json'
#best_xgb.save_model(model_filename)
#print(f'Model saved to: {model_filename}')

y_pred_log_test_xg = best_xgb.predict(x_test_xg)
y_pred_log_train_xg = best_xgb.predict(x_train_xg)

y_pred_test_xg = np.exp(y_pred_log_test_xg)
y_pred_train_xg = np.exp(y_pred_log_train_xg)
y_train_xg = np.exp(y_train_log_xg)
y_test_xg = np.exp(y_test_log_xg)

r2_xgb = r2_score(y_test_xg, y_pred_test_xg)
mse_xgb = mean_squared_error(y_test_xg, y_pred_test_xg)
mae_xgb = mean_absolute_error(y_test_xg, y_pred_test_xg)
msle_xgb = mean_squared_log_error(y_test_xg, y_pred_test_xg)
mrse_xgb = mean_relative_squared_error(y_test_xg, y_pred_test_xg)

xgb_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'XGBoost_Op': [r2_xgb, mse_xgb, mae_xgb, msle_xgb, mrse_xgb, training_time_xg_op]
}

xgb_df_metrics = pd.DataFrame(xgb_eval_metrics)
xgb_df_metrics.to_csv('metrics/metrics_xgb_op.csv', index=False)

print(tabulate(xgb_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))

#Plot optimization history
#optuna.visualization.plot_optimization_history(study_xgb)
#plt.show()

[I 2025-08-25 11:20:30,988] A new study created in memory with name: no-name-5e694853-0734-4c89-9e33-da976b447026
[I 2025-08-25 11:20:32,220] Trial 0 finished with value: 0.37467298699550683 and parameters: {'max_depth': 26, 'learning_rate': 2.9381406729705334e-05, 'n_estimators': 629, 'min_child_weight': 6, 'gamma': 0.00014135830313387655, 'subsample': 0.17715804814974878, 'colsample_bytree': 0.7702370011951241, 'reg_alpha': 3.3382513487061873e-08, 'reg_lambda': 0.000806012494977329}. Best is trial 0 with value: 0.37467298699550683.
[I 2025-08-25 11:20:34,659] Trial 1 finished with value: 0.34640209594585386 and parameters: {'max_depth': 44, 'learning_rate': 0.00016120868296473886, 'n_estimators': 895, 'min_child_weight': 2, 'gamma': 6.428728811158676e-06, 'subsample': 0.13971737317398442, 'colsample_bytree': 0.38252849893995167, 'reg_alpha': 2.228596877305506e-07, 'reg_lambda': 0.3144717616308693}. Best is trial 1 with value: 0.34640209594585386.
[I 2025-08-25 11:20:35,750] Trial 2 f

Best XGBoost parameters: 
{'max_depth': 24, 'learning_rate': 0.03290638655505097, 'n_estimators': 362, 'min_child_weight': 3, 'gamma': 0.0008629817602142118, 'subsample': 0.3625718810264329, 'colsample_bytree': 0.8280957854861865, 'reg_alpha': 0.0011104626478684787, 'reg_lambda': 0.8986530621498459}
[0]	validation_0-rmse:0.59959	validation_1-rmse:0.65469
[1]	validation_0-rmse:0.58786	validation_1-rmse:0.64208
[2]	validation_0-rmse:0.57427	validation_1-rmse:0.62910
[3]	validation_0-rmse:0.56250	validation_1-rmse:0.61868
[4]	validation_0-rmse:0.55159	validation_1-rmse:0.61190
[5]	validation_0-rmse:0.53813	validation_1-rmse:0.60072
[6]	validation_0-rmse:0.52533	validation_1-rmse:0.59222
[7]	validation_0-rmse:0.51343	validation_1-rmse:0.58171
[8]	validation_0-rmse:0.50060	validation_1-rmse:0.57077
[9]	validation_0-rmse:0.49219	validation_1-rmse:0.56310
[10]	validation_0-rmse:0.48105	validation_1-rmse:0.55443
[11]	validation_0-rmse:0.46916	validation_1-rmse:0.54518
[12]	validation_0-rmse:0.

Parameters: { "eval_metrics" } are not used.



[27]	validation_0-rmse:0.34201	validation_1-rmse:0.45744
[28]	validation_0-rmse:0.33538	validation_1-rmse:0.45155
[29]	validation_0-rmse:0.32964	validation_1-rmse:0.44887
[30]	validation_0-rmse:0.32306	validation_1-rmse:0.44361
[31]	validation_0-rmse:0.31718	validation_1-rmse:0.43950
[32]	validation_0-rmse:0.31140	validation_1-rmse:0.43597
[33]	validation_0-rmse:0.30521	validation_1-rmse:0.43191
[34]	validation_0-rmse:0.30007	validation_1-rmse:0.42867
[35]	validation_0-rmse:0.29406	validation_1-rmse:0.42552
[36]	validation_0-rmse:0.28912	validation_1-rmse:0.42325
[37]	validation_0-rmse:0.28546	validation_1-rmse:0.42032
[38]	validation_0-rmse:0.28154	validation_1-rmse:0.41700
[39]	validation_0-rmse:0.27752	validation_1-rmse:0.41493
[40]	validation_0-rmse:0.27440	validation_1-rmse:0.41278
[41]	validation_0-rmse:0.27027	validation_1-rmse:0.40910
[42]	validation_0-rmse:0.26586	validation_1-rmse:0.40621
[43]	validation_0-rmse:0.26215	validation_1-rmse:0.40572
[44]	validation_0-rmse:0.25912	

In [21]:
### TabPFN for Temperature Prediction

from tabpfn import TabPFNRegressor

start_time_tabpfn = time.time()

x_train_tab, x_test_tab, y_train_log_tab, y_test_log_tab = train_test_split(X, np.log(Y), test_size=0.20, random_state=42)

scaler = StandardScaler()
x_train_tab = scaler.fit_transform(x_train_tab)
x_test_tab = scaler.transform(x_test_tab)

tabpfn_regressor = TabPFNRegressor(
    device='cpu',  # Use 'cuda' if you have a GPU
)

tabpfn_regressor.fit(x_train_tab, y_train_log_tab)

y_pred_log_test = tabpfn_regressor.predict(x_test_tab)
y_pred_log_train = tabpfn_regressor.predict(x_train_tab)

y_pred_test = np.exp(y_pred_log_test)
y_pred_train = np.exp(y_pred_log_train)
y_train = np.exp(y_train_log_tab)
y_test = np.exp(y_test_log_tab)

end_time_tabpfn = time.time()
training_time_tabpfn = end_time_tabpfn - start_time_tabpfn

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)

r2_tabpfn = r2_score(y_test, y_pred_test)
mse_tabpfn = mean_squared_error(y_test, y_pred_test)
mae_tabpfn = mean_absolute_error(y_test, y_pred_test)
msle_tabpfn = mean_squared_log_error(y_test, y_pred_test)
mrse_tabpfn = mean_relative_squared_error(y_test, y_pred_test)

tabpfn_eval_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'TabPFN Model': [r2_tabpfn, mse_tabpfn, mae_tabpfn, msle_tabpfn, mrse_tabpfn, training_time_tabpfn]
}

tabpfn_df_metrics = pd.DataFrame(tabpfn_eval_metrics)
tabpfn_df_metrics.to_csv('metrics/metrics_tabpfn.csv', index=False)

print(tabulate(tabpfn_df_metrics.round(4), headers='keys', tablefmt='pretty', showindex=False))




+---------------+--------------+
| Eval_metrics  | TabPFN Model |
+---------------+--------------+
|   R2 Score    |    0.8231    |
|      MSE      |  1334.5401   |
|      MAE      |    17.991    |
|     MSLE      |    0.1016    |
|     MRSE      |    0.3247    |
| Training time |    8.0808    |
+---------------+--------------+
