In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tabulate import tabulate

data = pd.read_csv('csv/final_dataset.csv')
print(data.columns)

# Separate features and response variables
X = data.iloc[:, 2:]                                # features
Y = data['temp_measured']                           # response variable: geothermal reservoir measured temperature
print(f'Features of dataset: {X.columns}')
print(f'Number of compenents in features: {X.shape[1]}')
print(Y.head(10))

Index(['well_sample', 'temp_measured', 'pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl',
       'SO4'],
      dtype='object')
Features of dataset: Index(['pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl', 'SO4'], dtype='object')
Number of compenents in features: 7
0    137
1    137
2    137
3    137
4    150
5    116
6    165
7    140
8    115
9    115
Name: temp_measured, dtype: int64


In [3]:
### Linear Model - Elastic-Net
# Elastic Net is a regularization technique that combines Lasso (L1) and Ridge (L2) regularization or penalties.
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error, mean_absolute_error, make_scorer
import joblib

x_train_lm, x_test_lm, y_train_log_lm, y_test_log_lm = train_test_split(X, np.log(Y), test_size=0.2, random_state=42)
print(f'Number of samples in training set: {x_train_lm.shape[0]}')

'''scaler = StandardScaler()
x_train_lm = scaler.fit_transform(x_train_lm)
x_test_lm = scaler.transform(x_test_lm)'''

# Setting up grid search cross validation for parameters tuning
# esimator: elastic_net model
# param_grid: dictionary of parameters to be tested.
# cv: number of split for cross-validation: 5- fold cross-validation (validación cruzada quíntuple).
# scoring or evaluation metrics: mean squeared error. GridSearchCV maximize the scoring metrics, that's why it's called 'neg_mean_squared_error'.
# The actual calculation is: -1 * mean_squared_error

start_time_lm = time.time()

elastic_net = ElasticNet()
param_lm = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

param_grid_lm = GridSearchCV(
    estimator=elastic_net, 
    param_grid=param_lm, 
    cv=5, 
    scoring='neg_mean_squared_error',
    verbose=0, 
    n_jobs=-1                                                   # n_jobs=-1 utilize all the cores avalaible,
)                                                               # 

# Perform grid search
param_grid_lm.fit(x_train_lm, y_train_log_lm)                   # fit the model to the training data.

# Get the best model
best_model_lm = param_grid_lm.best_estimator_                   # best_estimator_ attribute of the GridSearchCV object returns the best model.
print(f'Best parameters for Elastic-Net: {best_model_lm}')

'''
The best parameters can then be accessed using:

best_params = param_grid_lm.best_params_
best_model = param_grid_lm.best_estimator_
'''

y_pred_test_log_lm = best_model_lm.predict(x_test_lm)           # predict the response variable (Geothermal Reservoir Temperature) using the best model.
y_pred_train_log_lm = best_model_lm.predict(x_train_lm)

y_pred_test_lm = np.exp(y_pred_test_log_lm)                     # convert the predicted response variable to the original scale.
y_pred_train_lm = np.exp(y_pred_train_log_lm)
y_test_lm = np.exp(y_test_log_lm)
y_train_lm = np.exp(y_train_log_lm)

end_time_lm = time.time()
training_time_lm = end_time_lm - start_time_lm
print(f'Training time for Elastic-Net: {training_time_lm}')

# Evaluate the model
def mean_relative_squared_error(y_true, y_pred):
    return np.mean(np.square((y_true - y_pred) / y_true))

r2_lm = r2_score(y_test_lm, y_pred_test_lm)
mse_lm = mean_squared_error(y_test_lm, y_pred_test_lm)
mslr_lm = mean_squared_log_error(y_test_lm, y_pred_test_lm)
mae_lm = mean_absolute_error(y_test_lm, y_pred_test_lm)
mrse_lm = mean_relative_squared_error(y_test_lm, y_pred_test_lm)

eval_metrics_lm = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'Elastic-Net Model': [r2_lm, mse_lm, mae_lm, mslr_lm, mrse_lm, training_time_lm]
}

metrics_lm = pd.DataFrame(eval_metrics_lm)
metrics_lm.to_csv('metrics_lm', index=False)

# Save the model
# joblib.dump(best_model_lm, 'elastic_net_model.pkl')
# print(f'Model saved as elastic_net_model.pkl')

print(tabulate(metrics_lm.round(4), headers='keys', tablefmt='pretty', showindex=False))

Number of samples in training set: 172
Best parameters for Elastic-Net: ElasticNet(alpha=10)
Training time for Elastic-Net: 0.1760709285736084
+---------------+-------------------+
| Eval_metrics  | Elastic-Net Model |
+---------------+-------------------+
|   R2 Score    |      0.0209       |
|      MSE      |     7387.1982     |
|      MAE      |      70.0896      |
|     MSLE      |      0.3832       |
|     MRSE      |      0.8181       |
| Training time |      0.1761       |
+---------------+-------------------+


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [4]:
### Generalized Linear Model - Tweedie Regression

from sklearn.linear_model import TweedieRegressor

x_train_tweedie, x_test_tweedie, y_train_tweedie, y_test_tweedie = train_test_split(X, Y, test_size=0.2, random_state=42)

tw_regressor = TweedieRegressor(power=1, alpha=0.5, link='log')
tw_regressor.fit(x_train_tweedie, y_train_tweedie)

y_pred_test_tweedie = tw_regressor.predict(x_test_tweedie)
y_pred_train_tweedie = tw_regressor.predict(x_train_tweedie)

r2_tweedie = r2_score(y_test_tweedie, y_pred_test_tweedie)
mse_tweedie = mean_squared_error(y_test_tweedie, y_pred_test_tweedie)
mslr_tweedie = mean_squared_log_error(y_test_tweedie, y_pred_test_tweedie)
mae_tweedie = mean_absolute_error(y_test_tweedie, y_pred_test_tweedie)
mrse_tweedie = mean_relative_squared_error(y_test_tweedie, y_pred_test_tweedie)

eval_metrics_tweedie = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE'],
    'Tweedie Model': [r2_tweedie, mse_tweedie, mae_tweedie, mslr_tweedie, mrse_tweedie]
}

metrics_tweedie = pd.DataFrame(eval_metrics_tweedie)
metrics_tweedie.to_csv('metrics_tweedie', index=False)

print(tabulate(metrics_tweedie.round(4), headers='keys', tablefmt='pretty', showindex=False))

# Save the model
# joblib.dump(tw_regressor, 'tweedie_model.pkl')
print(f'Model saved as tweedie_model.pkl')

+--------------+---------------+
| Eval_metrics | Tweedie Model |
+--------------+---------------+
|   R2 Score   |    0.1677     |
|     MSE      |   6279.1402   |
|     MAE      |    68.1122    |
|     MSLE     |    0.4046     |
|     MRSE     |    1.1003     |
+--------------+---------------+
Model saved as tweedie_model.pkl


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
