In [8]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tabulate import tabulate
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error, mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split

data = pd.read_csv('csv/final_dataset.csv')
print(data.columns)

# Separate features and response variables
X = data.iloc[:, 2:]                                # features
Y = data['temp_measured']                           # response variable: geothermal reservoir measured temperature
print(f'Features of dataset: {X.columns}')
print(f'Number of compenents in features: {X.shape[1]}')
print(Y.head(10))

Index(['well_sample', 'temp_measured', 'pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl',
       'SO4'],
      dtype='object')
Features of dataset: Index(['pH', 'Na ', 'K', 'Ca', 'Mg', 'Cl', 'SO4'], dtype='object')
Number of compenents in features: 7
0    137
1    137
2    137
3    137
4    150
5    116
6    165
7    140
8    115
9    115
Name: temp_measured, dtype: int64


In [9]:
### Scikit-learn MLP Implementation ..... probar RandomizedSearchCV u Optuna

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler

start_time_mlp = time.time()

x_train_mlp, x_test_mlp, y_train_log_mlp, y_test_log_mlp = train_test_split(X, np.log(Y), test_size=0.2, random_state=42)

scaler = RobustScaler()
x_train_mlp = scaler.fit_transform(x_train_mlp)
x_test_mlp = scaler.transform(x_test_mlp)

mlp = MLPRegressor(
    hidden_layer_sizes=(1024, 512, 256),
    activation='relu',
    solver='adam',
    alpha=0.001,  # Regularización L2
    learning_rate='adaptive',
    learning_rate_init=0.0001,
    max_iter=1000,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=20,
    random_state=42
)

mlp.fit(x_train_mlp, y_train_log_mlp)

end_time_mlp = time.time()

y_pred_log_mlp = mlp.predict(x_test_mlp)
y_pred_test_mlp = np.exp(y_pred_log_mlp)
y_test_mlp_orig = np.exp(y_test_log_mlp)

def mean_relative_squared_error(Y_true, Y_pred):
    return np.mean(((Y_true - Y_pred) / Y_true) ** 2)


r2_mlp = r2_score(y_test_mlp_orig, y_pred_test_mlp)
mse_mlp = mean_squared_error(y_test_mlp_orig, y_pred_test_mlp)
mslr_mlp = mean_squared_log_error(y_test_mlp_orig, y_pred_test_mlp)
mae_mlp = mean_absolute_error(y_test_mlp_orig, y_pred_test_mlp)
mrse_mlp = mean_relative_squared_error(y_test_mlp_orig, y_pred_test_mlp)

training_time_mlp = end_time_mlp - start_time_mlp

mlp_metrics = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLE', 'MRSE', 'Training time'],
    'MLP Regressor': [r2_mlp, mse_mlp, mslr_mlp, mae_mlp, mrse_mlp, training_time_mlp]
}

df_mlp = pd.DataFrame(mlp_metrics)
df_mlp.to_csv('metrics/metrics_mlp.csv', index=False)

print(tabulate(df_mlp.round(4), headers='keys', tablefmt='pretty', showindex=False))

+---------------+---------------+
| Eval_metrics  | MLP Regressor |
+---------------+---------------+
|   R2 Score    |    0.5649     |
|      MSE      |   3282.9729   |
|      MAE      |    0.1494     |
|     MSLE      |    37.1181    |
|     MRSE      |    0.2125     |
| Training time |    26.8384    |
+---------------+---------------+


In [10]:
### Neural Network implementation (Keras)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import RobustScaler  # Better for handling outliers


start_time_k = time.time()

x_train_k, x_test_k, y_train_log_k, y_test_log_k = train_test_split(X, np.log(Y), test_size=0.2, random_state=36)

scaler = RobustScaler()
x_train_k = scaler.fit_transform(x_train_k)
x_test_k = scaler.transform(x_test_k)

# Define neural networks architecture with LeakyReLU activation
model = Sequential([
    # Input layer
    Dense(512, input_dim=x_train_k.shape[1], kernel_regularizer=l2(0.01)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),
    
    # Hidden layers
    Dense(256, kernel_regularizer=l2(0.01)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(128, kernel_regularizer=l2(0.01)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.3),
    
    #Dense(64, kernel_regularizer=l2(0.01)),
    #LeakyReLU(alpha=0.1),
    #BatchNormalization(),
    #Dropout(0.3),
    
    # Output layer
    Dense(1, activation='linear')
])

model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='mean_squared_error', 
              metrics=['mean_absolute_error'])

early_stop = EarlyStopping(monitor='val_loss',
                           patience=20, 
                           restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', 
                              actor=0.2, 
                              patience=10, 
                              min_lr=1e-4)

training = model.fit(x_train_k, y_train_log_k, epochs=2000, validation_split=0.2, batch_size=20,
                     verbose=0, callbacks=[early_stop, reduce_lr])

end_time_k = time.time()

#model.save('keras_nn_model.h5')
#print("Model saved to 'keras_nn_model.h5'.")

y_pred_test_log_k = model.predict(x_test_k)
y_pred_train_log_k = model.predict(x_train_k)

y_pred_test_k = np.exp(y_pred_test_log_k) 
y_pred_train_k = np.exp(y_pred_train_log_k)
y_train_k = np.exp(y_train_log_k)
y_test_k = np.exp(y_test_log_k)

Y_test_k = np.squeeze(y_test_k)
Y_pred_test_k = np.squeeze(y_pred_test_k)

training_time_k = end_time_k - start_time_k

def mean_relative_squared_error(y_true, y_pred_test):
    return np.mean(((y_true - y_pred_test)/y_true)**2)

r2_k = r2_score(y_test_k, y_pred_test_k)
mse_k = mean_squared_error(y_test_k, y_pred_test_k)
mae_k = mean_absolute_error(y_test_k, y_pred_test_k)
mslr_k = mean_squared_log_error(y_test_k, y_pred_test_k)
mrse_k = mean_relative_squared_error(Y_test_k, Y_pred_test_k)


eval_metrics_k = {
    'Eval_metrics': ['R2 Score', 'MSE', 'MAE', 'MSLR', 'MRSE', 'Training time'],
    'NN TensorFlow': [r2_k, mse_k, mae_k, mslr_k, mrse_k, training_time_k]
}

df_metrics_k = pd.DataFrame(eval_metrics_k)
df_metrics_k.to_csv('metrics/metrics_nn.csv', index=False)

print(tabulate(df_metrics_k.round(4), headers='keys', tablefmt='pretty', showindex=False))

### reducir learning_rate de Adam de 0,01 a 0,001 mejor el r2, 
### de 0,001 a 0,0001 no lo mejoro y aumento mucho el tiempo.
### Agregar input layer de 1024 no mejoro nada y aumento el tiempo
### Con RobustScaler mejoro respecto de StandardScaler.
### Prueba l2 de 0.01 a 0.001, no mejora nada, queda 0.01

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
+---------------+---------------+
| Eval_metrics  | NN TensorFlow |
+---------------+---------------+
|   R2 Score    |    0.5827     |
|      MSE      |   2978.1181   |
|      MAE      |    36.557     |
|     MSLR      |    0.1303     |
|     MRSE      |    0.2163     |
| Training time |    44.852     |
+---------------+---------------+
