In [1]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import OrdinalEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [2]:
Well_C1=pd.read_excel("well_C1.xlsx")

Train_Wells={}

for i in list(range(1,5)):
    path="TEST-"+ str(list(range(5)))
    file= glob.glob(path + ".xlsx")

    Train_Wells['TEST_'+ str(i)] = pd.read_excel(file[i-1])
    
    i=i+1   

In [3]:
Well_C1=Well_C1.drop(['WELL','LOI', 'Cr2O3', 'V2O5'], axis=1)

### Categorical features to numerical

In [4]:
def ordinal_encoder(test):
    well_encoder = OrdinalEncoder(categories=[Train_Wells[test]['WELL'].unique()])

    well_encoder.fit(Train_Wells[test][["WELL"]])
    Train_Wells[test][["Well"]] = well_encoder.transform(Train_Wells[test][["WELL"]])
    Train_Wells[test] = Train_Wells[test].drop('WELL', axis=1)
    
    Well_C1["Well"] = Train_Wells[test]['Well'].nunique()           

In [5]:
tests=['TEST_1','TEST_2','TEST_3','TEST_4']
traces=['SO3', 'Ba', 'Cu', 'Ga', 'Mo', 'Nb', 'Ni','Pb', 'Rb', 'Sr', 'Th', 'U', 'Y', 'Zn', 'Zr', 'Cr', 'V']
Tests={}
for test in tests:
    ordinal_encoder(test)
    Tests[test] = Train_Wells[test].drop(traces, axis=1)
    Tests['C1_'+ test] = Well_C1.copy()

## Random Forest

In [6]:
C1_depth=Well_C1.loc[:,'Depth']

In [7]:
def Random_Forest(test, C1_test):
    
    wells_scores = pd.DataFrame()
    C1_scores = pd.DataFrame()
    pred = pd.DataFrame(C1_depth)
        
    for col in Tests[test].iloc[:,1:-1]:
        X = Tests[test].drop([col], axis=1)
        y = Tests[test][col]
        
                
        X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            train_size   = 0.7,
                                            random_state = 123,
                                            shuffle      = True
                                        )        
    
        # Evaluated Hyperparameters
    
        param_grid = {'n_estimators': [30,70,150],
                      'max_features': [5, 10, 25],
                      'max_depth'   : [None, 3, 10, 20]
                     }

        # Grid
    
        grid = GridSearchCV(
                estimator  = RandomForestRegressor(random_state = 123),
                param_grid = param_grid,
                scoring    = 'neg_root_mean_squared_error',
                n_jobs     = - 1,
                cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
                refit      = True,
                verbose    = 0,
                return_train_score = True
               )

        grid.fit(X = X_train, y = y_train)

        print("Final model best hyperparameters: ")        
        print("")       
        print(grid.best_params_, ":", grid.best_score_, grid.scoring)
        print("")
    
        # Test scores
        
        final_model = grid.best_estimator_
        predictions = final_model.predict(X = X_test)
        
        rmse = mean_squared_error(
                y_true  = y_test,
                y_pred  = predictions,
                squared = False
               )
                
        nrmse = rmse/(y_test.max()-y_test.min())
        
        R2 = r2_score(y_test, predictions)        
        
        print("Wells Test Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse}")
        print(f"NRMSE: {nrmse}")        
        print(f"R2 score: {R2}")
        print("")
        print("")
        
                  
        # Save well test validation
                
        wells_scores = wells_scores.append({"y_hat": col, f"rmse_RF_{test}": rmse, f"nrmse_RF_{test}": nrmse,
                                           f"R2_RF_{test}": R2}, ignore_index=True)
                
        wells_scores.to_excel(f"Wells_Mayor_scores_RF_{test}.xlsx")         
    
        
        # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
        X_test_C1 = Tests[C1_test].drop([col], axis=1)
        y_test_C1 = Tests[C1_test][col]
    
        C1_predictions = final_model.predict(X_test_C1) 
    
        rmse_C1 = mean_squared_error(
                    y_true  = y_test_C1,
                    y_pred  = C1_predictions,
                    squared = False
                   )
                
        nrmse_C1 = rmse_C1/(y_test_C1.max()-y_test_C1.min())
        
        R2_C1 = r2_score(y_test_C1, C1_predictions) 
                
        
        print("C1 Prediction Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse_C1}")
        print(f"NRMSE: {nrmse_C1}")        
        print(f"R2 score: {R2_C1}")
        print("")
        print("")
                
         
        # Save C1 predicted curves
        
        C1_pred = pd.DataFrame()

        C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
        pred=pd.concat([pred,C1_pred],axis=1)
        
        pred.columns=pred.columns.str.replace('Predictions', f"{col}")
        
        pred.to_excel(f"C1_Mayor_pred_RF_{test}.xlsx")     
        
        
        # Save C1 predictions scores
    
        C1_scores = C1_scores.append({'y_hat':col,f"rmse_RF_{test}": rmse_C1, f"nrmse_RF_{test}": nrmse_C1,
                                 f"R2_RF_{test}": R2_C1}, ignore_index=True)
        
        C1_scores.to_excel(f"C1_Mayor_scores_RF_{test}.xlsx")         
                
    
        print("")
                
    print(f"Well {test} validation")
    print(wells_scores)
    print("")
    print("")
    print(f"C1 {test} scores")
    print(C1_scores)

In [8]:
Random_Forest('TEST_1', 'C1_TEST_1')

Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 5, 'n_estimators': 150} : -0.33756504576820184 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.2681016230040316
NRMSE: 0.020790888280014706
R2 score: 0.9826982971589855


C1 Prediction Scores
Al2O3:
RMSE: 0.7371085573263692
NRMSE: 0.04697951289524342
R2 score: 0.9263074355294078



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 5, 'n_estimators': 150} : -1.7791619861755563 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 1.4003630431716758
NRMSE: 0.02919206763573817
R2 score: 0.9685839865471649


C1 Prediction Scores
SiO2:
RMSE: 4.817915391690907
NRMSE: 0.0918047902380127
R2 score: 0.8461353144428179



Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 5, 'n_estimators': 150} : -0.025898083451094314 neg_root_mean_squared_error

Wells Test Scores
TiO2:
RMSE: 0.03166529100932136
NRMSE: 0.04623952776583484
R2 score: 0.9095671383208707


C1 Prediction S

In [9]:
Random_Forest('TEST_2', 'C1_TEST_2')

Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 5, 'n_estimators': 150} : -0.366778146161944 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.3791612260025159
NRMSE: 0.03386846810817543
R2 score: 0.9609899017850667


C1 Prediction Scores
Al2O3:
RMSE: 0.8928413090364804
NRMSE: 0.05690511848543534
R2 score: 0.8918791839417666



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 5, 'n_estimators': 150} : -1.8938907510427334 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 2.3150981692075896
NRMSE: 0.04884484558314875
R2 score: 0.9150256879588414


C1 Prediction Scores
SiO2:
RMSE: 4.7717333796241
NRMSE: 0.09092479763003239
R2 score: 0.8490709096307181



Final model best hyperparameters: 

{'max_depth': 20, 'max_features': 5, 'n_estimators': 150} : -0.026339960057896437 neg_root_mean_squared_error

Wells Test Scores
TiO2:
RMSE: 0.030197571902869487
NRMSE: 0.049524513165837614
R2 score: 0.8831650833035147


C1 Prediction Sco

In [10]:
Random_Forest('TEST_3', 'C1_TEST_3')

Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 5, 'n_estimators': 150} : -0.36368175032751215 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.29852101160143213
NRMSE: 0.02073627067933994
R2 score: 0.9812247811901312


C1 Prediction Scores
Al2O3:
RMSE: 0.8209694734254923
NRMSE: 0.05232437689136345
R2 score: 0.90858556456075



Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 10, 'n_estimators': 150} : -2.0089189472106415 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 1.6877705691316929
NRMSE: 0.033659481859334754
R2 score: 0.957045363178637


C1 Prediction Scores
SiO2:
RMSE: 4.740461805661689
NRMSE: 0.09032892160178523
R2 score: 0.8510426562712772



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 5, 'n_estimators': 150} : -0.032375128546626865 neg_root_mean_squared_error

Wells Test Scores
TiO2:
RMSE: 0.025656333956393624
NRMSE: 0.04081893587742009
R2 score: 0.9329840219249219


C1 Prediction 

In [11]:
Random_Forest('TEST_4', 'C1_TEST_4')

Final model best hyperparameters: 

{'max_depth': None, 'max_features': 10, 'n_estimators': 150} : -0.33906016074429596 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.34843120499654257
NRMSE: 0.024203200107011255
R2 score: 0.9733690984929319


C1 Prediction Scores
Al2O3:
RMSE: 0.7665577008593539
NRMSE: 0.04885645002290338
R2 score: 0.9203014416258785



Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 10, 'n_estimators': 70} : -1.9613449421152533 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 2.0239502882835505
NRMSE: 0.04036396845557263
R2 score: 0.9341769277332768


C1 Prediction Scores
SiO2:
RMSE: 5.254016373879652
NRMSE: 0.10011464127057262
R2 score: 0.8170200690179052



Final model best hyperparameters: 

{'max_depth': 10, 'max_features': 5, 'n_estimators': 150} : -0.02518417723989991 neg_root_mean_squared_error

Wells Test Scores
TiO2:
RMSE: 0.035053982018028854
NRMSE: 0.05801238232193438
R2 score: 0.8406632767696682


C1 Predictio

## Histogram-based Gradient Boosting Regression Tree 

In [12]:
def HGBT(test, C1_test):
    
    wells_scores = pd.DataFrame()
    C1_scores = pd.DataFrame()
    pred = pd.DataFrame(C1_depth)
        
    for col in Tests[test].iloc[:,1:-1]:
        X = Tests[test].drop([col], axis=1)
        y = Tests[test][col]
        
                
        X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            train_size   = 0.7,
                                            random_state = 123,
                                            shuffle      = True
                                        ) 


        # Evaluated Hyperparameters
    
        param_grid = {'loss'             : ['squared_error', 'absolute_error'],
                      'learning_rate'    : [0.001, 0.01, 0.1],
                      'max_depth'        : [3, 5, 10, 20],
                      'l2_regularization': [0, 1, 10]
                     }

        # Grid
        
        grid = GridSearchCV(
                estimator  = HistGradientBoostingRegressor(
                                max_iter            = 1000, 
                                random_state        = 123,
                                early_stopping      = True,
                                validation_fraction = 0.1,
                                n_iter_no_change    = 10,
                                tol                 = 1e-7,
                                scoring             = 'loss',
                            ),
                param_grid = param_grid,
                scoring    = 'neg_root_mean_squared_error',
                n_jobs     = multiprocessing.cpu_count() - 1,
                cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
                refit      = True,
                verbose    = 0,
                return_train_score = True
               )

        grid.fit(X = X_train, y = y_train)
        
        
        print("Final model best hyperparameters: ")        
        print("")       
        print(grid.best_params_, ":", grid.best_score_, grid.scoring)
        print("")        
        print(f"Final trees early stopping: {grid.best_estimator_.n_iter_}")
        print("")    
        
        
        # Test scores
    
        final_model = grid.best_estimator_
        predictions = final_model.predict(X = X_test)
        rmse = mean_squared_error(
                y_true  = y_test,
                y_pred  = predictions,
                squared = False
           )
        
        nrmse = rmse/(y_test.max()-y_test.min())
        
        R2 = r2_score(y_test, predictions)        
        
        print("Wells Test Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse}")
        print(f"NRMSE: {nrmse}")        
        print(f"R2 score: {R2}")
        print("")
        print("")
        
                  
        # Save well test validation
        
        wells_scores = wells_scores.append({"y_hat":col,f"rmse_HGBT_{test}": rmse, f"nrmse_HGBT_{test}": nrmse,
                                           f"R2_HGBT_{test}": R2},ignore_index=True)
                
        wells_scores.to_excel(f"Wells_Mayor_scores_HGBT_{test}.xlsx")  
    
           
        # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
        X_test_C1 = Tests[C1_test].drop([col], axis=1)
        y_test_C1 = Tests[C1_test][col]
        
        C1_predictions = final_model.predict(X_test_C1) 
    
        rmse_C1 = mean_squared_error(
                    y_true  = y_test_C1,
                    y_pred  = C1_predictions,
                    squared = False
                   )
                
        nrmse_C1 = rmse_C1/(y_test_C1.max()-y_test_C1.min())
        
        R2_C1 = r2_score(y_test_C1, C1_predictions) 
                
        
        print("C1 Prediction Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse_C1}")
        print(f"NRMSE: {nrmse_C1}")        
        print(f"R2 score: {R2_C1}")
        print("")
        print("")   
    
        # Save C1 predicted curves
        
        C1_pred = pd.DataFrame()

        C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
        pred = pd.concat([pred,C1_pred],axis=1)
        
        pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
        pred.to_excel(f"C1_Mayor_pred_HGBT_{test}.xlsx")  
        
        
        # Save C1 predictions scores
    
        C1_scores = C1_scores.append({'y_hat':col,f"rmse_HGBT_{test}": rmse_C1, f"nrmse_HGBT_{test}": nrmse_C1,
                                 f"R2_HGBT_{test}": R2_C1}, ignore_index=True)
        
        C1_scores.to_excel(f"C1_Mayor_scores_HGBT_{test}.xlsx")         
                
    
        print("")
        
    print(f"Well {test} validation")
    print(wells_scores)
    print("")
    print("")
    print(f"C1 {test} scores")
    print(C1_scores)     
           

In [13]:
HGBT('TEST_1', 'C1_TEST_1')

Final model best hyperparameters: 

{'l2_regularization': 10, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3} : -0.5851221361984279 neg_root_mean_squared_error

Final trees early stopping: 36

Wells Test Scores
Al2O3:
RMSE: 0.49564725278164157
NRMSE: 0.03843671867187599
R2 score: 0.9408662570320647


C1 Prediction Scores
Al2O3:
RMSE: 1.0037909521323576
NRMSE: 0.06397647878472643
R2 score: 0.8633381431077904



Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -2.8235475924983917 neg_root_mean_squared_error

Final trees early stopping: 47

Wells Test Scores
SiO2:
RMSE: 1.6234442468949428
NRMSE: 0.03384243428109182
R2 score: 0.9577774411668333


C1 Prediction Scores
SiO2:
RMSE: 5.289295751514374
NRMSE: 0.10078688550903914
R2 score: 0.8145544918801993



Final model best hyperparameters: 

{'l2_regularization': 1, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 10} : -0.04345309164510813

In [14]:
HGBT('TEST_2', 'C1_TEST_2')

Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -0.5472089445502716 neg_root_mean_squared_error

Final trees early stopping: 107

Wells Test Scores
Al2O3:
RMSE: 0.5363788430063392
NRMSE: 0.04791188679756958
R2 score: 0.9219321023115044


C1 Prediction Scores
Al2O3:
RMSE: 1.046798074833129
NRMSE: 0.06671753185679599
R2 score: 0.8513768046065631



Final model best hyperparameters: 

{'l2_regularization': 10, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 10} : -2.283808194905339 neg_root_mean_squared_error

Final trees early stopping: 282

Wells Test Scores
SiO2:
RMSE: 2.052602726522527
NRMSE: 0.04330661418770831
R2 score: 0.9332027417221245


C1 Prediction Scores
SiO2:
RMSE: 5.11217077325779
NRMSE: 0.09741179064896703
R2 score: 0.8267667226186981



Final model best hyperparameters: 

{'l2_regularization': 10, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3} : -0.03020623034290389 n

In [15]:
HGBT('TEST_3', 'C1_TEST_3')

Final model best hyperparameters: 

{'l2_regularization': 1, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3} : -0.5688635243004324 neg_root_mean_squared_error

Final trees early stopping: 250

Wells Test Scores
Al2O3:
RMSE: 0.33644273893612087
NRMSE: 0.023370441046182078
R2 score: 0.9761516949926428


C1 Prediction Scores
Al2O3:
RMSE: 0.9816968692830123
NRMSE: 0.06256831544187459
R2 score: 0.8692879650587503



Final model best hyperparameters: 

{'l2_regularization': 1, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3} : -2.6230921761604615 neg_root_mean_squared_error

Final trees early stopping: 177

Wells Test Scores
SiO2:
RMSE: 1.7214989102080622
NRMSE: 0.03433213162901854
R2 score: 0.9553114013048215


C1 Prediction Scores
SiO2:
RMSE: 5.263421071386581
NRMSE: 0.10029384663465284
R2 score: 0.8163644140069741



Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -0.0426299293911093

In [16]:
HGBT('TEST_4', 'C1_TEST_4')

Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -0.6125556653275294 neg_root_mean_squared_error

Final trees early stopping: 185

Wells Test Scores
Al2O3:
RMSE: 0.4981547636017201
NRMSE: 0.034603500647517944
R2 score: 0.945564732944087


C1 Prediction Scores
Al2O3:
RMSE: 1.1629414713791526
NRMSE: 0.07411991532053236
R2 score: 0.8165674176765976



Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -2.4574603841317937 neg_root_mean_squared_error

Final trees early stopping: 276

Wells Test Scores
SiO2:
RMSE: 2.225658021997889
NRMSE: 0.04438665846333727
R2 score: 0.920403249952868


C1 Prediction Scores
SiO2:
RMSE: 5.607740657057741
NRMSE: 0.10685481434942341
R2 score: 0.7915526124300882



Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 10} : -0.029727896080532095 

# XGBoost

In [17]:
def XGB(test, C1_test):
    
    wells_scores = pd.DataFrame()
    C1_scores = pd.DataFrame()
    pred = pd.DataFrame(C1_depth)
        
    for col in Tests[test].iloc[:,1:-1]:
        X = Tests[test].drop([col], axis=1)
        y = Tests[test][col]
        
                
        X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            train_size   = 0.7,
                                            random_state = 123,
                                            shuffle      = True
                                        )        
    
        # Evaluated Hyperparameters    
    
        param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
                      'subsample'        : [0.5, 1],
                      'learning_rate'    : [0.001, 0.01, 0.1],
                      'booster'          : ['gbtree']
                 }


        # Creation of validation set
    
        np.random.seed(123)
        idx_validacion = np.random.choice(
                            X_train.shape[0],
                            size=int(X_train.shape[0]*0.1), 
                            replace=False
                         )

        X_val = X_train.iloc[idx_validacion, :].copy()
        y_val = y_train.iloc[idx_validacion].copy()

        X_train_grid = X_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()
        y_train_grid = y_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()

        
        fit_params = {"eval_set": [(X_val, y_val)],
                       "verbose": False
                     }

        # Grid
    
        grid = GridSearchCV(
                estimator  = XGBRegressor(
                                n_estimators          = 1000,
                                early_stopping_rounds = 5,
                                eval_metric           = "rmse",
                                random_state          = 123
                            ),
                param_grid = param_grid,
                scoring    = 'neg_root_mean_squared_error',
                n_jobs     = multiprocessing.cpu_count() - 1,
                cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
                refit      = True,
                verbose    = 0,
                return_train_score = True
               )

        grid.fit(X = X_train_grid, y = y_train_grid, **fit_params)

        print("Final model best hyperparameters: ")        
        print("")       
        print(grid.best_params_, ":", grid.best_score_, grid.scoring)
        print("")
        n_trees_incluid = len(grid.best_estimator_.get_booster().get_dump())
        print(f"Final trees early stopping: {n_trees_incluid}")
        print("")
    
        # Test scores       
    
        final_model = grid.best_estimator_
        predictions = final_model.predict(X_test)
        
        rmse = mean_squared_error(
                y_true  = y_test,
                y_pred  = predictions,
                squared = False
               )
                
        nrmse = rmse/(y_test.max()-y_test.min())
        
        R2 = r2_score(y_test, predictions)        
        
        print("Wells Test Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse}")
        print(f"NRMSE: {nrmse}")        
        print(f"R2 score: {R2}")
        print("")
        print("")
    
        # Save well test validation
                
        wells_scores= wells_scores.append({"y_hat":col,f"rmse_XGB_{test}": rmse, f"nrmse_XGB_{test}": nrmse,
                                           f"R2_XGB_{test}": R2}, ignore_index=True)
                
        wells_scores.to_excel(f"Wells_Mayor_scores_XGB_{test}.xlsx") 
        
    
        # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
        X_test_C1 = Tests[C1_test].drop([col], axis=1)
        y_test_C1 = Tests[C1_test][col]
    
        C1_predictions = final_model.predict(X_test_C1) 
    
        rmse_C1 = mean_squared_error(
                    y_true  = y_test_C1,
                    y_pred  = C1_predictions,
                    squared = False
                   )
                
        nrmse_C1 = rmse_C1/(y_test_C1.max()-y_test_C1.min())
        
        R2_C1 = r2_score(y_test_C1, C1_predictions) 
                
        
        print("C1 Prediction Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse_C1}")
        print(f"NRMSE: {nrmse_C1}")        
        print(f"R2 score: {R2_C1}")
        print("")
        print("")
        
        # Save C1 predicted curves
        
        C1_pred = pd.DataFrame()

        C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
        pred = pd.concat([pred,C1_pred],axis=1)
        
        pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
        pred.to_excel(f"C1_Mayor_pred_XGB_{test}.xlsx")  
        
        
        # Save C1 predictions scores
    
        C1_scores = C1_scores.append({'y_hat':col,f"rmse_XGB_{test}": rmse_C1, f"nrmse_XGB_{test}": nrmse_C1,
                                 f"R2_XGB_{test}": R2_C1}, ignore_index=True)
        
        C1_scores.to_excel(f"C1_Mayor_scores_XGB_{test}.xlsx")
        
        
        print("")
                
    print(f"Well {test} validation")
    print(wells_scores)
    print("")
    print("")
    print(f"C1 {test} scores")
    print(C1_scores)    


In [18]:
XGB('TEST_1', 'C1_TEST_1')

Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 3, 'subsample': 0.5} : -0.37103677028311016 neg_root_mean_squared_error

Final trees early stopping: 589

Wells Test Scores
Al2O3:
RMSE: 0.2671491744016307
NRMSE: 0.020717027285578744
R2 score: 0.9828210096675655


C1 Prediction Scores
Al2O3:
RMSE: 0.7584099574421082
NRMSE: 0.048337154712690136
R2 score: 0.9219866699162895



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': None, 'subsample': 0.5} : -1.978757971434418 neg_root_mean_squared_error

Final trees early stopping: 42

Wells Test Scores
SiO2:
RMSE: 1.3725971099731473
NRMSE: 0.028613257016696814
R2 score: 0.9698174483241756


C1 Prediction Scores
SiO2:
RMSE: 4.344764174985751
NRMSE: 0.08278895150506384
R2 score: 0.8748724234725603



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': None, 'subsample': 0.5} : -0.028578985563532555 neg_root_mean_square

In [19]:
XGB('TEST_2', 'C1_TEST_2')

Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 10, 'subsample': 0.5} : -0.3704100352833357 neg_root_mean_squared_error

Final trees early stopping: 37

Wells Test Scores
Al2O3:
RMSE: 0.42825206158258844
NRMSE: 0.038253492960997115
R2 score: 0.9502345276418631


C1 Prediction Scores
Al2O3:
RMSE: 0.8940634578405375
NRMSE: 0.056983011971990925
R2 score: 0.891582983120471



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': None, 'subsample': 0.5} : -1.7345445130843633 neg_root_mean_squared_error

Final trees early stopping: 73

Wells Test Scores
SiO2:
RMSE: 2.344078175399965
NRMSE: 0.04945627707503653
R2 score: 0.9128849847680952


C1 Prediction Scores
SiO2:
RMSE: 4.590801773102663
NRMSE: 0.08747716793259647
R2 score: 0.8602995847451357



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5} : -0.02709090508508721 neg_root_mean_squared_error

In [20]:
XGB('TEST_3', 'C1_TEST_3')

Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 10, 'subsample': 0.5} : -0.34286924016406944 neg_root_mean_squared_error

Final trees early stopping: 517

Wells Test Scores
Al2O3:
RMSE: 0.27268091889771773
NRMSE: 0.018941331174716847
R2 score: 0.9843344843268416


C1 Prediction Scores
Al2O3:
RMSE: 0.7236761343473521
NRMSE: 0.04612339925732009
R2 score: 0.9289687814762118



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 10, 'subsample': 0.5} : -2.02417182217658 neg_root_mean_squared_error

Final trees early stopping: 541

Wells Test Scores
SiO2:
RMSE: 1.6115102953498537
NRMSE: 0.03213861086602889
R2 score: 0.9608393942279121


C1 Prediction Scores
SiO2:
RMSE: 4.734343794512905
NRMSE: 0.09021234364544407
R2 score: 0.8514268950451367



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 1} : -0.03351685351957298 neg_root_mean_squared_error

In [21]:
XGB('TEST_4', 'C1_TEST_4')

Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 20, 'subsample': 0.5} : -0.3566463425975119 neg_root_mean_squared_error

Final trees early stopping: 541

Wells Test Scores
Al2O3:
RMSE: 0.40432292557261496
NRMSE: 0.028085626474194013
R2 score: 0.9641401460273247


C1 Prediction Scores
Al2O3:
RMSE: 0.7763183130591269
NRMSE: 0.04947854130395966
R2 score: 0.9182589097194738



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5} : -1.9705281150431844 neg_root_mean_squared_error

Final trees early stopping: 48

Wells Test Scores
SiO2:
RMSE: 2.097709086472814
NRMSE: 0.04183495211592589
R2 score: 0.9292919296703168


C1 Prediction Scores
SiO2:
RMSE: 5.2612872221965725
NRMSE: 0.10025318639856273
R2 score: 0.816513279625293



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 3, 'subsample': 0.5} : -0.026379329951705888 neg_root_mean_squared_erro

# LightGBM

In [22]:
def LGBM(test, C1_test):
    
    wells_scores = pd.DataFrame()
    C1_scores = pd.DataFrame()
    pred = pd.DataFrame(C1_depth)
    
    for col in Tests[test].iloc[:,1:-1]:
        X = Tests[test].drop([col], axis=1)
        y = Tests[test][col]
        
                
        X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            train_size   = 0.7,
                                            random_state = 123,
                                            shuffle      = True
                                        )        
    
        # Evaluated Hyperparameters
    
        param_grid = {'n_estimators'     : [100, 500, 1000, 5000],
                      'max_depth'        : [-1, 1, 3, 5, 10, 20],
                      'subsample'        : [0.5, 1],
                      'learning_rate'    : [0.001, 0.01, 0.1],
                      'boosting_type'    : ['gbdt']
                     }

        # Grid
        
        grid = GridSearchCV(
                estimator  = LGBMRegressor(random_state=123),
                param_grid = param_grid,
                scoring    = 'neg_root_mean_squared_error',
                n_jobs     = multiprocessing.cpu_count() - 1,
                cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
                refit      = True,
                verbose    = 0,
                return_train_score = True
               )

        grid.fit(X = X_train, y = y_train)

        print("Final model best hyperparameters: ")        
        print("")       
        print(grid.best_params_, ":", grid.best_score_, grid.scoring)
        print("")
    
        
        # Test scores
        
        final_model = grid.best_estimator_
        predictions = final_model.predict(X = X_test)
        
        rmse = mean_squared_error(
                y_true  = y_test,
                y_pred  = predictions,
                squared = False
               )
        
        nrmse = rmse/(y_test.max()-y_test.min())
        
        R2 = r2_score(y_test, predictions)        
        
        print("Wells Test Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse}")
        print(f"NRMSE: {nrmse}")        
        print(f"R2 score: {R2}")
        print("")
        print("")
        
                  
        # Save well test validation
                
        wells_scores= wells_scores.append({"y_hat":col,f"rmse_LGBM_{test}": rmse, f"nrmse_LGBM_{test}": nrmse,
                                           f"R2_LGBM_{test}": R2}, ignore_index=True)
                
        wells_scores.to_excel(f"Wells_Mayor_scores_LGBM_{test}.xlsx")         
    
        
        # WELL C1 MAYOR ELEMENTS PREDICTIONS
    
        X_test_C1 = Tests[C1_test].drop([col], axis=1)
        y_test_C1 = Tests[C1_test][col]
    
        C1_predictions = final_model.predict(X_test_C1) 
    
        rmse_C1 = mean_squared_error(
                    y_true  = y_test_C1,
                    y_pred  = C1_predictions,
                    squared = False
                   )
                
        nrmse_C1 = rmse_C1/(y_test_C1.max()-y_test_C1.min())
        
        R2_C1 = r2_score(y_test_C1, C1_predictions) 
                
        
        print("C1 Prediction Scores")
        print(f"{col}:")
        print(f"RMSE: {rmse_C1}")
        print(f"NRMSE: {nrmse_C1}")        
        print(f"R2 score: {R2_C1}")
        print("")
        print("")
                
         
        # Save C1 predicted curves
        
        C1_pred = pd.DataFrame()

        C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
        pred = pd.concat([pred,C1_pred],axis=1)
        
        pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
        pred.to_excel(f"C1_Mayor_pred_LGBM_{test}.xlsx") 
        
        
        # Save C1 predictions scores
    
        C1_scores = C1_scores.append({'y_hat':col,f"rmse_LGBM_{test}": rmse_C1, f"nrmse_LGBM_{test}": nrmse_C1,
                                 f"R2_LGBM_{test}": R2_C1}, ignore_index=True)
        
        C1_scores.to_excel(f"C1_Mayor_scores_LGBM_{test}.xlsx")         
                
    
        print("")
        
    print(f"Well {test} validation")
    print(wells_scores)
    print("")
    print("")
    print(f"C1 {test} scores")
    print(C1_scores)
 

In [23]:
LGBM('TEST_1', 'C1_TEST_1')

Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -0.462656334114892 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.4300201289306756
NRMSE: 0.03334743131570207
R2 score: 0.9554889814230199


C1 Prediction Scores
Al2O3:
RMSE: 1.1291267444652688
NRMSE: 0.0719647383343065
R2 score: 0.8270796315259903



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -2.2734174438611467 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 1.8428091546276206
NRMSE: 0.03841533075580601
R2 score: 0.9455960397417151


C1 Prediction Scores
SiO2:
RMSE: 4.369149475977669
NRMSE: 0.08325361044164765
R2 score: 0.8734639068125353



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -0.031551562285284655 neg_root_mean_squar

In [24]:
LGBM('TEST_2', 'C1_TEST_2')

Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5} : -0.47783777478635553 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.531085042242612
NRMSE: 0.04743901955787947
R2 score: 0.9234654829985812


C1 Prediction Scores
Al2O3:
RMSE: 1.1216977188073995
NRMSE: 0.07149125040200126
R2 score: 0.8293475855632925



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 5000, 'subsample': 0.5} : -2.062160445300769 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 1.9682231934442262
NRMSE: 0.041526341835370656
R2 score: 0.9385817377414043


C1 Prediction Scores
SiO2:
RMSE: 5.2518399046562765
NRMSE: 0.10007316891494429
R2 score: 0.8171716359992929



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5} : -0.02831916909420529 neg_root_mean_squ

In [25]:
LGBM('TEST_3', 'C1_TEST_3')

Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -0.46934863635872065 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.3625083366332196
NRMSE: 0.025181044883969775
R2 score: 0.9723133000072361


C1 Prediction Scores
Al2O3:
RMSE: 0.9230871000989683
NRMSE: 0.05883282983422361
R2 score: 0.884429730082088



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5} : -2.077486595033076 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 1.7847663998933014
NRMSE: 0.035593885424406474
R2 score: 0.9519663051172209


C1 Prediction Scores
SiO2:
RMSE: 4.8935008091239185
NRMSE: 0.0932450611494649
R2 score: 0.8412696607137587



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -0.036258533143212264 neg_root_mean_

In [26]:
LGBM('TEST_4', 'C1_TEST_4')

Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5} : -0.5051796261982145 neg_root_mean_squared_error

Wells Test Scores
Al2O3:
RMSE: 0.42552956949102544
NRMSE: 0.029558711086005735
R2 score: 0.9602798142579998


C1 Prediction Scores
Al2O3:
RMSE: 1.08900383088063
NRMSE: 0.0694075099350306
R2 score: 0.8391505504964991



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -1.993316326830482 neg_root_mean_squared_error

Wells Test Scores
SiO2:
RMSE: 2.0638253251579863
NRMSE: 0.041159202775250266
R2 score: 0.9315577400938343


C1 Prediction Scores
SiO2:
RMSE: 5.6231183720701345
NRMSE: 0.10714783483365348
R2 score: 0.7904078235567822



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.5} : -0.03035106623736551 neg_root_mean_sq