In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder,StandardScaler

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
from sklearn.compose import make_column_transformer
import multiprocessing

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [3]:
TEST_3=pd.read_excel("TEST-3.xlsx")
Well_C1=pd.read_excel("well_C1.xlsx")

In [4]:
Well_C1=Well_C1.drop(['WELL','LOI', 'Cr2O3'], axis=1)

In [5]:
V_ppm=Well_C1['V2O5'].apply(lambda x: x*0.56016*10000).to_frame().rename(columns={'V2O5':'V'})
Well_C1=Well_C1.drop(['V2O5'], axis=1)
Well_C1=pd.concat([Well_C1,V_ppm], axis=1)

### Categorical features to numerical

In [6]:
def ordinal_encoder(test):
    well_encoder = OrdinalEncoder(categories=[test['WELL'].unique()])

    well_encoder.fit(test[["WELL"]])
    test["Well"] = well_encoder.transform(test[["WELL"]])
    test.drop('WELL', axis=1, inplace= True)
    
    Well_C1["Well"] = test['Well'].nunique() 

In [7]:
ordinal_encoder(TEST_3)

In [8]:
C1_depth=Well_C1.loc[:,'Depth']

In [9]:
X_values=TEST_3.drop(['SO3', 'Ba', 'Cu', 'Ga', 'Mo', 'Nb', 'Ni','Pb', 'Rb', 'Sr', 'Th', 'U', 'Y', 'Zn', 'Zr','Cr'], axis=1)
y_values=TEST_3.drop(['Al2O3', 'SiO2', 'TiO2', 'Fe2O3', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O','P2O5','V','Depth','Well' ],axis=1)

## Random Forest

In [10]:
wells_scores = pd.DataFrame()
pred = pd.DataFrame(C1_depth)
        
for col in y_values:
    X = X_values
    y = y_values[col]
        
                
    X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y,
                                        train_size   = 0.7,
                                        random_state = 123,
                                        shuffle      = True
                                    )        
    
    # Evaluated Hyperparameters
    
    param_grid = {'n_estimators': [30,70,150],
                    'max_features': [5, 10, 25],
                    'max_depth'   : [None, 3, 10, 20]
                    }

    # Grid
    
    grid = GridSearchCV(
            estimator  = RandomForestRegressor(random_state = 123),
            param_grid = param_grid,
            scoring    = 'neg_root_mean_squared_error',
            n_jobs     = - 1,
            cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
            refit      = True,
            verbose    = 0,
            return_train_score = True
            )

    grid.fit(X = X_train, y = y_train)

    print("Final model best hyperparameters: ")        
    print("")       
    print(grid.best_params_, ":", grid.best_score_, grid.scoring)
    print("")
    
    # Test scores
        
    final_model = grid.best_estimator_
    predictions = final_model.predict(X = X_test)
        
    rmse = mean_squared_error(
            y_true  = y_test,
            y_pred  = predictions,
            squared = False
            )
                
    nrmse = rmse/(y_test.max()-y_test.min())
        
    R2 = r2_score(y_test, predictions)        
        
    print("Wells Test Scores")
    print(f"{col}:")
    print(f"RMSE: {rmse}")
    print(f"NRMSE: {nrmse}")        
    print(f"R2 score: {R2}")
    print("")
    print("")
        
                  
    # Save well test validation
                
    wells_scores = wells_scores.append({"y_hat": col, f"rmse_RF": rmse, f"nrmse_RF": nrmse, f"R2_RF": R2}, ignore_index=True)
                
    wells_scores.to_excel(f"Wells_TE_scores_RF.xlsx")         
    
        
    # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
    X_test_C1 = Well_C1
    y_test_C1 = y_values[col]
    
    C1_predictions = final_model.predict(X_test_C1) 
    
                   
    # Save C1 predicted curves
        
    C1_pred = pd.DataFrame()

    C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
    pred=pd.concat([pred,C1_pred],axis=1)
        
    pred.columns=pred.columns.str.replace('Predictions', f"{col}")
        
    pred.to_excel(f"C1_TE_pred_RF.xlsx")       
              
    print("")
                
print(f"Well test validation")
print(wells_scores)

Final model best hyperparameters: 

{'max_depth': None, 'max_features': 10, 'n_estimators': 150} : -0.2558826777346141 neg_root_mean_squared_error

Wells Test Scores
SO3:
RMSE: 0.194669791535667
NRMSE: 0.048762656971653905
R2 score: 0.9084749118331812



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 5, 'n_estimators': 150} : -1011.7626333278026 neg_root_mean_squared_error

Wells Test Scores
Ba:
RMSE: 350.591545634206
NRMSE: 0.0646196224202486
R2 score: 0.6211485674088422



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 10, 'n_estimators': 30} : -18.619618804735758 neg_root_mean_squared_error

Wells Test Scores
Cr:
RMSE: 15.885782278594032
NRMSE: 0.0967263371816608
R2 score: 0.6990105118090758



Final model best hyperparameters: 

{'max_depth': None, 'max_features': 10, 'n_estimators': 150} : -4.95827883332648 neg_root_mean_squared_error

Wells Test Scores
Cu:
RMSE: 4.841711915997591
NRMSE: 0.11594690254630408
R2 score: 0.762476359188

# XGBoost

In [11]:
cat_col= ['Well']
categorical = ['c' if col in cat_col else 'q' for col in X_values.columns]

In [12]:
wells_scores = pd.DataFrame()
pred = pd.DataFrame(C1_depth)
        
for col in y_values:
    X = X_values
    y = y_values[col]
        
                
    X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y,
                                        train_size   = 0.7,
                                        random_state = 123,
                                        shuffle      = True
                                    )        
    
    # Evaluated Hyperparameters    
    
    param_grid = {'max_depth'        : [None, 1, 3, 5, 10, 20],
                  'subsample'        : [0.5, 1],
                  'learning_rate'    : [0.001, 0.01, 0.1],
                  'booster'          : ['gbtree']
                 }


    # Creation of validation set
    
    np.random.seed(123)
    idx_validacion = np.random.choice(
                        X_train.shape[0],
                        size=int(X_train.shape[0]*0.1), 
                        replace=False
                        )

    X_val = X_train.iloc[idx_validacion, :].copy()
    y_val = y_train.iloc[idx_validacion].copy()

    X_train_grid = X_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()
    y_train_grid = y_train.reset_index(drop = True).drop(idx_validacion, axis = 0).copy()

        
    fit_params = {"eval_set": [(X_val, y_val)],
                   "verbose": False
                    }

    # Grid
    
    grid = GridSearchCV(
            estimator  = XGBRegressor(
                            n_estimators          = 1000,
                            early_stopping_rounds = 5,
                            eval_metric           = "rmse",
                            tree_method           ='hist',
                            enable_categorical    =True,
                            random_state          = 123,
                            feature_types         = categorical,
                        ),
            param_grid = param_grid,
            scoring    = 'neg_root_mean_squared_error',
            n_jobs     = multiprocessing.cpu_count() - 1,
            cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
            refit      = True,
            verbose    = 0,
            return_train_score = True
            )

    grid.fit(X = X_train_grid, y = y_train_grid, **fit_params)

    print("Final model best hyperparameters: ")        
    print("")       
    print(grid.best_params_, ":", grid.best_score_, grid.scoring)
    print("")
    n_trees_incluid = len(grid.best_estimator_.get_booster().get_dump())
    print(f"Final trees early stopping: {n_trees_incluid}")
    print("")
    
    # Test scores       
    
    final_model = grid.best_estimator_
    predictions = final_model.predict(X_test)
        
    rmse = mean_squared_error(
            y_true  = y_test,
            y_pred  = predictions,
            squared = False
               )
                
    nrmse = rmse/(y_test.max()-y_test.min())
        
    R2 = r2_score(y_test, predictions)        
        
    print("Wells Test Scores")
    print(f"{col}:")
    print(f"RMSE: {rmse}")
    print(f"NRMSE: {nrmse}")        
    print(f"R2 score: {R2}")
    print("")
    print("")
    
    # Save well test validation
                
    wells_scores = wells_scores.append({"y_hat":col,f"rmse_XGB": rmse, f"nrmse_XGB": nrmse, f"R2_XGB": R2}, ignore_index=True)
                
    wells_scores.to_excel(f"Wells_TE_scores_XGB.xlsx") 
        
    
    # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
    X_test_C1 = Well_C1
    y_test_C1 = y_values[col]
    
    C1_predictions = final_model.predict(X_test_C1) 
    
       
    # Save C1 predicted curves
        
    C1_pred = pd.DataFrame()

    C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
    pred = pd.concat([pred,C1_pred],axis=1)
        
    pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
    pred.to_excel(f"C1_TE_pred_XGB.xlsx")  
    
    print("")
                
print(f"Well test validation")
print(wells_scores)    

Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.5} : -0.18552138002451043 neg_root_mean_squared_error

Final trees early stopping: 66

Wells Test Scores
SO3:
RMSE: 0.16058860209109163
NRMSE: 0.04022569118481125
R2 score: 0.9377165783128505



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 10, 'subsample': 0.5} : -1039.3949601650413 neg_root_mean_squared_error

Final trees early stopping: 124

Wells Test Scores
Ba:
RMSE: 407.11820020636173
NRMSE: 0.07503838784861876
R2 score: 0.4891338919817757



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 10, 'subsample': 1} : -17.91712964907936 neg_root_mean_squared_error

Final trees early stopping: 29

Wells Test Scores
Cr:
RMSE: 18.018196664856593
NRMSE: 0.10971031425747602
R2 score: 0.6127809081746913



Final model best hyperparameters: 

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_d

## Gradient Boosting: HistGradientBoostingregressor

In [13]:
X_values['Well']=X_values['Well'].astype('category')

In [14]:
Well_C1_=Well_C1.copy()

cols = list(Well_C1_.columns)
cols = [cols[-1]] + cols[:-1]
Well_C1_ = Well_C1_[cols]

In [15]:
wells_scores = pd.DataFrame()
pred = pd.DataFrame(C1_depth)
        
for col in y_values:
    X = X_values
    y = y_values[col]
        
                
    X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y,
                                        train_size   = 0.7,
                                        random_state = 123,
                                        shuffle      = True
                                    )
    
    cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
    
    preprocessor = make_column_transformer(
                        (
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                                unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            cat_cols
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")
        
    X_train_prep = preprocessor.fit_transform(X_train)
    X_test_prep = preprocessor.transform(X_test)


    # Evaluated Hyperparameters
    
    param_grid = {'loss'             : ['squared_error', 'absolute_error'],
                    'learning_rate'    : [0.001, 0.01, 0.1],
                    'max_depth'        : [3, 5, 10, 20],
                    'l2_regularization': [0, 1, 10]
                    }

    # Grid
        
    grid = GridSearchCV(
            estimator  = HistGradientBoostingRegressor(
                            max_iter            = 1000, 
                            random_state        = 123,
                            early_stopping      = True,
                            validation_fraction = 0.1,
                            n_iter_no_change    = 10,
                            tol                 = 1e-7,
                            scoring             = 'loss',
                            categorical_features = cat_cols
                        ),
            param_grid = param_grid,
            scoring    = 'neg_root_mean_squared_error',
            n_jobs     = multiprocessing.cpu_count() - 1,
            cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
            refit      = True,
            verbose    = 0,
            return_train_score = True
            )

    grid.fit(X = X_train_prep, y = y_train)
        
        
    print("Final model best hyperparameters: ")        
    print("")       
    print(grid.best_params_, ":", grid.best_score_, grid.scoring)
    print("")        
    print(f"Final trees early stopping: {grid.best_estimator_.n_iter_}")
    print("")    
        
        
    # Test scores
    
    final_model = grid.best_estimator_
    predictions = final_model.predict(X = X_test_prep)
    
    rmse = mean_squared_error(
            y_true  = y_test,
            y_pred  = predictions,
            squared = False
           )
        
    nrmse = rmse/(y_test.max()-y_test.min())
        
    R2 = r2_score(y_test, predictions)        
        
    print("Wells Test Scores")
    print(f"{col}:")
    print(f"RMSE: {rmse}")
    print(f"NRMSE: {nrmse}")        
    print(f"R2 score: {R2}")
    print("")
    print("")
        
                  
    # Save well test validation
        
    wells_scores = wells_scores.append({"y_hat":col,f"rmse_HGBT": rmse, f"nrmse_HGBT": nrmse,f"R2_HGBT": R2},ignore_index=True)
                
    wells_scores.to_excel(f"Wells_TE_scores_HGBT.xlsx")  
    
           
    # WELL C1 MAYOR ELEMENTS PREDICTIONS 
                
    X_test_C1 = Well_C1_
    y_test_C1 = y_values[col]
        
    C1_predictions = final_model.predict(X_test_C1) 
    
    # Save C1 predicted curves
        
    C1_pred = pd.DataFrame()

    C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
    pred = pd.concat([pred,C1_pred],axis=1)
        
    pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
    pred.to_excel(f"C1_TE_pred_HGBT.xlsx")        
    
    print("")
        
print(f"Well test validation")
print(wells_scores)
    

Final model best hyperparameters: 

{'l2_regularization': 1, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -0.1972342513448043 neg_root_mean_squared_error

Final trees early stopping: 48

Wells Test Scores
SO3:
RMSE: 0.16927289608764778
NRMSE: 0.042401011997837725
R2 score: 0.9307981224763262



Final model best hyperparameters: 

{'l2_regularization': 10, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 3} : -1006.8535963665657 neg_root_mean_squared_error

Final trees early stopping: 15

Wells Test Scores
Ba:
RMSE: 392.65962454587753
NRMSE: 0.07237344138442219
R2 score: 0.5247757984397228



Final model best hyperparameters: 

{'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5} : -19.69292385508482 neg_root_mean_squared_error

Final trees early stopping: 48

Wells Test Scores
Cr:
RMSE: 16.557666549829886
NRMSE: 0.10081734783677884
R2 score: 0.6730115739820546



Final model best hyperparameters: 

{'l2_regularization': 

# LightGBM

In [16]:
Well_C1['Well']=Well_C1['Well'].astype('category')

In [17]:
wells_scores = pd.DataFrame()
pred = pd.DataFrame(C1_depth)
    
for col in y_values:
    X = X_values
    y = y_values[col]
        
                
    X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y,
                                        train_size   = 0.7,
                                        random_state = 123,
                                        shuffle      = True
                                    )        
    
    # Evaluated Hyperparameters
    
    param_grid = {'n_estimators'     : [100, 500, 1000, 5000],
                  'max_depth'        : [-1, 1, 3, 5, 10, 20],
                  'subsample'        : [0.5, 1],
                  'learning_rate'    : [0.001, 0.01, 0.1],
                  'boosting_type'    : ['gbdt']
                     }

    # Grid
        
    grid = GridSearchCV(
            estimator  = LGBMRegressor(random_state=123),
            param_grid = param_grid,
            scoring    = 'neg_root_mean_squared_error',
            n_jobs     = multiprocessing.cpu_count() - 1,
            cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
            refit      = True,
            verbose    = 0,
            return_train_score = True
            )

    grid.fit(X = X_train, y = y_train, categorical_feature='auto')

    print("Final model best hyperparameters: ")        
    print("")       
    print(grid.best_params_, ":", grid.best_score_, grid.scoring)
    print("")
    
        
    # Test scores
        
    final_model = grid.best_estimator_
    predictions = final_model.predict(X = X_test)
        
    rmse = mean_squared_error(
            y_true  = y_test,
            y_pred  = predictions,
            squared = False
            )
        
    nrmse = rmse/(y_test.max()-y_test.min())
        
    R2 = r2_score(y_test, predictions)        
        
    print("Wells Test Scores")
    print(f"{col}:")
    print(f"RMSE: {rmse}")
    print(f"NRMSE: {nrmse}")        
    print(f"R2 score: {R2}")
    print("")
    print("")
        
                  
    # Save well test validation
                
    wells_scores = wells_scores.append({"y_hat":col,f"rmse_LGBM": rmse, f"nrmse_LGBM": nrmse, f"R2_LGBM": R2}, ignore_index=True)
                
    wells_scores.to_excel(f"Wells_TE_scores_LGBM.xlsx")         
    
        
    # WELL C1 MAYOR ELEMENTS PREDICTIONS
    
    X_test_C1 = Well_C1
    y_test_C1 = y_values[col]
    
    C1_predictions = final_model.predict(X_test_C1) 
    
    # Save C1 predicted curves
        
    C1_pred = pd.DataFrame()

    C1_pred = C1_pred.assign(Predictions = C1_predictions.flatten().tolist())        
                
    pred = pd.concat([pred,C1_pred],axis=1)
        
    pred.columns = pred.columns.str.replace('Predictions', f"{col}")
        
    pred.to_excel(f"C1_TE_pred_LGBM.xlsx") 
        
        
    print("")
        
print(f"Well test validation")
print(wells_scores)
    

Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5000, 'subsample': 0.5} : -0.25951006732549947 neg_root_mean_squared_error

Wells Test Scores
SO3:
RMSE: 0.1562008134419268
NRMSE: 0.03912659804316097
R2 score: 0.9410736405383229



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500, 'subsample': 0.5} : -1024.8007921715557 neg_root_mean_squared_error

Wells Test Scores
Ba:
RMSE: 486.3248635728565
NRMSE: 0.0896374412018644
R2 score: 0.27101431035645185



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.001, 'max_depth': -1, 'n_estimators': 5000, 'subsample': 0.5} : -20.436242241032982 neg_root_mean_squared_error

Wells Test Scores
Cr:
RMSE: 14.707865377837495
NRMSE: 0.08955416364204978
R2 score: 0.7419918596306936



Final model best hyperparameters: 

{'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': 3, 'n_