In [2]:
import sys
sys.path.insert(0,'/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/github/Omphalos')
sys.path.insert(1,'/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/github')
import numpy as np
import pandas as pd
import xarray as xr

import wandb
import xgboost as xgb
import re

from importlib import reload

# Import Omphalos modules.
from omphalos import file_methods as fm
from omphalos import attributes as attr
from omphalos import labels as lbls
from analysis import helper as hp

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so stochastic steps are reproducible.
# The seeding function must be *called*: assigning to `np.random.seed`
# replaces the function with an integer and leaves the generator unseeded.
random_state = 69
np.random.seed(random_state)

  from pandas import MultiIndex, Int64Index


# Training Data Import & Cleaning

In [3]:
# Load every training run from the pickled batch file.
# NOTE(review): presumably pickle-based - only load files from a trusted source.
train_set_pkl = '/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/Cambridge/AI4ER/Easter/MRes/CrunchFlow_Work/bfm/2022-06-16_bfm_5-3_1D_FB_100oc-100bar/completed_run.pkl'
TrainSet_dict = fm.unpickle(train_set_pkl)

In [4]:
# Separate the runs that errored from the ones that did not; the first
# returned item is used as the working dataset from here on.
filtered_runs = hp.filter_errors(TrainSet_dict)
dataset_dict, error_dict = filtered_runs

Returned 14556 files without errors out of a total possible 16000.
1444 files had errors.
0 files had unhandled errors.
File failure rate: 9.920307776861776 %.
To see unhandled errors, run with verbose=True.


In [5]:
# Predictor columns: the starting species concentrations of interest.
attribute_names = ["HCO3-","Al+++","Ca++","Fe++","K+","Na+","Mg++","SiO2(aq)"]

# Pull the "f_i_onehundred" condition for every run and keep only those columns.
attributes_all_df = attr.get_condition(
    dataset_dict, "f_i_onehundred", species_concs=True
).loc[:, attribute_names]
attributes_all_df

  species_attrs = species_attrs.append(primary_species(data_set[i], condition), ignore_index=True)


Unnamed: 0,HCO3-,Al+++,Ca++,Fe++,K+,Na+,Mg++,SiO2(aq)
0,6.959850,0.008639,0.009236,0.008412,0.011782,0.000901,0.018781,0.018063
1,0.362034,0.019889,0.006757,0.014766,0.015169,0.019101,0.012283,0.019721
2,1.727252,0.016818,0.006918,0.013390,0.000664,0.007003,0.009433,0.008969
3,3.691655,0.012079,0.000125,0.009366,0.015292,0.017273,0.018103,0.000800
4,0.271405,0.014512,0.018235,0.012059,0.019059,0.012268,0.017185,0.014145
...,...,...,...,...,...,...,...,...
14551,3.076317,0.017831,0.011572,0.018447,0.004185,0.000735,0.009735,0.004838
14552,3.847435,0.004883,0.013111,0.017648,0.009385,0.004792,0.019129,0.015238
14553,1.981378,0.018599,0.003749,0.019024,0.019137,0.015390,0.016777,0.015538
14554,7.734595,0.008684,0.002643,0.011963,0.010950,0.006106,0.012260,0.008428


In [6]:
# Build the training target: the spatially summed end-of-run volume of the
# three carbonates (Calcite + Siderite + Magnesite) for every run.

# Every mineral variable that is NOT one of the three carbonates; dropped below.
NonCalSidMag_arr = ["Diopside","Diopside_a","Hedenbergite","Hedenbergite_a",
                    "Albite","Albite_a","Anorthite","Anorthite_a","M_Microcline",
                    "M_Microcline_a","M_Microcline_b","Forsterite","Forsterite_a",
                    "Fayalite","Fayalite_a","Antigorite","Antigorite_a","Greenalite",
                    "Greenalite_a","Calcite_a","Siderite_a","Magnesite_a"]
CalSidMag_arr = ["Calcite","Siderite","Magnesite"]

Vols_ds = lbls.raw(dataset_dict, 'volume')
# Number of grid blocks along X; used later to average the summed volumes.
NrXDiscretisedBlocks = len(Vols_ds.X.values)
# Keep only the snapshot at time == 280.0.
EndVols_ds = Vols_ds.sel(time=280.0)
# `drop_vars` replaces the deprecated `Dataset.drop(labels=...)` for variables.
CalSidMagEndVols_ds = EndVols_ds.drop_vars(NonCalSidMag_arr)

# Repair malformed numbers by going through their string form.
# The patterns match "<digits>.<digits>-<digits>" and "<digits>.<digits>+<digits>",
# i.e. values written with a signed exponent but no exponent letter
# (presumably fixed-width solver output - TODO confirm). The negative-exponent
# form is mapped to 0 and the positive-exponent form to a large sentinel (5000).
# Raw strings keep the regex escapes (\d, \., \+) from being treated as
# invalid Python string escapes.
CalSidMagEndVols_da = CalSidMagEndVols_ds.to_array(dim='arbitrary_array')
CalSidMagEndVols_da = CalSidMagEndVols_da.astype(str)
CalSidMagEndVols_da = CalSidMagEndVols_da.str.replace(r"^\d+\.\d+-\d+$", "0", regex=True)
CalSidMagEndVols_da = CalSidMagEndVols_da.str.replace(r"^\d+\.\d+\+\d+$", "5000.0000", regex=True)
CalSidMagEndVols_da = CalSidMagEndVols_da.astype(float)

# Back to a dataset with one variable per carbonate, then sum over space.
CalSidMagEndVols_ds = CalSidMagEndVols_da.to_dataset(dim="arbitrary_array")
SpatialSummedCalSidMagEndVols_ds = CalSidMagEndVols_ds.sum(dim=["X","Y","Z"])

# Add the combined "Carbonates" variable and drop the three individual minerals.
SpatialSummedCarbEndVols_ds = SpatialSummedCalSidMagEndVols_ds.assign(
    Carbonates=lambda ds: ds.Calcite + ds.Siderite + ds.Magnesite
)
SpatialSummedCarbEndVols_ds = SpatialSummedCarbEndVols_ds.drop_vars(CalSidMag_arr)
SpatialSummedCarbEndVols_arr = np.array(SpatialSummedCarbEndVols_ds.Carbonates.values)

# Attach the target to the attribute table (row order is assumed to match the
# run order of the dataset - TODO confirm).
attributes_all_df["Carbonates_sum"] = SpatialSummedCarbEndVols_arr.tolist()
attributes_all_df

Unnamed: 0,HCO3-,Al+++,Ca++,Fe++,K+,Na+,Mg++,SiO2(aq),Carbonates_sum
0,6.959850,0.008639,0.009236,0.008412,0.011782,0.000901,0.018781,0.018063,7.639410e+00
1,0.362034,0.019889,0.006757,0.014766,0.015169,0.019101,0.012283,0.019721,2.509608e-08
2,1.727252,0.016818,0.006918,0.013390,0.000664,0.007003,0.009433,0.008969,2.813153e+00
3,3.691655,0.012079,0.000125,0.009366,0.015292,0.017273,0.018103,0.000800,9.216605e+00
4,0.271405,0.014512,0.018235,0.012059,0.019059,0.012268,0.017185,0.014145,3.416881e-01
...,...,...,...,...,...,...,...,...,...
14551,3.076317,0.017831,0.011572,0.018447,0.004185,0.000735,0.009735,0.004838,5.930189e+00
14552,3.847435,0.004883,0.013111,0.017648,0.009385,0.004792,0.019129,0.015238,5.377640e+00
14553,1.981378,0.018599,0.003749,0.019024,0.019137,0.015390,0.016777,0.015538,2.680194e+00
14554,7.734595,0.008684,0.002643,0.011963,0.010950,0.006106,0.012260,0.008428,9.129131e+00


In [7]:
# Clean the target column and convert it to a percentage.
# Work on a copy: a plain assignment would only alias `attributes_all_df`, so
# the masking below would silently rewrite the frame displayed in the
# previous cell and make this cell non-idempotent.
attributes_cleaned_df = attributes_all_df.copy()

# Discard implausible targets: sums above 3 * (number of X blocks)
# (presumably three carbonates with a volume fraction of at most 1 per block -
# TODO confirm) and any infinite value, positive or negative.
carbonates_sum = attributes_cleaned_df["Carbonates_sum"]
invalid_rows = (carbonates_sum > (3 * NrXDiscretisedBlocks)) | np.isinf(carbonates_sum)
attributes_cleaned_df.loc[invalid_rows, "Carbonates_sum"] = np.nan

# Drop every row containing a NaN (in any column) and renumber the index.
attributes_cleaned_df = attributes_cleaned_df.dropna().reset_index(drop=True)

# Average over the X blocks and express the result as a percentage.
attributes_cleaned_df["Carbonates_pct"] = (
    attributes_cleaned_df["Carbonates_sum"] / NrXDiscretisedBlocks
) * 100
attributes_cleaned_df = attributes_cleaned_df.drop(columns=["Carbonates_sum"])
attributes_cleaned_df

Unnamed: 0,HCO3-,Al+++,Ca++,Fe++,K+,Na+,Mg++,SiO2(aq),Carbonates_pct
0,6.959850,0.008639,0.009236,0.008412,0.011782,0.000901,0.018781,0.018063,5.025928e+00
1,0.362034,0.019889,0.006757,0.014766,0.015169,0.019101,0.012283,0.019721,1.651058e-08
2,1.727252,0.016818,0.006918,0.013390,0.000664,0.007003,0.009433,0.008969,1.850759e+00
3,3.691655,0.012079,0.000125,0.009366,0.015292,0.017273,0.018103,0.000800,6.063556e+00
4,0.271405,0.014512,0.018235,0.012059,0.019059,0.012268,0.017185,0.014145,2.247948e-01
...,...,...,...,...,...,...,...,...,...
14548,3.076317,0.017831,0.011572,0.018447,0.004185,0.000735,0.009735,0.004838,3.901440e+00
14549,3.847435,0.004883,0.013111,0.017648,0.009385,0.004792,0.019129,0.015238,3.537921e+00
14550,1.981378,0.018599,0.003749,0.019024,0.019137,0.015390,0.016777,0.015538,1.763285e+00
14551,7.734595,0.008684,0.002643,0.011963,0.010950,0.006106,0.012260,0.008428,6.006007e+00


# XGBoost Model Training

In [8]:
# Split the cleaned table into predictors (x) and target (y), scale both by
# their column maxima, and hold out 10% of the rows for testing.

# Predictor matrix (rows = runs, columns = attribute_names) and target column vector.
x = attributes_cleaned_df.loc[:, attribute_names].astype(float).to_numpy()
y = attributes_cleaned_df["Carbonates_pct"].values.astype(float).reshape(-1, 1)

# Normalisation factor for each predictor: the largest value in its column.
# Kept as a plain list so it can be reused to scale new inputs later.
# NOTE(review): the factors are computed before the train/test split, so the
# held-out rows influence the scaling - confirm this is acceptable.
x_norm = x.max(axis=0).tolist()
for name, max_val in zip(attribute_names, x_norm):
    print(f"Normalisation Factor for {name} = {max_val}")

# Divide every column by its own factor (broadcast across rows).
x = x / np.asarray(x_norm)

# Scale the target by its maximum. `y.max()` gives a scalar, whereas the
# builtin `max(y)` on a column vector returns a one-element array.
y_norm = float(y.max())
print(f"Normalisation Factor for Carbonate Volume Generated = {y_norm}")
y = y / y_norm

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=random_state)

Normalisation Factor for HCO3- = 8.199201833431928
Normalisation Factor for Al+++ = 0.0199986901647288
Normalisation Factor for Ca++ = 0.0199995119269535
Normalisation Factor for Fe++ = 0.0199992989909211
Normalisation Factor for K+ = 0.0199912702644351
Normalisation Factor for Na+ = 0.0199993159903501
Normalisation Factor for Mg++ = 0.0199986292552286
Normalisation Factor for SiO2(aq) = 0.0199996345822374
Normalisation Factor for Carbonate Volume Generated = [185.97116485]


In [9]:
%%time

xgb_defaults = {
    'tree_method': 'hist',
    'gamma': 0,
    'mds': 0,
    'eta': 0.01,
    'l1' : 0,
    'l2' : 1,
    'max_depth' : 0,
    'max_leaves': 6,
    'objective': 'reg:squarederror',
    'max_bin': 10000,
    'grow_policy': 'lossguide'
}

# Initialise the wandb instance
wandb.init(config=xgb_defaults, project='bcm_4-1')

# Setup xgb matrices; one for training and one testing
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

evallist = [(dtest, 'eval'), (dtrain, 'train')]

params = {'max_depth': wandb.config.max_depth,
            'tree_method': wandb.config.tree_method,
            'max_delta_step': wandb.config.mds,
            'eta': wandb.config.eta,
            'objective': wandb.config.objective,
            'alpha': wandb.config.l1,
            'lambda': wandb.config.l2,
            'gamma': wandb.config.gamma,
            'max_leaves': wandb.config.max_leaves,
            'max_bin': wandb.config.max_bin,
            'grow_policy': wandb.config.grow_policy
        }

results = {}

num_round = 1000
bst = xgb.train(params, dtrain, num_round, evallist, evals_result=results, verbose_eval=False, callbacks=[wandb.xgboost.WandbCallback()])
wandb.log({"val_loss" : results['rmse'][-1]})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtjhd97[0m. Use [1m`wandb login --relogin`[0m to force relogin


KeyError: 'rmse'

In [10]:
# Persist the trained booster to a JSON file in the working directory.
model_json_path = "bfm_XGBModel_5-1.json"
bst.save_model(model_json_path)

# Comparison Data Import and Cleaning

In [None]:
# Load the three control experiments (Ca++, Fe++ and Mg++ varied in turn)
# from their pickle files, in that order.
ctrl_pkl_paths = (
    '/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/cambridge/ai4er/easter/mres/CrunchFlow_Work/bcm/2022-05-31_bcm_4-1/control_ca/basalt_cl-_ctrl-ca++_2.pkl',
    '/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/cambridge/ai4er/easter/mres/CrunchFlow_Work/bcm/2022-05-31_bcm_4-1/control_fe/basalt_cl-_ctrl-fe++_2.pkl',
    '/Users/thomasdodd/Library/CloudStorage/OneDrive-MillfieldEnterprisesLimited/cambridge/ai4er/easter/mres/CrunchFlow_Work/bcm/2022-05-31_bcm_4-1/control_mg/basalt_cl-_ctrl-mg++_2.pkl',
)
CtrlSet_ca_dict, CtrlSet_fe_dict, CtrlSet_mg_dict = map(fm.unpickle, ctrl_pkl_paths)

In [None]:
# Filter the errored runs out of each control experiment (Ca, Fe, Mg order).
ctrl_filtered = [
    hp.filter_errors(ctrl_set)
    for ctrl_set in (CtrlSet_ca_dict, CtrlSet_fe_dict, CtrlSet_mg_dict)
]
dataset_ca_dict, dataset_fe_dict, dataset_mg_dict = (kept for kept, _errs in ctrl_filtered)
# `error_dict` ends up holding only the errors of the last (Mg++) experiment.
error_dict = ctrl_filtered[-1][1]

In [None]:
# For each control experiment, read the "basalt" condition and keep only the
# three cation concentration columns.
ctrl_cation_cols = ['Ca++', 'Fe++', 'Mg++']
attributes_ca_df = attr.get_condition(dataset_ca_dict, "basalt", species_concs=True).loc[:, ctrl_cation_cols]
attributes_fe_df = attr.get_condition(dataset_fe_dict, "basalt", species_concs=True).loc[:, ctrl_cation_cols]
attributes_mg_df = attr.get_condition(dataset_mg_dict, "basalt", species_concs=True).loc[:, ctrl_cation_cols]

In [None]:
# Compute, for each control experiment, the change in mineral volume fraction
# (final minus initial) at one grid point and time, plus a combined
# "Carbonates" variable (Calcite + Siderite + Magnesite).
# NOTE(review): NrFormatFixer_XYZTime / NrFormatFixer_XYZ are not defined in the
# cells shown here - confirm they are available before this cell runs.

# Final mineralogical volumes xarray datasets
FinalVols_ca_ds = lbls.raw(dataset_ca_dict, 'volume')
FinalVols_ca_ds = NrFormatFixer_XYZTime(FinalVols_ca_ds)
FinalVols_ca_ds = FinalVols_ca_ds.astype(float)
FinalVols_fe_ds = lbls.raw(dataset_fe_dict, 'volume')
FinalVols_fe_ds = NrFormatFixer_XYZTime(FinalVols_fe_ds)
FinalVols_fe_ds = FinalVols_fe_ds.astype(float)
FinalVols_mg_ds = lbls.raw(dataset_mg_dict, 'volume')
FinalVols_mg_ds = NrFormatFixer_XYZTime(FinalVols_mg_ds)
FinalVols_mg_ds = FinalVols_mg_ds.astype(float)

# Start mineralogical volumes xarray datasets
InitialVols_ca_ds = attr.initial_conditions(dataset_ca_dict, concentrations=False, minerals=True)
InitialVols_ca_ds = NrFormatFixer_XYZ(InitialVols_ca_ds)
InitialVols_ca_ds = InitialVols_ca_ds.astype(float)
InitialVols_fe_ds = attr.initial_conditions(dataset_fe_dict, concentrations=False, minerals=True)
InitialVols_fe_ds = NrFormatFixer_XYZ(InitialVols_fe_ds)
InitialVols_fe_ds = InitialVols_fe_ds.astype(float)
# The Mg++ initial state must come from the Mg++ runs, not the Fe++ runs.
InitialVols_mg_ds = attr.initial_conditions(dataset_mg_dict, concentrations=False, minerals=True)
InitialVols_mg_ds = NrFormatFixer_XYZ(InitialVols_mg_ds)
InitialVols_mg_ds = InitialVols_mg_ds.astype(float)

# Generation of an xarray dataset representing change in mineralogical volume fractions
DeltVolFrac_ca_ds = FinalVols_ca_ds - InitialVols_ca_ds
DeltVolFrac_fe_ds = FinalVols_fe_ds - InitialVols_fe_ds
DeltVolFrac_mg_ds = FinalVols_mg_ds - InitialVols_mg_ds

# Narrow each dataset down to the single point X=Y=Z=0.5 at time=10
DeltVolFrac_ca_TenYr_ds = DeltVolFrac_ca_ds.sel(X=0.5,Y=0.5,Z=0.5,time=10)
DeltVolFrac_fe_TenYr_ds = DeltVolFrac_fe_ds.sel(X=0.5,Y=0.5,Z=0.5,time=10)
DeltVolFrac_mg_TenYr_ds = DeltVolFrac_mg_ds.sel(X=0.5,Y=0.5,Z=0.5,time=10)

# Add the combined Carbonates variable to each narrowed-down dataset
DeltVolFrac_ca_TenYr_ds = DeltVolFrac_ca_TenYr_ds.assign(Carbonates=lambda ds: ds.Calcite + ds.Siderite + ds.Magnesite)
DeltVolFrac_fe_TenYr_ds = DeltVolFrac_fe_TenYr_ds.assign(Carbonates=lambda ds: ds.Calcite + ds.Siderite + ds.Magnesite)
DeltVolFrac_mg_TenYr_ds = DeltVolFrac_mg_TenYr_ds.assign(Carbonates=lambda ds: ds.Calcite + ds.Siderite + ds.Magnesite)

In [None]:
# Control experiments: build (x, y) arrays from the RTM control runs and scale
# them with the shared lower/upper normalisers.
# NOTE(review): NormalizeData and the *_normalizer bounds are not defined in the
# cells shown here - confirm they exist before this cell runs.

# Control 1 - Ca++ varied, Fe++/Mg++ held steady
x_ca = attributes_ca_df.loc[:, ['Ca++', 'Fe++','Mg++']].astype(float).to_numpy()
x_ca = NormalizeData(x_ca, x_lower_normalizer, x_upper_normalizer)
y_ca = DeltVolFrac_ca_TenYr_ds.Carbonates.values.astype(float)
y_ca = NormalizeData(y_ca, y_lower_normalizer, y_upper_normalizer).reshape(-1,1)

# Control 2 - Fe++ varied, Ca++/Mg++ held steady
x_fe = attributes_fe_df.loc[:, ['Ca++', 'Fe++','Mg++']].astype(float).to_numpy()
x_fe = NormalizeData(x_fe, x_lower_normalizer, x_upper_normalizer)
y_fe = DeltVolFrac_fe_TenYr_ds.Carbonates.values.astype(float)
y_fe = NormalizeData(y_fe, y_lower_normalizer, y_upper_normalizer).reshape(-1,1)

# Control 3 - Mg++ varied, Ca++/Fe++ held steady
x_mg = attributes_mg_df.loc[:, ['Ca++', 'Fe++','Mg++']].astype(float).to_numpy()
x_mg = NormalizeData(x_mg, x_lower_normalizer, x_upper_normalizer)
y_mg = DeltVolFrac_mg_TenYr_ds.Carbonates.values.astype(float)
y_mg = NormalizeData(y_mg, y_lower_normalizer, y_upper_normalizer).reshape(-1,1)

# Comparison of RTM Ground Truth and XGB Predictions

In [None]:
# Prediction experiments: sweep one cation across a range while the other two
# stay at their base-file values, then ask the booster for predictions.
# NOTE(review): these matrices have 3 columns, while the model trained in this
# notebook uses the 8 columns in attribute_names - confirm `bst` is the
# intended 3-feature model. NormalizeData and the *_normalizer bounds are not
# defined in the cells shown here.

# Bounds of the swept variable
variable_range_lower, variable_range_upper = 0, 3.0E-4
# Values held constant (taken from the base .in file)
controlled_ca_val = 1.0922841574764136E-4
controlled_fe_val = 2.0458518197753574E-4
controlled_mg_val = 2.8166497794217927E-4

# Number of prediction samples per sweep
samples = 100
# Number of predictor columns (ca, fe, mg)
dims = 3

# Evenly spaced values for whichever cation is being varied
swept_values = np.linspace(variable_range_lower, variable_range_upper, samples)
held_ca = np.full(samples, controlled_ca_val)
held_fe = np.full(samples, controlled_fe_val)
held_mg = np.full(samples, controlled_mg_val)

# Prediction 1 - Ca++ varied, Fe++/Mg++ held steady
pred_ca_matrix = np.column_stack([swept_values, held_fe, held_mg])
pred_ca_matrix = NormalizeData(pred_ca_matrix, x_lower_normalizer, x_upper_normalizer)
xpred_ca = pred_ca_matrix[:,0]
ypred_ca = bst.predict(xgb.DMatrix(pred_ca_matrix))

# Prediction 2 - Fe++ varied, Ca++/Mg++ held steady
pred_fe_matrix = np.column_stack([held_ca, swept_values, held_mg])
pred_fe_matrix = NormalizeData(pred_fe_matrix, x_lower_normalizer, x_upper_normalizer)
xpred_fe = pred_fe_matrix[:,1]
ypred_fe = bst.predict(xgb.DMatrix(pred_fe_matrix))

# Prediction 3 - Mg++ varied, Ca++/Fe++ held steady
pred_mg_matrix = np.column_stack([held_ca, held_fe, swept_values])
pred_mg_matrix = NormalizeData(pred_mg_matrix, x_lower_normalizer, x_upper_normalizer)
xpred_mg = pred_mg_matrix[:,2]
ypred_mg = bst.predict(xgb.DMatrix(pred_mg_matrix))

In [None]:
# Three side-by-side panels (Ca++, Fe++, Mg++) comparing the RTM control runs
# (blue) with the XGBoost predictions (red).
fig, (ax1, ax2, ax3) = plt.subplots(1,3)
fig.suptitle('FB - Flow Basalt OD Model - Omphalos Test - Carbonates Volume Change over 10yrs - Cl- Charge Balance - RTM Ground Truth vs XGB Predictions')

fig.set_figheight(6)
fig.set_figwidth(21)

# Series for each panel: the varied cation's column on x, carbonates on y.
ax1_x_rtm, ax1_y_rtm, ax1_x_xgb, ax1_y_xgb = x_ca[:,0], y_ca, xpred_ca, ypred_ca
ax2_x_rtm, ax2_y_rtm, ax2_x_xgb, ax2_y_xgb = x_fe[:,1], y_fe, xpred_fe, ypred_fe
ax3_x_rtm, ax3_y_rtm, ax3_x_xgb, ax3_y_xgb = x_mg[:,2], y_mg, xpred_mg, ypred_mg

panels = [
    (ax1, "Ca++", '[Ca++] (normalised)', ax1_x_rtm, ax1_y_rtm, ax1_x_xgb, ax1_y_xgb),
    (ax2, "Fe++", '[Fe++] (normalised)', ax2_x_rtm, ax2_y_rtm, ax2_x_xgb, ax2_y_xgb),
    (ax3, "Mg++", '[Mg++] (normalised)', ax3_x_rtm, ax3_y_rtm, ax3_x_xgb, ax3_y_xgb),
]

for panel_ax, panel_title, panel_xlabel, rtm_x, rtm_y, xgb_x, xgb_y in panels:
    panel_ax.set_title(panel_title, loc='center')
    panel_ax.set_xlabel(panel_xlabel)
    # Zoomed y-range; switch to (0, 1) to view the full normalised range.
    panel_ax.set_ylim(0.61,0.65)
    panel_ax.scatter(x=rtm_x, y=rtm_y, c='blue', s=1, alpha=1, label="RTM")
    panel_ax.scatter(x=xgb_x, y=xgb_y, c='red', s=1, alpha=1, label="XGB")

# The y-label and legend are shown on the first panel only.
ax1.set_ylabel('Tot. Carbonates Percentage Vol. Frac. Increase (norm.)')
ax1.legend()

plt.tight_layout()