## This notebook calibrates the Cpx-Liq machine learning thermometer and barometer used by Thermobar.
- We follow Petrelli, where a different random state is used for pressure and temperature
- Note: this notebook will likely need rerunning fairly often, because the pickles stop loading once the installed scikit-learn version differs too much from the one they were saved with.
- With each new scikit-learn version, the predictions will likely change. The change from v0.23.2 resulted in differences of up to ~0.8 kbar per Cpx. This is sadly unavoidable as far as we know, although if you have a suggestion please let us know!

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingRegressor

%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy import stats
import warnings as w
import sys
sys.path.append("../../..")
import Thermobar as pt
pd.set_option('display.max_columns', None)

In [2]:
# Load the training data through Thermobar's Excel importer. The returned
# mapping is read here via its 'my_input', 'Cpxs' and 'Liqs' entries.
out_train=pt.import_excel('Jorgenson_Input_filtered.xlsx',
                          sheet_name='Jorgenson_Input_filtered_ThBar')
out_train_input=out_train['my_input']
Cpx_train=out_train['Cpxs']
Liq_train=out_train['Liqs']

# Build the feature tables by removing the sample-ID column from the Cpx data
# and the ID plus the listed columns from the liquid data.
# drop() is used WITHOUT inplace=True: previously Liq_train_noID_noT was just an
# alias of Liq_train, so the in-place drop also stripped these columns from
# Liq_train and out_train['Liqs']. Now the imported frames are left untouched.
# The liquids are used as imported; the alternative below is not applied:
# pt.normalize_liquid_jorgenson(liq_comps=Liq_train)
Cpx_train_noID_noT=Cpx_train.drop(columns=['Sample_ID_Cpx'])
Liq_train_noID_noT=Liq_train.drop(columns=['Sample_ID_Liq',  'Fe3Fet_Liq', 'NiO_Liq',
                                           'CoO_Liq', 'CO2_Liq', 'H2O_Liq'])

# Liquid columns come first, then Cpx columns; this is the column order of the
# feature matrix the regressors are fitted on.
Cpx_Liq_Combo_train=pd.concat([Liq_train_noID_noT, Cpx_train_noID_noT], axis=1)
x_train=Cpx_Liq_Combo_train.values

# Regression targets: temperature and pressure columns of the input sheet.
y_train_T=out_train_input['T_K']
y_train_P=out_train_input['P_kbar']

In [3]:
# Scaling is disabled: the regressors are fitted on the raw feature values.
# The name x_train_scaled is kept so a scaler can be re-enabled by restoring
# the two commented-out expressions below.
#scaler = StandardScaler().fit(x_train)
x_train_scaled =x_train# scaler.transform(x_train)

# Corin says max features = (number of features) * 2/3.
# NOTE(review): 21 is presumably the number of columns in x_train - confirm if
# the input columns ever change.
maxF=int(np.round(21*(2/3), 0))
crit="squared_error"

# Extra Trees regressor for temperature (random_state fixed for reproducibility).
regr_T = ExtraTreesRegressor(n_estimators=201, criterion=crit,
                             max_features=maxF, random_state=280)

# Extra Trees regressor for pressure; it uses a different fixed random_state
# from the temperature model.
regr_P = ExtraTreesRegressor(n_estimators=201, criterion=crit,
                             max_features=maxF, random_state=80)

# Train the models. Series.ravel() is deprecated in pandas >= 2.2 (the data is
# already 1-D); Series.to_numpy() yields the same 1-D array without the warning.
regr_T.fit(x_train_scaled, y_train_T.to_numpy())
regr_P.fit(x_train_scaled, y_train_P.to_numpy())

## These were saved April 24th, 2022. Used in Thermobar until Oct 6th, 2023

In [4]:
# Disabled code that wrote the 'April24' pickles of the pressure and
# temperature regressors. Every line is commented out, so running this cell
# writes nothing.
# from pickle import dump
# #dump(scaler, open('scaler_Jorg21_Cpx_Liq_April24.pkl', 'wb'))


# import joblib
# joblib.dump(regr_P,  'ETR_Press_Jorg21_Cpx_Liq_April24_NotNorm.pkl',compress=3)

# joblib.dump(regr_T,  'ETR_Temp_Jorg21_Cpx_Liq_April24_NotNorm.pkl',compress=3)

## New Sklearn version saved Oct 6th, 2023
- We asked Jorgenson, who didn't use a scaler, so no scaler is fitted or saved here.

In [6]:
from pickle import dump
import joblib

# The scaler dump stays disabled: no scaler object is created in this notebook.
#dump(scaler, open('scaler_Jorg21_Cpx_Liq_sklearn_1_3.pkl', 'wb'))

# Persist the fitted pressure regressor as a compressed joblib pickle.
joblib.dump(value=regr_P,
            filename='ETR_Press_Jorg21_Cpx_Liq_NotNorm_sklearn_1_3.pkl',
            compress=3)

# Persist the fitted temperature regressor. This is kept as the final
# expression so the cell still displays the list of written filenames.
joblib.dump(value=regr_T,
            filename='ETR_Temp_Jorg21_Cpx_Liq_NotNorm_sklearn_1_3.pkl',
            compress=3)

['ETR_Temp_Jorg21_Cpx_Liq_NotNorm_sklearn_1_3.pkl']