In [16]:
'''
This code uses Lazy Predict model.
Lazy Predict is a Python library that automates the machine learning model
building process. It quickly analyzes data and fits multiple machine learning
models to identify top candidates based on performance metrics. This allows
users to get an initial understanding of the potential performance of different
machine learning algorithms on their datasets without manually tuning each
individual model.
'''

'\nThis code uses Lazy Predict model.\nLazy Predict is a Python library that automates the machine learning model\nbuilding process. It quickly analyzes data and fits multiple machine learning\nmodels to identify top candidates based on performance metrics. This allows\nusers to get an initial understanding of the potential performance of different\nmachine learning algorithms on their datasets without manually tuning each\nindividual model.\n'

In [17]:
!pip install lazypredict



In [18]:

import os
import math
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    accuracy_score,
    classification_report,
    r2_score,
    mean_absolute_percentage_error
)
from sklearn.ensemble import IsolationForest

from lazypredict.Supervised import LazyRegressor


In [19]:
# ********************** FUNCTION DEFINITIONS **********************

def check_and_get_filename(directory, base_name, extension):
    """
    Check if the specific file already exists and create a new name if necessary.
    """
    count = 0
    while True:
        file_name = f"{base_name}{'' if count == 0 else f'_{count}'}.{extension}"
        full_path = os.path.join(directory, file_name)
        if not os.path.exists(full_path):
            return full_path
        count += 1

def save_to_file_and_print(message, file):
    """Prints a message and appends it to a file."""
    try:
        print(message)
        with open(file, 'a') as f:
            f.write(message)
    except Exception as e:
        print(f"An error occurred while saving the message to the file: {e}")


def format_value(value, decimal_places):
    format_string = "{:." + str(decimal_places) + "f}"
    return float(format_string.format(value))


def calculate_metrics(phase_name, y_true, y_pred, metrics_to_print=None, file=None):
    '''
    Calculates metrics.
    If no specific metrics are requested (metrics_to_print=None), it calculates
    and includes all available metrics in the message. Otherwise, it only
    calculates the necessary ones.
    The message is saved in a file if requested (file = 'name')
    '''
    if metrics_to_print is None:
        metrics_to_print = ['MAPE', 'MAE', 'MSE', 'RMSE', 'R2']

    metric_functions = {
        'MAPE': lambda y_true, y_pred: mean_absolute_percentage_error(y_true, y_pred),
        'MAE': mean_absolute_error,
        'MSE': mean_squared_error,
        'RMSE': lambda y_true, y_pred: math.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score
    }

    output_message = f'\nMetrics for {phase_name} phase:\n'

    for metric in metrics_to_print:
        value = metric_functions[metric.upper()](y_true, y_pred)
        if metric.upper() == 'MAPE':
            formatted_value = format_value(value, 2)
            formatted_value = f'{formatted_value*100}%'
        else:
            formatted_value = format_value(value, 6)
        output_message += f"{metric}: {formatted_value}\n"

    if file:
        save_to_file_and_print(output_message, file)

    return output_message


In [20]:

# ********************** PARAMETER DEFINITIONS **********************
property = "Volume (m3/kg)"
property_name = property.split(' (')[0]

'''
"Density (kg/m3)","Volume (m3/kg)",
"Internal Energy (kJ/mol)","Enthalpy (kJ/mol)","Entropy (J/mol*K)",
"Cv (J/mol*K)","Cp (J/mo*lK)","Sound Spd. (m/s)","Joule-Thomson (K/MPa)",
"Viscosity (uPas)","Therm. Cond. (W/m*K)","Phase"
'''

## files and directories
#current_directory = os.getcwd()    #if used on the notebook
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
current_directory = 'gdrive/MyDrive/Thermophysical/Volume'


Mounted at /content/gdrive


In [21]:

# ********************** DATA ACQUISITION **********************
                     #*****  NIST  *****
            # **** Temperature, Pressure, Phase ****
'''
In this section, data from NIST is acquired for Temperature, Pressure, and the
corresponding physical state. These data are solely used to train a decision
tree to determine the phase in which the fluid exists at a given pressure and
temperature.
'''
NIST_directory = os.path.join(current_directory, "NIST")
txt_files_NIST = [file for file in os.listdir(NIST_directory) if file.endswith(".txt")]

X_NIST = []
state_column = []

for file in txt_files_NIST:
  df_NIST = pd.read_csv(os.path.join(NIST_directory, file), delimiter='\t')
  X_NIST_file = df_NIST[["Temperature (K)","Pressure (MPa)"]].values
  state_column_file = df_NIST.iloc[:, -1]  # Assuming the last column indicates the state

  X_NIST.extend(X_NIST_file)
  state_column.extend(state_column_file)

X_NIST = np.array(X_NIST)

# ***** DECISION TREE *****
DT_file = check_and_get_filename(current_directory, "DecisionTree_Physical-State-Determination_LazyPredict", "log")

## Mapping of physical states. Associate them with numbers (0, 1, 2) for use in the decision tree
state_mapping = {'liquid': 0, 'vapor': 1, 'supercritical': 2}
state_mapped = [state_mapping[state] for state in state_column]

# Spliting the data into training (70%) and testing (30%) sets (better than 80-20)
X_train, X_test, state_train, state_test = train_test_split(X_NIST, state_mapped, test_size=0.3, random_state=42)

log_info = f"Decision tree to predict physical states based on temperature and pressure inputs.\nNumber of data: {len(X_NIST)}\nNumber of train data: {len(X_train)}\nNumber of test data: {len(X_test)}"
save_to_file_and_print(log_info, DT_file)

decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(X_train, state_train)

state_pred = decision_tree.predict(X_test)

accuracy_tree = accuracy_score(state_test, state_pred)

save_to_file_and_print(f"\nAccuracy of Decision Tree model: {accuracy_tree * 100:.3f}%", DT_file)
save_to_file_and_print(classification_report(state_test, state_pred), DT_file)

# Metrics - Decision Tree Evaluation
save_to_file_and_print("\n\n > Decision Tree Evaluation <", DT_file)
calculate_metrics('Decision Tree', state_test, state_pred, ['RMSE', 'R2'], DT_file)




Decision tree to predict physical states based on temperature and pressure inputs.
Number of data: 4991983
Number of train data: 3494388
Number of test data: 1497595

Accuracy of Decision Tree model: 99.998%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85252
           1       1.00      1.00      1.00      7157
           2       1.00      1.00      1.00   1405186

    accuracy                           1.00   1497595
   macro avg       1.00      1.00      1.00   1497595
weighted avg       1.00      1.00      1.00   1497595



 > Decision Tree Evaluation <

Metrics for Decision Tree phase:
RMSE: 0.004834
R2: 0.999893



'\nMetrics for Decision Tree phase:\nRMSE: 0.004834\nR2: 0.999893\n'

In [22]:


# ********************** DATA ACQUISITION **********************
            # **** Temperature, Pressure, Property ****
print(f"\n\n*************Start analysis of property '{property_name}'*************")

property_directory = os.path.join(current_directory, property_name)
Experimental_directory = os.path.join(property_directory, "Experimental")


# ********************** LazyPredict **********************
LazyPredict_directory = os.path.join(property_directory, "LazyPredict")
os.makedirs(LazyPredict_directory, exist_ok=True)
LazyPredict_file = check_and_get_filename(LazyPredict_directory, "LazyPredict", "log")


# *****  EXPERIMENTAL  *****
txt_files_Exp = [file for file in os.listdir(Experimental_directory) if file.endswith(".csv")]
y_Exp = []  ## Dependent variable for all states
X_Exp = []  ## Independent variables (Temperature and Pressure) for all states

for file_exp in txt_files_Exp:
    df_Exp = pd.read_csv(os.path.join(Experimental_directory, file_exp), delimiter=',')
    X_Exp_file = df_Exp[["Temperature (K)","Pressure (MPa)"]].values
    y_Exp_file = df_Exp[property].values
    X_Exp.extend(X_Exp_file)
    y_Exp.extend(y_Exp_file)

X_Exp = np.array(X_Exp)
X_Exp=X_Exp.tolist()
if property == "Therm. Cond. (W/m*K)":
    y_Exp = [x * 1000 for x in  y_Exp]
else:
    pass
y_Exp = [[x] for x in  y_Exp]

print(X_Exp)



*************Start analysis of property 'Cv'*************
[[278.0, 0.1], [222.0, 0.08], [180.0, 0.03], [83.0, 0.03], [200.94, 6.49], [202.61, 6.8], [205.12, 7.26], [208.47, 7.89], [212.7, 8.69], [217.78, 9.65], [190.96, 4.67], [192.49, 4.94], [194.09, 5.23], [195.71, 5.52], [197.28, 5.81], [198.91, 6.11], [200.56, 6.42], [202.22, 6.72], [204.72, 7.19], [208.09, 7.82], [212.33, 8.62], [170.4, 6.54], [172.19, 7.69], [174.9, 9.42], [178.51, 11.72], [182.09, 13.99], [146.61, 6.85], [148.36, 8.54], [150.09, 10.2], [152.69, 12.68], [156.17, 15.96], [160.06, 19.58], [164.35, 23.52], [200.34, 5.69], [201.34, 5.8], [207.85, 6.52], [208.17, 6.56], [215.4, 7.35], [222.94, 8.16], [222.99, 8.17], [230.37, 8.96], [230.45, 8.97], [237.53, 9.72], [238.13, 9.79], [244.73, 10.49], [245.48, 10.57], [252.81, 11.34], [266.35, 12.77], [195.78, 5.39], [198.09, 5.73], [198.43, 5.78], [202.6, 6.4], [203.14, 6.49], [203.38, 6.52], [203.73, 6.57], [208.7, 7.32], [209.06, 7.37], [209.53, 7.44], [209.85, 7.49], 

In [23]:

# *****  NIST  *****
'''
NIST data that presents the same temperature and pressure conditions as those
used in the experimental data has been acquired.
'''
NIST_Expbased_directory = os.path.join(NIST_directory, "results_experimental-based")
txt_files_NIST_Expbased = [file for file in os.listdir(NIST_Expbased_directory) if file.endswith(".txt")]

y_NIST_Expbased = []  ## Dependent variable for all states
X_NIST_Expbased = []  ## Independent variables (Temperature and Pressure) for all states

for file_exp in txt_files_NIST_Expbased:
  df_NIST_Expbased = pd.read_csv(os.path.join(NIST_Expbased_directory, file_exp), delimiter='\t')
  X_NIST_Expbased_file = df_NIST_Expbased[["Temperature (K)","Pressure (MPa)"]].values
  y_NIST_Expbased_file = df_NIST_Expbased[property].values
  X_NIST_Expbased.extend(X_NIST_Expbased_file)
  y_NIST_Expbased.extend(y_NIST_Expbased_file)


'''
Since y_NIST_Expbased is a scalar value and X_NIST_Expbased is a NumPy array (array([T,P])),
They cannot be accessed using indexing.
One possible solution is to convert it into a list.
'''
X_NIST_Expbased = np.array(X_NIST_Expbased)
X_NIST_Expbased=X_NIST_Expbased.tolist()
if property == "Therm. Cond. (W/m*K)":
  y_NIST_Expbased = [x * 1000 for x in  y_NIST_Expbased]
else:
  pass
y_NIST_Expbased = [[x] for x in  y_NIST_Expbased]

In [24]:

'''
Given that not all experimental data falls within the NIST range of Temperature
and pressure, the values that lie within the range are stored in a variable for
error analysis.
'''

missing_data_X = []
matched_data = []

for x_exp, y_exp in zip(X_Exp, y_Exp):
    if any(np.array_equal(x_exp, x_nist) for x_nist in X_NIST_Expbased):
        matched_data.append((x_exp, y_exp))
    else:
        missing_data_X.append(x_exp)

missing_data_y = [y for x, y in zip(X_Exp, y_Exp) if any(np.array_equal(x, x_nist) for x_nist in missing_data_X)]

matched_data_X = [x for x, _ in matched_data]
matched_data_y = [y for _, y in matched_data]

save_to_file_and_print(f"Number of matched data between NIST and Experimental: {len(matched_data_X)}\nNumber of missing data between NIST and Experimental: {len(missing_data_X)}", LazyPredict_file)

X_liquid_Exp = []
y_liquid_Exp = []
X_vapor_Exp = []
y_vapor_Exp = []
X_supercritical_Exp = []
y_supercritical_Exp = []


Number of matched data between NIST and Experimental: 315
Number of missing data between NIST and Experimental: 1


In [25]:
#**** Discovering the physical state in the studied condition and separating the set****
#{'liquid': 0, 'vapor': 1, 'supercritical': 2}
state_pred_list_Exp_Expbased = decision_tree.predict(matched_data_X)

#Convert to a list type:
state_pred_list_Exp_Expbased = [[x] for x in state_pred_list_Exp_Expbased]


for i in range(len(state_pred_list_Exp_Expbased)):

  if state_pred_list_Exp_Expbased[i] == [0]:  # Liquid state
    X_liquid_Exp.append(matched_data_X[i])
    y_liquid_Exp.append(matched_data_y[i])
  elif state_pred_list_Exp_Expbased[i] == [1]:  # Vapor state
    X_vapor_Exp.append(matched_data_X[i])
    y_vapor_Exp.append(matched_data_y[i])
  elif state_pred_list_Exp_Expbased[i] == [2]:  # Supercritical state
    X_supercritical_Exp.append(matched_data_X[i])
    y_supercritical_Exp.append(matched_data_y[i])

print(f"Number of data points in Liquid state: {len(X_liquid_Exp)}")
print(f"Number of data points in Vapor state: {len(X_vapor_Exp)}")
print(f"Number of data points in Supercritical state: {len(X_supercritical_Exp)}")



Number of data points in Liquid state: 110
Number of data points in Vapor state: 4
Number of data points in Supercritical state: 201


In [26]:

#===========================================
#               LAZY PREDICT
#===========================================
save_to_file_and_print("\n\n>>> LIQUID PHASE <<<\n", LazyPredict_file)





>>> LIQUID PHASE <<<



In [27]:

#           >>>>>       LIQUID PART     <<<<<
# Spliting the data into training (80%) and testing (20%) sets for liquid state
X_liquid_train, X_liquid_test, y_liquid_train, y_liquid_test = train_test_split(X_liquid_Exp, y_liquid_Exp, test_size=0.2, random_state=42)

X_liquid_train_array = np.array(X_liquid_train)
X_liquid_test_array = np.array(X_liquid_test)
y_liquid_train_array = np.array(y_liquid_train)
y_liquid_test_array = np.array(y_liquid_test)

reg_liquid = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models_liquid,predictions = reg_liquid.fit(X_liquid_train_array, X_liquid_test_array, y_liquid_train_array, y_liquid_test_array)

save_to_file_and_print(f"\nResults from lazy predict:\n {models_liquid}.\n", LazyPredict_file)

save_to_file_and_print("\n\n>>> VAPOR PHASE <<<\n", LazyPredict_file)



 81%|████████  | 34/42 [00:01<00:00, 32.29it/s]

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [00:01<00:00, 30.23it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 88, number of used features: 2
[LightGBM] [Info] Start training from score 31.923750

Results from lazy predict:
                                Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
KNeighborsRegressor                          0.53       0.57  1.78        0.01
ExtraTreesRegressor                          0.48       0.53  1.88        0.09
HistGradientBoostingRegressor                0.34       0.41  2.10        0.06
LGBMRegressor                                0.34       0.41  2.10        0.09
XGBRegressor                                 0.33       0.40  2.12        0.04
NuSVR                                        0.33       0.39  2.12        0.01
SV




In [28]:
#           >>>>>       VAPOR PART     <<<<<
# Spliting the data into training (80%) and testing (20%) sets for vapor state
X_vapor_train, X_vapor_test, y_vapor_train, y_vapor_test = train_test_split(X_vapor_Exp, y_vapor_Exp, test_size=0.2, random_state=42)

X_vapor_train_array = np.array(X_vapor_train)
X_vapor_test_array = np.array(X_vapor_test)
y_vapor_train_array = np.array(y_vapor_train)
y_vapor_test_array = np.array(y_vapor_test)

reg_vapor = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models_vapor,predictions = reg_vapor.fit(X_vapor_train_array, X_vapor_test_array, y_vapor_train_array, y_vapor_test_array)

save_to_file_and_print(f"\nResults from lazy predict:\n {models_vapor}.\n", LazyPredict_file)


 21%|██▏       | 9/42 [00:00<00:01, 28.04it/s]

ElasticNetCV model failed to execute
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.


 43%|████▎     | 18/42 [00:00<00:00, 32.21it/s]

KNeighborsRegressor model failed to execute
Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
LarsCV model failed to execute
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.
LassoCV model failed to execute
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.
LassoLarsCV model failed to execute
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.
LassoLarsIC model failed to execute
You are using LassoLarsIC in the case where the number of samples is smaller than the number of features. In this setting, getting a good estimate for the variance of the noise is not possible. Provide an estimate of the noise variance in the constructor.


 62%|██████▏   | 26/42 [00:00<00:00, 36.88it/s]

OrthogonalMatchingPursuitCV model failed to execute
Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=3.
QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [00:01<00:00, 31.05it/s]

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] Start training from score 27.733334

Results from lazy predict:
                                Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
AdaBoostRegressor                             NaN        NaN  0.87        0.02
BaggingRegressor                              NaN        NaN  3.31        0.05
BayesianRidge                                 NaN        NaN  0.48        0.02
DecisionTreeRegressor                         NaN        NaN  2.18        0.01
DummyRegressor                                NaN        NaN  3.09        0.01
ElasticNet                                    NaN        NaN  1.71        0.01
ExtraTreeRegressor                            NaN        NaN  0.87        0.01
ExtraTreesRegressor                           NaN        NaN  0.87        0.16
GammaReg




In [29]:
print(f'X train: {X_vapor_train_array}\n')
print(f'X test: {X_vapor_test_array}\n')
print(f'y train: {y_vapor_train_array}\n')
print(f'y test: {y_vapor_test_array}\n')

X train: [[1.855e+02 4.250e+00]
 [2.780e+02 1.000e-01]
 [1.800e+02 3.000e-02]]

X test: [[2.22e+02 8.00e-02]]

y train: [[32.61]
 [26.82]
 [23.77]]

y test: [[24.64]]



In [30]:

save_to_file_and_print("\n\n>>> SUPERCRITICAL PHASE <<<\n", LazyPredict_file)
#           >>>>>       SUPERCRITICAL PART     <<<<<
# Spliting the data into training (80%) and testing (20%) sets for liquid state
X_supercritical_train, X_supercritical_test, y_supercritical_train, y_supercritical_test = train_test_split(X_supercritical_Exp, y_supercritical_Exp, test_size=0.2, random_state=42)

X_supercritical_train_array = np.array(X_supercritical_train)
X_supercritical_test_array = np.array(X_supercritical_test)
y_supercritical_train_array = np.array(y_supercritical_train)
y_supercritical_test_array = np.array(y_supercritical_test)

reg_supercritical = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models_supercritical,predictions = reg_supercritical.fit(X_supercritical_train_array, X_supercritical_test_array, y_supercritical_train_array, y_supercritical_test_array)

save_to_file_and_print(f"\nResults from lazy predict:\n {models_supercritical}.\n", LazyPredict_file)




>>> SUPERCRITICAL PHASE <<<



 79%|███████▊  | 33/42 [00:02<00:00, 18.08it/s]

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 42/42 [00:03<00:00, 13.10it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109
[LightGBM] [Info] Number of data points in the train set: 160, number of used features: 2
[LightGBM] [Info] Start training from score 31.275688

Results from lazy predict:
                                Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
KNeighborsRegressor                          0.98       0.98  0.23        0.01
ExtraTreesRegressor                          0.97       0.98  0.28        0.17
RandomForestRegressor                        0.97       0.97  0.32        0.27
BaggingRegressor                             0.96       0.96  0.36        0.06
HistGradientBoostingRegressor                0.95       0.96  0.37        0.10
LGBMRegressor                                0.94       0.95  0.41        0.08



