Necessary imports

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.multioutput import MultiOutputRegressor

Processing the data


In [2]:
# Read the simulation results table from disk
data = pd.read_csv('Data/simulation_results.csv')

# Select the feature columns (X) and the target columns (y) by label
X = data.loc[:, ['Density', 'PotEng', 'Enthalpy']]
y = data.loc[:, ['Epsilon', 'Sigma']]


Hyperparameter tuning

In [3]:
# Wrap a gradient-boosting regressor so that both target columns can be
# predicted by a single estimator object
model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))

# Candidate values for the randomized search; every key carries the
# "estimator__" prefix so the setting reaches the wrapped regressor
param_dist = dict(
    estimator__n_estimators=[100, 200, 300],
    estimator__learning_rate=[0.01, 0.1, 0.2],
    estimator__max_depth=[3, 4, 5],
    estimator__min_samples_split=[2, 5, 10],
    estimator__min_samples_leaf=[1, 2, 4],
    estimator__subsample=[0.8, 0.9, 1.0],
)

# Scorers are maximized, so the MSE is wrapped with greater_is_better=False
# (the reported score is the negated MSE)
scoring = {
    'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Randomized search: 50 sampled configurations, each scored with 3-fold
# cross-validation on all available cores; `refit` names the metric used to
# pick the winning configuration
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=scoring,
    refit='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
)

# Run the search and keep the estimator built from the best configuration
best_model = random_search.fit(X, y).best_estimator_

# Show the winning hyperparameters
print("Best hyperparameters found: ", random_search.best_params_)

Best hyperparameters found:  {'estimator__subsample': 0.8, 'estimator__n_estimators': 100, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 4, 'estimator__max_depth': 5, 'estimator__learning_rate': 0.1}


Cross-validating to evaluate the model

In [4]:
# Score the tuned estimator with 5-fold cross-validation. The scorer reports
# negated MSE, so the sign is flipped before taking the per-fold square root.
# NOTE(review): X and y were also used by the hyperparameter search, so this
# RMSE is likely optimistic — confirm whether a held-out set is needed.
cv_results = cross_validate(estimator=best_model, X=X, y=y, scoring=scoring, cv=5)
cv_mse = np.negative(cv_results['test_neg_mean_squared_error'])
cv_rmse = np.sqrt(cv_mse)

# Report the mean of the per-fold RMSE values
print(f'Average RMSE: {cv_rmse.mean()}')

Average RMSE: 0.17304777512076827


Train and save the model

In [5]:
# Fit the selected estimator on the entire dataset
best_model.fit(X, y)

# Persist the fitted model. The output directory is created first so that the
# save does not fail with FileNotFoundError when Models/GBM does not exist yet
# (e.g. on a fresh checkout); the path string itself is left unchanged.
model_path = 'Models/GBM/gradient_boosting_combined_best.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)

['Models/GBM/gradient_boosting_combined_best.pkl']

Testing the model on one custom sample

In [3]:
# Helper: absolute relative error expressed in percent
def percentage_relative_error(actual, predicted):
    """Return the absolute error of `predicted` relative to `actual`, in percent.

    The difference ``predicted - actual`` is divided by ``actual``, the
    absolute value of that ratio is taken, and the result is scaled by 100.
    ``actual`` is the divisor, so a zero reference value is not handled here.
    """
    deviation = predicted - actual
    relative_deviation = deviation / actual
    return abs(relative_deviation) * 100

# Reload the persisted model from disk
best_model = joblib.load('Models/GBM/gradient_boosting_combined_best.pkl')

# Single-row frame holding the custom sample, labelled with the feature names
custom_data = pd.DataFrame(
    np.array([[1.3052555611111112, -652.6723927777779, -499.243345]]),
    columns=['Density', 'PotEng', 'Enthalpy'],
)

# Predict; the first (and only) row holds the two estimates, read here as
# Epsilon followed by Sigma
predicted = best_model.predict(custom_data)
predicted_epsilon, predicted_sigma = predicted[0]

# Reference values the predictions are compared against
real_epsilon, real_sigma = 0.2385, 3.405

# Report reference value, prediction and percentage error for each target
print("Epsilon:")
print(f"real value: {real_epsilon}  predicted value: {predicted_epsilon} \nrelative error: {percentage_relative_error(real_epsilon, predicted_epsilon):.2f} %")

print("\nSigma:")
print(f"real value: {real_sigma}  predicted value: {predicted_sigma} \nrelative error: {percentage_relative_error(real_sigma, predicted_sigma):.2f} %")

Epsilon:
real value: 0.2385  predicted value: 0.24183173386534992 
relative error: 1.40 %

Sigma:
real value: 3.405  predicted value: 3.4061915015797455 
relative error: 0.03 %
