Necessary imports

In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler

Processing the data


In [2]:
# Read the simulation results table from disk
data = pd.read_csv('Data/simulation_results.csv')

# Feature columns go into X, target columns into y
X = data.loc[:, ['Density', 'PotEng', 'Enthalpy']]
y = data.loc[:, ['Epsilon', 'Sigma']]

# Fit the scaler on the feature columns, then replace X with its standardized
# values; the fitted scaler is kept so the same scaling can be re-applied later.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

Hyperparameter tuning

In [4]:
# Scoring: mean squared error wrapped so that larger (less negative) is better
scoring = {
    'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False)
}

# Candidate values for the wrapped RandomForestRegressor; the "estimator__"
# prefix routes each setting through MultiOutputRegressor to the inner forest.
# NOTE(review): with bootstrap=False and max_features=None the trees of a forest
# presumably see identical data and features, so the ensemble may behave like a
# single tree — confirm this combination is intended to be searchable.
param_dist = {
    "estimator__n_estimators": [100, 200, 300],
    "estimator__max_features": ['sqrt', 'log2', None],
    "estimator__max_depth": [10, 20, 30, None],
    "estimator__min_samples_split": [2, 5, 10],
    "estimator__min_samples_leaf": [1, 2, 4],
    "estimator__bootstrap": [True, False]
}

# Multi-output wrapper around a seeded random forest
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))

# Randomized search: 50 sampled configurations, 3-fold CV, all CPU cores,
# refitting the best configuration according to the negated MSE.
# NOTE(review): an integer cv presumably means unshuffled folds; if the CSV rows
# are ordered, consider a shuffled splitter — verify against the data.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=scoring,
    refit='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
)

# Run the search on the scaled features and both targets
random_search.fit(X, y)

# Keep the refitted winner for the following cells
best_model = random_search.best_estimator_

# Report the winning configuration
print("Best hyperparameters found: ", random_search.best_params_)


Best hyperparameters found:  {'estimator__n_estimators': 200, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 4, 'estimator__max_features': None, 'estimator__max_depth': 20, 'estimator__bootstrap': False}


Cross-validating to evaluate the model

In [5]:
# Score the tuned estimator with 5-fold cross-validation, reusing the MSE scorer.
# NOTE(review): this uses the same X, y the hyperparameters were selected on, so
# the estimate is likely optimistic; the MSE also appears to be averaged over
# both target columns rather than reported per target — confirm that is wanted.
cv_results = cross_validate(best_model, X, y, cv=5, scoring=scoring)

# The scorer yields negated MSE per fold: flip the sign, then take the root
cv_mse = -cv_results['test_neg_mean_squared_error']
cv_rmse = np.sqrt(cv_mse)

print(f'Average RMSE: {cv_rmse.mean()}')

Average RMSE: 0.15412393397545016


Train and save the model

In [6]:
# Train the best model on the entire dataset.
# NOTE(review): the search above was created with refit=..., so best_estimator_
# is presumably already fitted on all of X, y; this fit looks redundant but is
# kept so the cell still works if best_model was obtained differently.
best_model.fit(X, y)

# Create the output directory if it is missing; joblib.dump does not create
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('Models/RF', exist_ok=True)

# Save the best model and the scaler (the scaler is needed to preprocess any
# future input the same way as the training features)
joblib.dump(best_model, 'Models/RF/random_forest_combined_best.pkl')
joblib.dump(scaler, 'Models/RF/scaler.pkl')

['Models/RF/scaler.pkl']

Testing the model for one custom sample

In [5]:
def percentage_relative_error(actual, predicted):
    """Return the relative error of ``predicted`` with respect to ``actual``, in percent.

    The value is ``|(predicted - actual) / actual| * 100``.

    Args:
        actual: Reference value; it is the divisor, so the result is undefined
            when it is zero.
        predicted: Value being compared against ``actual``.

    Returns:
        The non-negative percentage deviation of ``predicted`` from ``actual``.
    """
    deviation = predicted - actual
    return abs(deviation / actual) * 100

# Reload the model and scaler that were saved to disk.
# NOTE(review): joblib files are pickles — only load files this project produced.
best_model = joblib.load('Models/RF/random_forest_combined_best.pkl')
scaler = joblib.load('Models/RF/scaler.pkl')

# A single hand-entered sample with the three feature columns
custom_data = pd.DataFrame(
    np.array([[1.3052555611111112, -652.6723927777779, -499.243345]]),
    columns=['Density', 'PotEng', 'Enthalpy'],
)

# Apply the stored scaling before predicting
custom_data_normalized = scaler.transform(custom_data)

# Predict for the single row; the first output is read as Epsilon, the second as Sigma
predicted = best_model.predict(custom_data_normalized)
predicted_epsilon, predicted_sigma = predicted[0]

# Reference values the prediction is compared against
real_epsilon, real_sigma = 0.2385, 3.405

# Report each target: reference value, prediction and percentage relative error
print("Epsilon:")
print(f"real value: {real_epsilon}  predicted value: {predicted_epsilon} \nrelative error: {percentage_relative_error(real_epsilon, predicted_epsilon):.2f} %")

print("\nSigma:")
print(f"real value: {real_sigma}  predicted value: {predicted_sigma} \nrelative error: {percentage_relative_error(real_sigma, predicted_sigma):.2f} %")

Epsilon:
real value: 0.2385  predicted value: 0.23784249999999957 
relative error: 0.28 %

Sigma:
real value: 3.405  predicted value: 3.40857142857139 
relative error: 0.10 %
