In [None]:
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Render every figure at 300 DPI.
plt.rc('figure', dpi=300)

# Load data
def load_pickle(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filepath, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

# The file names here are chosen based on which training sets are considered:
# the '7_Operations_SISSO' run and the run trained with the whole descriptor set.
train_scores, test_scores = (
    load_pickle('7_Operations_SISSO/train_scores.pkl'),
    load_pickle('7_Operations_SISSO/test_scores.pkl'),
)
train_scores_whole, test_scores_whole = (
    load_pickle('Train_with_the_whole_descriptor_set/train_scores.pkl'),
    load_pickle('Train_with_the_whole_descriptor_set/test_scores.pkl'),
)

# Median test-set score of each run (used for the dashed vertical lines)
median, median_whole = map(np.median, (test_scores, test_scores_whole))

# Compute score distributions
def compute_score_distribution(scores, bins):
    """Return the fraction of all scores that falls into each histogram bin.

    Parameters
    ----------
    scores : array_like
        Score values. Input of any shape is accepted; it is flattened by
        ``np.histogram`` and every element counts towards the total.
    bins : int or sequence of scalars
        Bin specification forwarded unchanged to ``np.histogram``.

    Returns
    -------
    numpy.ndarray
        One float per bin: bin count divided by the total number of scores.
        Scores outside the bin range are in no bin but still count towards
        the total, so the fractions may sum to less than 1. For empty input
        an all-zero array is returned.
    """
    values = np.asarray(scores)
    hist, _ = np.histogram(values, bins=bins)
    # Normalise by the number of elements, not len(): np.histogram flattens
    # its input, so len() would be the wrong denominator for non-1D data.
    total = values.size
    if total == 0:
        # Avoid a 0/0 division (NaN plus RuntimeWarning) when there is no data.
        return np.zeros(hist.shape, dtype=float)
    return hist / total

# Bin edges -3.0, -2.9, ..., 1.0 (40 bins of width 0.1). They are built from
# integers so that every edge is the correctly rounded decimal value;
# np.arange with a float step of 0.1 accumulates rounding error, which leaves
# the last edge slightly above 1.0 and shifts interior edges such as 0.0.
bins = np.arange(-30, 11) / 10
# Per-bin fraction of test scores for each run, computed over the same bin edges
score_number_ratio, score_number_ratio_whole = (
    compute_score_distribution(scores, bins)
    for scores in (test_scores, test_scores_whole)
)
# Midpoints of bins for plotting. The bars are centre-aligned on x, so x must
# be the centre of each bin; bins[:-1] alone gives the left edges and would
# draw every bar half a bin to the left of the data it represents.
score_x = (bins[:-1] + bins[1:]) / 2

# Plot histogram: two side-by-side bars per bin (one per score set), each
# followed by a dashed vertical line at that set's median.
plt.bar(
    x=score_x - 0.02,
    height=score_number_ratio_whole,
    width=0.04,
    color='#264653',
    label='$N_{descriptor} = 6$',
)
plt.vlines(x=median_whole, ymin=0, ymax=1.00, color='#264653', linestyle='dashed')
plt.bar(
    x=score_x + 0.02,
    height=score_number_ratio,
    width=0.04,
    color='#ffb703',
    label='$N_{descriptor} = 2$',
)
plt.vlines(x=median, ymin=0, ymax=1.00, color='#ffb703', linestyle='dashed')

# Formatting: axis labels and title first, then tick label sizes, the visible
# y-range, the legend, and finally display the figure.
plt.xlabel(xlabel='$R^2$ on the Testing Set', fontsize=16)
plt.ylabel(ylabel='Ratio', fontsize=16)
plt.title(label='Energy Decomposition Analysis', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylim((0, 0.18))
plt.legend(fontsize=16)
plt.show()
