## PR-guided dataset construction demo -- comparative analysis with low-density bulk carbon

This notebook is one of three that were created to conduct the analysis presented in Section 4 of [Chong et al.](https://pubs.rsc.org/en/content/articlelanding/2024/fd/d4fd00101j), "Prediction rigidities for data-driven chemistry".

The analysis studies which structures, when added to the training set, best reduce the prediction error for surface-containing carbon structures.

Three approaches were considered:

1) random selection of bulk carbon structures

2) random selection of low-density carbon structures (this notebook)

3) selecting the structure that improves the prediction rigidity (PR) the most

In [None]:
from LE_ACE import LE_ACE
import torch

# Work in double precision: the feature covariance built below is inverted
# explicitly, which is sensitive to round-off.
torch.set_default_dtype(torch.float64)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Number of structures handed to the calculator per feature-computation batch.
batch_size = 50


import ase.io
import numpy as np

import tqdm

### define ACE parameters and utility functions

In [None]:
# LE-ACE calculator restricted to a single species (atomic number 6, carbon).
# NOTE(review): the meaning and units of the cutoffs, the E_max entries and
# the two factors are defined by the LE_ACE package and are not visible in
# this notebook -- confirm there before changing any of them.
le_ace = LE_ACE(
    r_cut_rs=4.5,
    r_cut=4.5,
    E_max=[0.0, 1000.0, 300.0, 200.0, 100.0],
    all_species=[6],
    le_type="physical",
    factor=1.5,
    factor2=-1.0,
    cost_trade_off=False,
    fixed_stoichiometry=False,
    is_trace=False,
    n_trace=-1,
    device=device,
)

# Total feature count: the leading dimensions of all entries of
# ``extended_LE_energies`` added up. Used below to size the covariance matrix.
n_feat = sum(energies.shape[0] for energies in le_ace.extended_LE_energies)

def get_batches(list: list, batch_size: int) -> list:
    """Split a sequence into consecutive slices of ``batch_size`` items.

    Every slice holds exactly ``batch_size`` items except possibly the last,
    which holds whatever is left over. An empty input yields an empty result.

    Args:
        list: Sliceable sequence to split. The name shadows the builtin but is
            part of the existing signature, so it is kept.
        batch_size: Number of items per full batch.

    Returns:
        The slices, in their original order.
    """
    n_full_batches, n_leftover = divmod(len(list), batch_size)
    batches = [
        list[index * batch_size:(index + 1) * batch_size]
        for index in range(n_full_batches)
    ]
    if n_leftover:
        # Trailing partial batch with the items that did not fill a full one.
        batches.append(list[n_full_batches * batch_size:])
    return batches

def add_features_to_covariance(calculator, batch, covariance):
    """Accumulate the Gram matrix of a batch's features into ``covariance``.

    Args:
        calculator: Object exposing ``compute_features(batch)``.
        batch: Structures forwarded unchanged to ``compute_features``.
        covariance: Accumulator updated with ``+=``; for array/tensor types
            this modifies the caller's object in place.

    Returns:
        None. The result is delivered through ``covariance``.
    """
    batch_features = calculator.compute_features(batch)
    gram = batch_features.T @ batch_features
    covariance += gram

### load datasets

In [None]:
# Read every frame (index ":") of each structure file, in the same order as
# the names on the left-hand side.
training_set, test_set, candidate_set, lowdens_set = (
    ase.io.read(xyz_path, ":")
    for xyz_path in (
        "C_highdens.xyz",  # baseline training structures
        "C_surfamo.xyz",   # test structures
        "C_alldens.xyz",   # candidate pool
        "C_lowdens.xyz",   # pool for the random low-density selection
    )
)

In [None]:
# Divide each frame's stored ``info['energy']`` by its number of atoms.
# NOTE: the division happens in place, so running this cell a second time
# divides the energies again.
for dataset in (training_set, test_set, candidate_set, lowdens_set):
    for frame in dataset:
        frame.info['energy'] /= len(frame)

### perform baseline fit

In [None]:

# Feature covariance (X^T X) of the baseline training set, built batch by batch.
orig_covariance = torch.zeros(n_feat, n_feat, device=device)
for training_batch in get_batches(training_set, batch_size):
    add_features_to_covariance(le_ace, training_batch, orig_covariance)

# Small ridge term on the diagonal before the explicit inversion.
orig_covariance = orig_covariance + 1e-5 * torch.eye(
    n_feat,
    device=orig_covariance.device,
    dtype=orig_covariance.dtype,
)
inv_covariance = torch.linalg.inv(orig_covariance)

# Features of the test structures, computed in a single call.
target_features = le_ace.compute_features(test_set)

# Features of the candidate pool, computed per batch and stacked.
# NOTE(review): `features` is not used in the visible cells of this notebook;
# kept in case cells outside this view rely on it.
candidate_batches = get_batches(candidate_set, batch_size)
features = torch.concatenate(
    [le_ace.compute_features(candidate_batch) for candidate_batch in candidate_batches]
)

# One PR value per test structure: 1 / (x^T C^{-1} x).
cur_PR = 1 / torch.einsum("ij, jk, ik -> i", target_features, inv_covariance, target_features)

initial_mean_PR = cur_PR.detach().cpu().numpy().mean()
print(f"INITIAL PR: {initial_mean_PR}")

In [None]:
# Baseline fit on the original training set alone; the returned dictionary is
# printed so it can be compared with the fits on the augmented sets below.
accuracy_dict = le_ace.train(
    train_structures=training_set,
    validation_structures=training_set,  # the training set doubles as validation set
    test_structures=test_set,
    batch_size=10,
    do_gradients=False,
)
print(accuracy_dict)

### select additional structures and compute changes in the PR & RMSE

In [None]:
# Largest number of low-density structures appended to the training set; the
# selection loop below evaluates every set size from 1 up to this value.
n_structures_to_add = 10

In [None]:

# NOTE(review): `chosen_strucs` is never filled in the visible cells; it is
# kept so that any code outside this view that expects the name still finds it.
chosen_strucs = []
all_RMSEs = []  # one array per set size, holding the test RMSE of each random draw
all_PRs = []    # one array per set size, holding the mean test-set PR of each random draw

# Number of independent random draws evaluated for each set size.
n_random_repeats = 10

# Dedicated, seeded generator: the random baseline is then identical on every
# re-run of this cell and independent of any other use of the global NumPy RNG.
rng = np.random.default_rng(0)

for ii in range(n_structures_to_add):

    cur_PRs = []
    cur_RMSEs = []

    ## repeating n_random_repeats times for statistics over random selection
    for j in range(n_random_repeats):

        # Shuffle the low-density pool in place and take its first ii + 1
        # structures as this draw's additions to the training set.
        rng.shuffle(lowdens_set)
        cur_comb_set = lowdens_set[:ii + 1]

        # PR of the test structures after adding the draw: start from the
        # (already regularized) baseline covariance and add the new features.
        cur_covariance = orig_covariance.clone()
        add_features_to_covariance(le_ace, cur_comb_set, cur_covariance)
        inv_covariance = torch.linalg.inv(cur_covariance)
        cur_PR = 1 / torch.einsum("ij, jk, ik -> i", target_features, inv_covariance, target_features)
        cur_PRs.append(cur_PR.detach().cpu().numpy().mean())

        # Refit on the augmented training set and record the test error.
        accuracy_dict = le_ace.train(
            train_structures=training_set + cur_comb_set,
            validation_structures=training_set,
            test_structures=test_set,
            do_gradients=False,
            batch_size=10,
        )

        cur_RMSEs.append(accuracy_dict['test RMSE energies'])

    all_RMSEs.append(np.array(cur_RMSEs))
    all_PRs.append(np.array(cur_PRs))


In [None]:
# Persist the per-set-size statistics gathered by the selection loop.
# NOTE(review): "RSEs" in the first filename looks like a typo for "RMSEs";
# left unchanged because other code may load the file under this name.
for out_path, per_size_values in (
    ("lowdens_random_RSEs.npy", all_RMSEs),
    ("lowdens_random_PRs.npy", all_PRs),
):
    np.save(out_path, np.array(per_size_values))