In [1]:

import matplotlib.pyplot as plt
import numpy as np
from ase.io import read


### utility functions

In [2]:

def train_model(X_struc, E_struc, alpha):
    """Fit ridge-regularised linear weights to mean-centred energies.

    Solves the regularised normal equations

        (X^T X + alpha * I) w = X^T (E - mean(E))

    Only the uniform ``alpha * I`` regulariser is applied here; no
    body-order dependent scaling of the diagonal is performed.

    Parameters
    ----------
    X_struc : array_like, shape (n_structures, n_features)
        One feature vector per structure.
    E_struc : array_like, shape (n_structures,)
        Target energy of each structure.
    alpha : float
        Multiplier of the identity matrix added to ``X^T X``.

    Returns
    -------
    weights : numpy.ndarray, shape (n_features,)
        Solution of the regularised normal equations.
    E_mean : float
        Mean of ``E_struc``; it has to be added back to ``X @ weights``
        to obtain an energy prediction.
    """
    # accept plain lists as well as arrays
    X = np.asarray(X_struc)
    E = np.asarray(E_struc)

    ## regularised covariance (Gram) matrix
    gram = X.T @ X
    regularised = gram + np.eye(gram.shape[0]) * alpha

    ## centre the targets so the weights only model deviations from the mean
    E_mean = E.mean()
    rhs = X.T @ (E - E_mean)

    ## solve the linear system directly instead of forming the explicit
    ## inverse: same solution, but cheaper and numerically more accurate
    weights = np.linalg.solve(regularised, rhs)

    return weights, E_mean


def predict_compwise(X_struc, weights, E_mean, comp_dims):
    """Predict energies in total and per block of the feature vector.

    The feature axis is split into consecutive blocks whose widths are
    given by ``comp_dims``. For each block, a prediction is made that uses
    only the features and weights belonging to that block.

    Parameters
    ----------
    X_struc : array_like, shape (n_structures, n_features)
        One feature vector per structure (a list of 1-D vectors also works).
    weights : array_like, shape (n_features,)
        Linear weights, e.g. as returned by ``train_model``.
    E_mean : float
        Offset added to every prediction.
    comp_dims : sequence of int
        Width of each consecutive feature block; must sum to ``n_features``.
        Lists, tuples and integer arrays are all accepted.

    Returns
    -------
    tot_pred : numpy.ndarray, shape (n_structures,)
        ``X_struc @ weights + E_mean``.
    cw_preds : numpy.ndarray, shape (len(comp_dims), n_structures)
        Row ``c`` is the prediction from block ``c`` only. ``E_mean`` is
        added to *every* row, so the rows do not sum to ``tot_pred`` when
        there is more than one block.

    Raises
    ------
    ValueError
        If ``comp_dims`` does not sum to the number of weights.
    """
    X = np.asarray(X_struc)
    w = np.asarray(weights)

    # normalise to a list of ints so that ``[0] + dims`` is a concatenation
    # (with an ndarray it would silently become element-wise addition)
    dims = [int(d) for d in comp_dims]
    if sum(dims) != w.shape[0]:
        raise ValueError(
            "comp_dims sums to {} but there are {} weights".format(sum(dims), w.shape[0])
        )
    bounds = np.cumsum([0] + dims)

    tot_pred = X @ w + E_mean

    cw_preds = np.zeros((len(dims), len(X)))
    for ci, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
        # slicing the block is equivalent to zero-masking every other weight
        cw_preds[ci] = X[..., lo:hi] @ w[lo:hi] + E_mean

    return tot_pred, cw_preds


In [3]:
def calculate_CPR(
    train_struc_feats,
    test_struc_feats,
    alpha,
    comp_dims,
):
    """Compute the per-block CPR of every test structure.

    With ``A = X_train^T X_train + alpha * I`` and ``x_c`` the test feature
    vector with every entry outside block ``c`` set to zero, the value
    returned for block ``c`` is ``1 / (x_c^T A^{-1} x_c)``.

    Parameters
    ----------
    train_struc_feats : array_like, shape (n_train, n_features)
        Feature vectors of the training structures.
    test_struc_feats : array_like, shape (n_test, n_features)
        Feature vectors of the structures to evaluate.
    alpha : float
        Multiplier of the identity matrix added to ``X_train^T X_train``.
    comp_dims : sequence of int
        Width of each consecutive feature block; must sum to ``n_features``.
        Lists, tuples and integer arrays are all accepted.

    Returns
    -------
    numpy.ndarray, shape (len(comp_dims), n_test)
        Row ``c`` holds the value for block ``c`` of every test structure.
        A structure whose block is entirely zero yields ``inf``.

    Raises
    ------
    ValueError
        If ``comp_dims`` does not sum to the feature dimension of both
        the training and the test features.
    """
    X_train = np.asarray(train_struc_feats)
    X_test = np.asarray(test_struc_feats)

    # normalise to a list of ints so that ``[0] + dims`` is a concatenation
    # (with an ndarray it would silently become element-wise addition)
    dims = [int(d) for d in comp_dims]
    n_feat = sum(dims)
    if X_train.shape[-1] != n_feat or X_test.shape[-1] != n_feat:
        raise ValueError(
            "comp_dims sums to {} but features have widths {} (train) and {} (test)".format(
                n_feat, X_train.shape[-1], X_test.shape[-1]
            )
        )
    bounds = np.cumsum([0] + dims)

    # the explicit inverse is needed here because its diagonal blocks are used
    Xinv = np.linalg.inv(X_train.T @ X_train + np.eye(n_feat) * alpha)

    CPR = np.zeros((len(dims), len(X_test)))
    for ci, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
        # zero-masking the features outside the block is the same as keeping
        # only the matching diagonal block of the inverse
        X_block = X_test[:, lo:hi]
        inv_block = Xinv[lo:hi, lo:hi]
        CPR[ci] = 1 / np.einsum("ij,jk,ik->i", X_block, inv_block, X_block)

    return CPR


### load clusters and ACE feature vectors

In [4]:

# one trajectory file per cluster size; the ':' index asks for every frame
mer2, mer3, mer4, mer5 = (
    read(xyz_file, ':')
    for xyz_file in (
        'datasets/Si2.xyz',
        'datasets/Si3.xyz',
        'datasets/Si4.xyz',
        'datasets/Si5.xyz',
    )
)


In [5]:

## feature vectors exported from Julia: one text file per structure,
## numbered in the same order as the frames of the matching xyz file
path = "./Si_ACE_featvecs/pure/"


def _load_featvecs(name_template, n_frames):
    """Load files ``0 .. n_frames-1`` named by ``name_template`` under ``path``."""
    return [
        np.vstack([np.loadtxt(path + name_template.format(idx))])
        for idx in range(n_frames)
    ]


def _shuffle_pair(feats, frames):
    """Apply one random permutation to both lists; also return the permutation."""
    order = np.arange(len(frames))
    np.random.shuffle(order)
    return [feats[k] for k in order], [frames[k] for k in order], order


X2 = _load_featvecs("2mer/{}.txt", len(mer2))
X3 = _load_featvecs("3mer/{}.txt", len(mer3))
X4 = _load_featvecs("4mer/{}.txt", len(mer4))
X5 = _load_featvecs("5mer/{}.txt", len(mer5))

## shuffle every cluster set, keeping features and structures aligned;
## the sets are shuffled in the order 2, 3, 4, 5 from a single seeded stream
np.random.seed(1215)

X2, mer2, ids = _shuffle_pair(X2, mer2)
X3, mer3, ids = _shuffle_pair(X3, mer3)
X4, mer4, ids = _shuffle_pair(X4, mer4)
X5, mer5, ids = _shuffle_pair(X5, mer5)


In [6]:

# widths of the consecutive blocks that make up one feature vector
comp_dims = [48, 273, 505, 127]
# block boundaries: block k spans comp_idxs[k]:comp_idxs[k+1]
comp_idxs = np.cumsum([0, *comp_dims])
# one integer label per block (presumably the body-order exponent -- confirm)
nu = np.arange(1, 5)


In [8]:

# energy of an isolated atom; removed from every per-atom energy below
isoE = -7881.32677981122


def _per_atom_energy(frames):
    """Per-atom 'free_energy' of each frame, measured relative to ``isoE``."""
    return np.array(
        [frame.info['free_energy'] / len(frame) - isoE for frame in frames]
    )


E2 = _per_atom_energy(mer2)
E3 = _per_atom_energy(mer3)
E4 = _per_atom_energy(mer4)
E5 = _per_atom_energy(mer5)


In [9]:
# re-read the dimer set from disk so that it is in file order
# (mer2 / X2 were shuffled above)
dimer = read('datasets/Si2.xyz', ':')
# one feature vector per dimer: average over the rows of its feature file
X_dimer = [
    np.mean(np.vstack([np.loadtxt(path + "2mer/{}.txt".format(idx))]), axis=0)
    for idx, _frame in enumerate(dimer)
]
# per-atom energy relative to the isolated-atom reference
E_dimer = [frame.info['free_energy'] / len(frame) - isoE for frame in dimer]

In [10]:
# largest pairwise distance in each dimer frame (minimum-image convention requested)
dists = [np.max(frame.get_all_distances(mic=True)) for frame in dimer]

### Train Models

In [11]:
train = 500
add = 50


def _struct_means(feat_list):
    """Collapse each structure's feature rows into a single mean vector."""
    return [feats.mean(axis=0) for feats in feat_list]


# per-structure mean features, computed once and reused by every training set
mean2, mean3, mean4 = (_struct_means(feat_set[:add]) for feat_set in (X2, X3, X4))
mean5 = _struct_means(X5)

# baseline set: 5-mers only
# NOTE(review): this uses every entry of X5, not just the first `train`
# ones -- confirm that len(X5) == train is intended
X_vnl = np.vstack(mean5)
E_vnl = E5

# each augmented set swaps `add` 5-mers for `add` structures of a smaller
# cluster size, so the total stays at `train` structures
X_2b = np.vstack(mean5[:train - add] + mean2)
E_2b = np.hstack([E5[:train - add], E2[:add]])

X_3b = np.vstack(mean5[:train - 2*add] + mean2 + mean3)
E_3b = np.hstack([E5[:train - 2*add], E2[:add], E3[:add]])

X_4b = np.vstack(mean5[:train - 3*add] + mean2 + mean3 + mean4)
E_4b = np.hstack([E5[:train - 3*add], E2[:add], E3[:add], E4[:add]])

In [13]:
alpha = 1e-3

# For every training set: fit the weights, predict the 5-mer set (X_vnl),
# then predict the dimer set both in total and per feature block.

## 5-mers only
w_vnl, E_mean_vnl = train_model(X_vnl, E_vnl, alpha)
E5pred_vnl, _ = predict_compwise(X_vnl, w_vnl, E_mean_vnl, comp_dims)
E2pred_vnl, E2_cw_vnl = predict_compwise(X_dimer, w_vnl, E_mean_vnl, comp_dims)

## + 2-mers
w_2b, E_mean_2b = train_model(X_2b, E_2b, alpha)
E5pred_2b, _ = predict_compwise(X_vnl, w_2b, E_mean_2b, comp_dims)
E2pred_2b, E2_cw_2b = predict_compwise(X_dimer, w_2b, E_mean_2b, comp_dims)

## + 2-mers and 3-mers
w_3b, E_mean_3b = train_model(X_3b, E_3b, alpha)
E5pred_3b, _ = predict_compwise(X_vnl, w_3b, E_mean_3b, comp_dims)
E2pred_3b, E2_cw_3b = predict_compwise(X_dimer, w_3b, E_mean_3b, comp_dims)

## + 2-mers, 3-mers and 4-mers
w_4b, E_mean_4b = train_model(X_4b, E_4b, alpha)
E5pred_4b, _ = predict_compwise(X_vnl, w_4b, E_mean_4b, comp_dims)
E2pred_4b, E2_cw_4b = predict_compwise(X_dimer, w_4b, E_mean_4b, comp_dims)


In [17]:
# per-block CPR of the 5-mer set (X_vnl) under each training set
CPR_vnl, CPR_2b, CPR_3b, CPR_4b = (
    calculate_CPR(X_train, X_vnl, alpha, comp_dims)
    for X_train in (X_vnl, X_2b, X_3b, X_4b)
)

In [19]:
# collect everything that is written out; CPR arrays are averaged over
# axis 1, i.e. over the evaluated structures, leaving one value per block
_results = {
    "CPR_vnl": CPR_vnl.mean(axis=1),
    "CPR_2b": CPR_2b.mean(axis=1),
    "CPR_3b": CPR_3b.mean(axis=1),
    "CPR_4b": CPR_4b.mean(axis=1),
    "dists": np.array(dists),
    "E2pred_vnl": E2pred_vnl,
    "E2pred_2b": E2pred_2b,
    "E2pred_3b": E2pred_3b,
    "E2pred_4b": E2pred_4b,
    "E2_cw_vnl": E2_cw_vnl,
    "E2_cw_2b": E2_cw_2b,
    "E2_cw_3b": E2_cw_3b,
    "E2_cw_4b": E2_cw_4b,
}
np.savez("pure_results.npz", **_results)