In [None]:
import os
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy import stats

# Pair label -> (atomic number matched on the first atom axis, atomic number
# matched on the second atom axis). Insertion order is also the order in which
# the arrays are written to the output archive.
_PAIR_ATOMIC_NUMBERS = {
    'HH': (1, 1),
    'CH': (1, 6),
    'NH': (1, 7),
    'OH': (1, 8),
    'CC': (6, 6),
    'CN': (6, 7),
    'CO': (6, 8),
    'NN': (7, 7),
    'NO': (7, 8),
    'OO': (8, 8),
}


def calc_dist(read_path, save_path, criteria_path=None):
    """Collect interatomic distances per element pair from a directory of ``.npz`` files.

    Every ``.npz`` file in ``read_path`` must provide ``Z`` (atomic numbers,
    indexed as ``[conformation, atom]``) and ``R`` (positions, indexed as
    ``[conformation, atom, coordinate]``). For each conformation the full
    atom-by-atom distance matrix is built and its entries are binned by the
    atomic numbers of the two atoms (H=1, C=6, N=7, O=8).

    Notes:
        * Self-distances (the matrix diagonal) are excluded.
        * For same-element pairs (HH, CC, NN, OO) both (i, j) and (j, i) are
          kept, so each such distance appears twice; mixed pairs appear once.
        * Non-finite ``NaN`` distances (e.g. from NaN coordinates) are dropped.
        * Results are stored as ``float16`` under the keys ``dist_<PAIR>``.

    Args:
        read_path: Directory containing the input ``.npz`` files.
        save_path: Destination passed to ``np.savez_compressed``.
        criteria_path: Optional directory; when given, only files whose name
            also appears in this directory are processed.

    Raises:
        ValueError: If no input file was selected, so there is nothing to save.
    """
    # List the criteria directory once instead of once per candidate file.
    allowed = None if criteria_path is None else set(os.listdir(criteria_path))
    collected = {label: [] for label in _PAIR_ATOMIC_NUMBERS}
    n_files = 0

    # Sorted so the order of the saved distances does not depend on the
    # arbitrary ordering returned by the file system.
    for file in tqdm(sorted(os.listdir(read_path))):
        if not file.endswith('.npz'):
            continue
        if allowed is not None and file not in allowed:
            continue

        # NpzFile holds the archive open; read what we need and close it.
        with np.load(os.path.join(read_path, file)) as data:
            Z = data['Z']
            R = data['R']
        n_files += 1

        # Pairwise distances for every conformation: (conformation, atom_i, atom_j).
        dist_matrix = np.linalg.norm(R[:, :, None, :] - R[:, None, :, :], axis=-1)
        # Mark the diagonal so self-distances are filtered out below.
        atom_idx = np.arange(R.shape[1])
        dist_matrix[:, atom_idx, atom_idx] = np.nan

        for label, (z_first, z_second) in _PAIR_ATOMIC_NUMBERS.items():
            pair_mask = (Z[:, :, None] == z_first) & (Z[:, None, :] == z_second)
            selected = dist_matrix[pair_mask]
            collected[label].append(selected[~np.isnan(selected)])

    if n_files == 0:
        # np.concatenate would raise an opaque ValueError on the empty lists.
        raise ValueError(
            "no matching .npz files found in {!r}; nothing to save".format(read_path)
        )

    np.savez_compressed(
        save_path,
        **{
            'dist_' + label: np.concatenate(chunks, dtype=np.float16)
            for label, chunks in collected.items()
        }
    )
    print('done!')

def read_dist(read_path):
    """Load the per-pair distance arrays from an ``.npz`` archive.

    The archive is expected to hold one array per element pair under the keys
    ``dist_HH``, ``dist_CH``, ``dist_NH``, ``dist_OH``, ``dist_CC``,
    ``dist_CN``, ``dist_CO``, ``dist_NN``, ``dist_NO`` and ``dist_OO``.

    Args:
        read_path: Path of the ``.npz`` file to read.

    Returns:
        dict mapping the bare pair label (``'HH'``, ``'CH'``, ...) to the
        corresponding array, in the key order listed above.

    Raises:
        KeyError: If one of the expected ``dist_<PAIR>`` keys is missing.
    """
    pair_labels = ('HH', 'CH', 'NH', 'OH', 'CC', 'CN', 'CO', 'NN', 'NO', 'OO')
    # Arrays are read from the archive when indexed, so pull them all out
    # inside the ``with`` block and let it close the underlying file handle.
    with np.load(read_path) as archive:
        dist = {label: archive['dist_' + label] for label in pair_labels}
    print('done!')
    return dist

In [None]:
# Build the cached distance archive for each dataset. The archives are written
# next to the raw data under Data/<dataset>/ because that is where the loading
# cell looks for them (Data/<dataset>/dist.npz).
calc_dist(read_path='Data/ANI1/data/', save_path='Data/ANI1/dist.npz')
calc_dist(read_path='Data/ANI1x/data/', save_path='Data/ANI1x/dist.npz')
calc_dist(read_path='Data/Transition1x/composition_data/', save_path='Data/Transition1x/dist.npz')
calc_dist(read_path='Data/QM9/data/', save_path='Data/QM9/dist.npz')

In [None]:
# Load the cached per-pair distance dictionaries, one per dataset, in the
# order ANI-1, ANI-1x, Transition1x, QM9.
dist_ani, dist_ani1x, dist_t1x, dist_qm9 = map(
    read_dist,
    (
        'Data/ANI1/dist.npz',
        'Data/ANI1x/dist.npz',
        'Data/Transition1x/dist.npz',
        'Data/QM9/dist.npz',
    ),
)

In [None]:
# Kernel-density estimates of scaled pair distances, ANI-1 vs Transition1x,
# one stacked panel per carbon-containing pair type.
fig, axs = plt.subplots(3, 1, sharex=True, sharey=True, figsize=(4, 3))

# Common evaluation grid for every curve (x axis is R / R_0).
grid = np.linspace(0.98, 1.52, 1000)

# (pair key, divisor R_0, panel title, y label) for each row, top to bottom.
# NOTE(review): the R_0 values look like reference bond lengths for each
# pair type — confirm their source and units.
panels = (
    ('CC', 1.531, 'C–C', ''),
    ('CN', 1.460, 'C–N', 'Atomic pair distribution density'),
    ('CO', 1.421, 'C–O', ''),
)

# (distance dictionary, line colour, line style), drawn in this order so the
# legend labels below line up with the curves.
curves = (
    (dist_ani, 'tab:gray', '--'),
    (dist_t1x, 'tab:red', '-'),
)

for row, (pair, r0, title, ylabel) in enumerate(panels):
    ax = axs[row]
    for dists, color, linestyle in curves:
        # Cast up from the stored dtype before scaling and density estimation.
        data = dists[pair].astype(np.float32) / r0
        ax.plot(grid, stats.gaussian_kde(data)(grid), color=color, linestyle=linestyle)
    ax.set_title(title, x=0.1, y=0.1, fontsize=10)
    if row == 0:
        # Only the top panel carries the legend.
        ax.legend(['ANI-1', 'T1x'], loc='upper right')
    ax.set_ylabel(ylabel)

# Axis limits, scale and the x label are set on the bottom panel; the panels
# were created with sharex/sharey.
bottom = axs[-1]
bottom.set_xlim(0.98, 1.52)
bottom.set_ylim(5e-3, 2e0)
bottom.set_yscale('log')
bottom.set_xlabel('$R / R_0$')
plt.savefig('BondDist.pdf', bbox_inches='tight')
plt.show()