Building a small dataset from qm7 (the code below takes the first 5 samples) to test whether the atom-centred symmetry functions work with it.

In [1]:
from aglaia import aglaia
import qml
import os
import numpy as np
import joblib

  from ._conv import register_converters as _register_converters


In [2]:
def list_files(dir, key):
    """
    This function walks through a directory (including all of its sub-directories) and makes a list of the files
    that have a name containing a particular string
    :dir: path to the directory to explore
    :key: string to look for in file names
    :return: list of paths (directory + "/" + file name) of the files containing "key" in their filename;
             empty if nothing matches or if "dir" cannot be walked
    """

    matches = []
    # A single os.walk pass already yields every directory together with the files it contains.
    # Re-walking each sub-directory a second time doubles the filesystem work and fails with
    # StopIteration if a directory disappears between the two passes.
    for subdir, _dirnames, filenames in os.walk(dir):
        matches.extend(subdir + "/" + name for name in filenames if key in name)
    return matches

In [3]:
# Find every .xyz file under the qm7 data directory and load the first five as qml Compounds
filenames = list_files("/Volumes/Transcend/repositories/Aglaia/data/qm7", ".xyz")
compounds = [qml.Compound(xyz=xyz_path) for xyz_path in filenames[:5]]

In [4]:
# Nuclear charges of each loaded compound, from which qml derives the SLATM many-body types
charges_per_compound = [mol.nuclear_charges for mol in compounds]
mbtypes = qml.representations.get_slatm_mbtypes(charges_per_compound)

In [5]:
# Splitting the one and two body interactions in mbtypes
elements, element_pairs = [], []

for mbtype in mbtypes:
    n_bodies = len(mbtype)
    if n_bodies == 3:
        # Stop at the first three body entry: nothing after it is collected
        break
    if n_bodies == 1:
        elements.append(mbtype[0])
    elif n_bodies == 2:
        # Need the element pairs in descending order for TF, so each pair is stored reversed
        element_pairs.append(list(mbtype)[::-1])

In [6]:
# Coordinates and nuclear charges of every compound, kept in the same order as `compounds`
xyzs = [compound.coordinates for compound in compounds]
zs = [compound.nuclear_charges for compound in compounds]

# Number of atoms in the largest compound (stays 0 when there are no compounds)
max_n_atoms = max([0] + [len(charges) for charges in zs])

In [7]:
# Zero-pad every sample so that all of them have max_n_atoms entries
n_samples = len(zs)
for idx, (charges, coords) in enumerate(zip(zs, xyzs)):
    n_pad = max_n_atoms - len(charges)
    # Padded atoms get nuclear charge 0 and coordinates (0, 0, 0)
    zs[idx] = np.concatenate((charges, np.zeros(n_pad)))
    xyzs[idx] = np.concatenate((coords, np.zeros((n_pad, 3))))

In [8]:
# Turn the padded per-sample lists into arrays: float32 coordinates and int32 nuclear charges
xyzs = np.array(xyzs, dtype=np.float32)
zs = np.array(zs, dtype=np.int32)

In [9]:
# Write the dataset to disk. The arrays are passed positionally, so they are stored
# in this order: xyzs, zs, elements, element_pairs
output_path = "qm7_testdata.npz"
np.savez(output_path, xyzs, zs, elements, element_pairs)