In [1]:
from ase.io import read, write
import ase
import numpy as np
import chemiscope
import random
import matplotlib.pyplot as plt

from tqdm import tqdm

In [2]:
# Load the first 50 000 frames of the input trajectory into memory.
frames = read("/home/chong/water.xyz", index=":50000")

In [3]:

# Coarse-grain every frame: each molecule (a run of `atoms_per_molecule`
# consecutive atoms) is replaced by one bead located at the molecule's centre
# of mass and carrying the sum of the atomic forces.
cg_frames = []
num_water_per_cell = 128
atoms_per_molecule = 3
# mol_idx[i]:mol_idx[i + 1] is the atom-index range of molecule i
mol_idx = np.arange(num_water_per_cell + 1) * atoms_per_molecule

for f in tqdm(frames):
    cg_frame_pos = []
    cg_frame_forces = []

    # Geometric centre of the cell: half the sum of the three cell vectors.
    # For an axis-aligned orthorhombic cell this equals half the cell lengths,
    # but unlike the cell lengths it is also correct for tilted cells.
    # It is constant for the whole frame, so compute it once per frame.
    cell_center = np.sum(f.cell.array, axis=0) / 2

    for i in range(num_water_per_cell):

        # ase.Atoms object holding a single molecule (keeps cell and pbc of f)
        cur_mol = f[mol_idx[i]:mol_idx[i + 1]]

        # Shift the molecule so that its first atom sits at the cell centre,
        # then wrap. This way a molecule whose atoms were stored on opposite
        # sides of a periodic boundary is made whole before the centre of mass
        # is taken.
        # assumes the first atom of each molecule is the oxygen - TODO confirm
        O_center_vec = cell_center - cur_mol.positions[0]
        cur_mol.positions += O_center_vec
        cur_mol.wrap()

        # Centre of mass of the re-assembled molecule, shifted back to the
        # molecule's original location.
        cg_frame_pos.append(
            cur_mol.get_center_of_mass() - O_center_vec
        )

        # Net force on the bead = sum of the atomic forces
        # (flagged as noisy in the original notes).
        cg_frame_forces.append(
            np.sum(cur_mol.arrays['forces'], axis=0)
        )
    cg_frame_pos = np.vstack(cg_frame_pos)
    cg_frame_forces = np.vstack(cg_frame_forces)

    # Save the CG frame: one "X" bead per molecule, same cell and pbc as the
    # source frame. The formula is derived from num_water_per_cell so the two
    # can never disagree.
    cur_cg_frame = ase.Atoms(
            f"X{num_water_per_cell}",
            positions = cg_frame_pos,
            cell = f.cell,
            pbc = f.pbc,
    )
    cur_cg_frame.arrays['forces'] = cg_frame_forces
    cg_frames.append(cur_cg_frame)

100%|██████████| 50000/50000 [13:34<00:00, 61.39it/s]


In [5]:
# Persist all coarse-grained frames as a single trajectory file.
write("/home/chong/CG_water.xyz", images=cg_frames)

In [None]:
# Reload the coarse-grained trajectory from disk so this cell does not depend
# on the (slow) coarse-graining cell having been run in the current kernel.
frames = read("/home/chong/CG_water.xyz", ":")

# Shuffle a fresh copy of the reloaded frames. Starting from the file order
# every time makes the cell idempotent: re-running it yields the same
# permutation instead of re-permuting an already shuffled list.
# `frames` itself keeps the on-disk order.
cg_frames = list(frames)
random.seed(20240406)
random.shuffle(cg_frames)

In [6]:
# Write the training subsets and the validation / test sets from the shuffled
# frame list. For every training-set size, `len(subs)` disjoint subsets are
# taken from the start of the list: subset `sub` covers
# cg_frames[sub * size:(sub + 1) * size].
subs = np.arange(4)

# (label used in the file name, number of frames per subset)
train_sizes = [("50", 50), ("100", 100), ("1k", 1000), ("10k", 10000)]
holdout_size = 1000

# Validation and test frames start right after the largest training block
# (4 * 10000 = 40000), so they never overlap with any training subset.
holdout_start = len(subs) * max(size for _, size in train_sizes)

# Slicing past the end of the list would silently write short or empty files.
assert len(cg_frames) >= holdout_start + 2 * holdout_size, (
    f"need at least {holdout_start + 2 * holdout_size} frames, "
    f"got {len(cg_frames)}"
)

for label, size in train_sizes:
    for sub in subs:
        write(
            f"/home/chong/CG_water_train_{label}_{sub}.xyz",
            cg_frames[sub * size:(sub + 1) * size],
        )

write(
    "/home/chong/CG_water_val_1k.xyz",
    cg_frames[holdout_start:holdout_start + holdout_size],
)
write(
    "/home/chong/CG_water_test_1k.xyz",
    cg_frames[holdout_start + holdout_size:holdout_start + 2 * holdout_size],
)

In [None]:
# Stack the per-frame force arrays once; both the histogram and the RMS below
# are computed from this array.
all_forces = np.array([f.arrays['forces'] for f in frames])

# Histogram window: hist_count frames starting at hist_start. If the requested
# start lies beyond the loaded trajectory the slice would be empty and the
# plot blank, so fall back to the last hist_count frames instead.
hist_start, hist_count = 200000, 5000
if hist_start >= len(all_forces):
    hist_start = max(len(all_forces) - hist_count, 0)

plt.hist(all_forces[hist_start:hist_start + hist_count].flatten(), bins=200)
plt.xlabel("force component")
plt.ylabel("count");

# Root-mean-square of all force components over every frame (cell output).
np.sqrt(np.mean(np.power(all_forces.flatten(), 2)))