# Transform data


Transforms the raw datasets into the extended XYZ (`.extxyz`) format, so that they are compatible with the CryinGAN repository.

In [None]:
import os
import ase
from ase.io import read, write
import numpy as np
import shutil
from pathlib import Path
from tqdm import tqdm

%cd ..
from src.utils import load_raw_data, read_raw_sample

from CCGAN.tools import BatchDistance2D, BatchDistance

%cd -

import torch

In [None]:
def makedir_if_not_exists(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, so nested targets work
    in a single call. The function is best-effort: if the directory cannot
    be created, a message is printed and no exception is raised.

    Args:
        path: Directory to create, given as a ``str`` or a path-like object.

    Returns:
        None.
    """
    if os.path.isdir(path):
        return

    print("Creating directory {}".format(path))
    try:
        # makedirs builds every missing level; exist_ok guards against the
        # directory appearing between the check above and this call.
        os.makedirs(path, exist_ok=True)
    except OSError:
        print("Creation of the directory %s failed" % path)
    return

# Root folder of the raw samples that are converted below.
# NOTE: this cell previously also assigned Path("../data/raw/crystal/Sq") and
# immediately overwrote it; presumably that is an alternative dataset. Point
# `path` at it instead if that data should be converted.
path = Path("../data/raw/samples")


# Packing fractions (phi) to load.
phis = [
    0.70,
    0.72,
    0.74,
    0.76,
    0.78,
    0.80,
    # 0.81, # Left out on purpose to enable testing interpolation
    0.82,
    0.83,
    0.84,
    0.85,
    0.86,
    # 0.8625 # Left out on purpose to enable testing extrapolation
    ]

# Reference points within the list above:
#    1. Low packing fraction 0.70
#    2. Mid 0.78
#    3. High packing fraction 0.84
#    4. Very high 0.86

# `files` is iterated by the conversion cell; `dataframe` and `metadata` are
# rebound there for every sample that is read.
files, dataframe, metadata = load_raw_data(path, phi=phis, subpath="")

In [None]:
import pandas as pd

# Divisor applied to the positions, the radii and the box length before they
# are written. With the value 1 the data is written unscaled. The largest
# distance in the system is a bit over 47, so a factor of 50 would scale
# everything down to <1.
SCALING_FACTOR = 1

max_files = np.inf # NOTE: Limit the amount of data to speed up training

input_paths = list(files)
# Every raw file is written to the mirrored folder in which "raw" is replaced
# by "processed" in the path string.
output_paths = {
    raw_file: Path(str(raw_file).replace("raw", "processed")).parent
    for raw_file in input_paths
}

# The writers below only ever append, so stale output has to be removed first.
# Several files can share one output folder; the is_dir() check makes the
# repeated visits harmless.
for path in output_paths.values():
    if path.is_dir():
        print("Removing folder {}".format(path))
        shutil.rmtree(path)


def _build_atoms(xyz, columns, radii, box_length, info):
    """Build an ASE Atoms object for one 2D sample.

    Args:
        xyz: DataFrame with the columns "class", "x", "y" and "z".
        columns: The three coordinate columns, in the order in which they are
            used as positions (swapping "x" and "y" yields the flipped copy).
        radii: DataFrame holding the (already scaled) radii, stored in the
            per-atom array "rmt".
        box_length: Edge length of the square cell.
        info: Metadata stored in ``Atoms.info``.

    Returns:
        The constructed ``ase.Atoms`` object.
    """
    atoms = ase.Atoms(
        numbers=xyz["class"].values,
        # Shift by half a box length in x and y.
        positions=(xyz[columns].values / SCALING_FACTOR)
        + [box_length / 2, box_length / 2, 0],
        cell=[box_length, box_length, 0], # NOTE: 2D system
        pbc=[True, True, False], # NOTE: 2D system
        info=dict(info),
    )
    atoms.new_array('rmt', radii.values)
    return atoms


for i, file in tqdm(enumerate(files), total=len(files)):
    # Stop once exactly `max_files` files have been converted.
    if i >= max_files:
        break

    dataframe, metadata = read_raw_sample(file)
    dataframe = dataframe.sort_values(by=["r"], ascending=True)
    # NOTE: This is the order in which the particles are added to the system
    # NOTE: The samples should always be in the same order

    output_folder = output_paths[file]

    output_folder.mkdir(parents=True, exist_ok=True)

    output_samples = output_folder / "samples.extxyz"
    output_metadata = output_folder / "metadata.csv"
    radius_file = output_folder / "radius.csv" # NOTE: currently not written

    xyz = dataframe[["class", "x", "y"]].reset_index(drop=True)
    r = dataframe[["r"]].reset_index(drop=True) / SCALING_FACTOR

    N = metadata.iloc[0,0] # N particles

    # The first index entry is unpacked into the phi label and the sample id;
    # the numeric phi is the part after the last "-" of the label.
    phi, sample = dataframe.index.unique()[0]
    phi_value = float(phi.split("-")[-1])
    xyz["z"] = 0

    L = metadata["L"].iloc[0] / SCALING_FACTOR

    info = {
        "phi": phi_value,
        "sample": sample,
        "N": N,
        "L": L,
        "SCALING_FACTOR": SCALING_FACTOR,
    }

    # NOTE: Radius is not right (original author's note; the radii are kept
    # separately in the 'rmt' array).
    atoms = _build_atoms(xyz, ["x", "y", "z"], r, L, info)
    # NOTE: Flip - same sample with the x and y coordinates swapped.
    atoms_flipped = _build_atoms(xyz, ["y", "x", "z"], r, L, info)

    # Save to xyz file
    # NOTE: This is the format used by ASE
    with open(output_samples, "a+") as f:
        write(f, atoms, format="extxyz", append=True)
        write(f, atoms_flipped, format="extxyz", append=True)

    # NOTE(review): one metadata row is written per raw file, while two frames
    # (original and flipped) are appended to samples.extxyz - confirm that
    # consumers of metadata.csv expect this.
    pd.DataFrame(
        {
            "phi": [phi_value],
            "sample": [sample],
            "N": [N],
            "L": [L],
        }
    ).to_csv(
        output_metadata,
        header=False,
        index=False,
        sep="\t",
        mode="a+",
    )


# Show the last sample that was read, as a quick sanity check.
display(dataframe.head(10))
metadata

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Packing fraction whose processed samples are inspected in this cell.
PHI = "0.70"

# The path below uses phi formatted with two decimals.
padded_phi = f"{float(PHI):.2f}"

file = f"../data/processed/samples/phi-{padded_phi}/samples.extxyz"

print("Reading file: ", file)

# index=":" loads every frame stored in the file.
all_atoms = read(file, index=":", format="extxyz")
idxs = [0,5,19]

# One panel per selected frame, stacked vertically.
n_panels = len(idxs)
fig = plt.figure(figsize=(7, 7 * n_panels))
gs = gridspec.GridSpec(n_panels, 1, hspace=0.3)

for i, idx in enumerate(idxs):
    frame = all_atoms[idx]
    np_coords = frame.get_positions()
    radii = frame.get_array("rmt")

    ax_gen = fig.add_subplot(gs[i])
    # Colour and marker size both follow the stored radius.
    x_values, y_values = np_coords[:, 0], np_coords[:, 1]
    ax_gen.scatter(x_values, y_values, c=radii, s=radii * 20, alpha=0.5)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Packing fraction whose processed samples are inspected in this cell.
PHI = "0.86"

# The path below uses phi formatted with two decimals.
padded_phi = f"{float(PHI):.2f}"

file = f"../data/processed/samples/phi-{padded_phi}/samples.extxyz"

print("Reading file: ", file)

# index=":" loads every frame stored in the file.
all_atoms = read(file, index=":", format="extxyz")
idxs = [0,5,19]

# One panel per selected frame, stacked vertically.
n_panels = len(idxs)
fig = plt.figure(figsize=(7, 7 * n_panels))
gs = gridspec.GridSpec(n_panels, 1, hspace=0.3)

for i, idx in enumerate(idxs):
    frame = all_atoms[idx]
    np_coords = frame.get_positions()
    radii = frame.get_array("rmt")

    ax_gen = fig.add_subplot(gs[i])
    # Colour and marker size both follow the stored radius.
    x_values, y_values = np_coords[:, 0], np_coords[:, 1]
    ax_gen.scatter(x_values, y_values, c=radii, s=radii * 20, alpha=0.5)

In [None]:
# Check the ASE Atoms object to make sure the cell sizes are correct
from ase.visualize import view

# Inspect the first frame of the most recently read file.
atoms = all_atoms[0]

# Shown as the cell output: the simulation cell next to the largest coordinate
# value, so the two can be compared by eye.
cell, max_coordinate = atoms.get_cell(), atoms.get_positions().max()
cell, max_coordinate


Works

# Make sure BatchDistance2D works too

## Minimal smoke test: the appended nearest-neighbor distance should be 0.5 for both atoms

In [None]:
import torch
from torch.utils.data import DataLoader

# Identity lattice matrix, i.e. a square cell of side 1.0.
lat = torch.eye(3)

# Simple 2-atom motif: the atoms sit half a cell apart along x.
motif = [
    [0.25, 0.25, 0.0],
    [0.75, 0.25, 0.0],
]
coords = torch.tensor(motif).reshape(1, 1, 2, 3)  # shape (1,1,2,3)

ds2d = BatchDistance2D(coords, n_neighbors=4, lat_matrix=lat)
out = ds2d.append_dist()
# Expected (per the original note): out has shape (1,1,2,3+1), where the last
# column is the nearest-neighbor distance.
print(out)
# That distance should be 0.50 exactly (half the cell in x), for both atoms.
