# Transform data


Converts the raw sample datasets to extended XYZ (`.extxyz`) files, so that they are compatible with the CryinGAN repository.

In [None]:
import os
import ase
from ase.io import read, write
import numpy as np
import shutil
from pathlib import Path
%cd ..

from src.load_data import get_descriptors
from src.utils import load_raw_data, read_raw_sample

%cd -

def makedir_if_not_exists(path):
    """Create the directory ``path``, including missing parents, if it is absent.

    Args:
        path: Directory to create, given as a ``str`` or ``pathlib.Path``.

    Returns:
        None. A failed creation is reported with a printed message instead of
        an exception, so callers can continue on a best-effort basis.
    """
    target = Path(path)  # accept plain strings as well as Path objects
    if target.is_dir():
        return
    print("Creating directory {}".format(path))
    try:
        # parents=True creates the whole chain in one call; exist_ok=True keeps
        # the call safe if the directory appears between the check and here.
        target.mkdir(parents=True, exist_ok=True)
    except OSError:
        print("Creation of the directory %s failed" % path)
    return

# Root folder holding the raw samples to convert.
# Alternative dataset: Path("../data/raw/crystal/Sq")
path = Path("../data/raw/samples")


# Packing fractions (phi) to load.
phis = [
    0.70,  # low packing fraction
    0.80,  # mid
    0.84,  # high packing fraction
    0.86,  # very high
    ]

# Load once: a second identical call only repeats the file scan.
files, dataframe, metadata = load_raw_data(path, phi=phis, subpath="")


/Users/veikko/Documents/GitHub/hard-spheres
/Users/veikko/Documents/GitHub/hard-spheres/Notebooks
Loading data from /Users/veikko/Documents/GitHub/hard-spheres/data/raw/samples
Number of Files found :  896
Loading data from /Users/veikko/Documents/GitHub/hard-spheres/data/raw/samples
Number of Files found :  896


In [104]:
# Inspect the particle table returned by load_raw_data in the cell above.
dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,class,x,y,r
experiment,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
phi-0.70,sample-201,3,11.765579,-12.703711,0.823684
phi-0.70,sample-201,3,7.360477,8.344230,0.823684
phi-0.70,sample-201,11,-10.054668,-22.694506,1.198421
phi-0.70,sample-201,3,13.464811,-7.661156,0.823684
phi-0.70,sample-201,6,-11.595127,-13.888290,0.964211
phi-0.70,...,...,...,...,...
phi-0.70,sample-201,2,-4.634730,-6.837372,0.776842
phi-0.70,sample-201,1,-1.425673,-22.556146,0.730000
phi-0.70,sample-201,17,-21.178358,15.246681,1.479474
phi-0.70,sample-201,1,-4.135334,6.317064,0.730000


In [108]:
# NOTE(review): within the notebook as shown, `r` is only assigned inside the
# conversion loop further down (radii of the sample read last), so this cell
# depends on that cell having been run first.
r

Unnamed: 0,r
0,1.245263
1,0.730000
2,0.823684
3,0.823684
4,1.198421
...,...
1995,1.385789
1996,0.730000
1997,0.964211
1998,0.870526


In [140]:
import pandas as pd
from ase.units import Bohr

# Maximum number of samples to convert; np.inf converts every file.
max_files = np.inf # NOTE: Limit the amount of data to speed up training

# Map every raw sample file to the folder that receives its converted output:
# the same path with "raw" replaced by "processed", minus the file name.
input_paths = list(files)
output_paths = {}
for file in input_paths:
    output_folder = Path(str(file).replace("raw", "processed")).parent
    if output_folder == Path(str(file)).parent:
        # If the path contains no "raw", the output folder would be the input
        # folder and the clean-up below would delete the raw data.
        raise ValueError(
            "Cannot derive an output folder for {}: path does not contain 'raw'".format(file)
        )
    output_paths[file] = output_folder

# The outputs are written in append mode, so remove results of earlier runs.
# dict.fromkeys de-duplicates the folders while keeping their original order.
for stale_folder in dict.fromkeys(output_paths.values()):
    # Remove the folder if it exists
    if stale_folder.is_dir():
        print("Removing folder {}".format(stale_folder))
        shutil.rmtree(stale_folder)

for i, file in enumerate(input_paths):
    # Stop before reading once exactly `max_files` samples have been converted.
    if i >= max_files:
        break

    dataframe, metadata = read_raw_sample(file)

    output_folder = output_paths[file]

    output_folder.mkdir(parents=True, exist_ok=True)

    # All samples that share an output folder are appended to the same files.
    output_samples = output_folder / "samples.extxyz"
    output_metadata = output_folder / "metadata.csv"
    radius_file = output_folder / "radius.csv"

    xyz = dataframe[["class", "x", "y"]].reset_index(drop=True)
    r = dataframe[["r"]].reset_index(drop=True)

    N = metadata.iloc[0,0] # N particles

    # The row index carries (experiment, sample) labels such as ("phi-0.70", "sample-201").
    phi, sample = dataframe.index.unique()[0]
    phi_value = float(phi.split("-")[-1])
    xyz["z"] = 0  # the data is two-dimensional; pad a zero z coordinate

    # Create an ASE Atoms object

    L = metadata["L"].iloc[0]

    # NOTE: Radius is not right
    atoms = ase.Atoms(
        numbers=xyz["class"].values,  # the particle class is stored as the atomic number
        positions=xyz[["x", "y", "z"]].values+[L/2, L/2, 0], # NOTE: Displace the system
        cell=[L, L, 0], # NOTE: 2D system
        pbc=[True, True, False], # NOTE: 2D system
        info={"phi": phi_value, "sample": sample, "N": N, "L":L},#, "r": r["r"].values},
    )
    # Per-particle radii travel with the atoms as the extra array "rmt".
    atoms.new_array('rmt', r.values)
    # Save to xyz file
    # NOTE: This is the format used by ASE

    with open(output_samples, "a+") as f:
        write(f, atoms, format="extxyz", append=True)


    # One tab-separated row per sample, no header: phi, sample, N, L.
    pd.DataFrame(
        {
            "phi": [phi_value],
            "sample": [sample],
            "N": [N],
            "L": [L],
        }
    ).to_csv(
        output_metadata,
        header=False,
        index=False,
        sep="\t",
        mode="a+",
    )


# Show the sample that was read last as a quick sanity check.
display(dataframe.head(10))
metadata

Removing folder ../data/processed/samples/phi-0.84
Removing folder ../data/processed/samples/phi-0.70


Unnamed: 0_level_0,Unnamed: 1_level_0,class,x,y,r
experiment,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
phi-0.70,sample-201,3,11.765579,-12.703711,0.823684
phi-0.70,sample-201,3,7.360477,8.34423,0.823684
phi-0.70,sample-201,11,-10.054668,-22.694506,1.198421
phi-0.70,sample-201,3,13.464811,-7.661156,0.823684
phi-0.70,sample-201,6,-11.595127,-13.88829,0.964211
phi-0.70,sample-201,1,-7.155412,3.739408,0.73
phi-0.70,sample-201,1,-10.995767,-7.467527,0.73
phi-0.70,sample-201,2,-6.05125,-12.65186,0.776842
phi-0.70,sample-201,2,17.5157,12.600803,0.776842
phi-0.70,sample-201,1,19.312371,6.903855,0.73


Unnamed: 0_level_0,Unnamed: 1_level_0,N,L,A
experiment,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
phi-0.70,sample-201,2000,48.208116,1626.815709


In [141]:
from ase.visualize import view

# Read the first frame of the converted phi-0.70 file back from disk and
# render it inline to check the conversion.
first_frame_file = "../data/processed/samples/phi-0.70/samples.extxyz"
slab_from_file = read(first_frame_file, format="extxyz", index=0)
view(slab_from_file, viewer="x3d")

In [None]:
# Look up the "phi" entry of the info metadata on the frame read back above.
slab_from_file.info["phi"]

{'phi': 0.7, 'sample': 'sample-245', 'N': 2000, 'L': 48.2081159270186}

In [139]:
# Largest value in the "rmt" array of the frame read back above
# (the conversion cell stores the particle radii under that name).
slab_from_file.get_array("rmt").max()

0.0162

Works: the converted sample can be read back from the `.extxyz` file with ASE.