In [1]:
import os
import random
import operator
from typing import List, Union

# Import necessary ASE modules
from ase import Atoms
from ase.io import read, write

In [None]:
def process_all_traj_files(
    target_directory: str = ".",
    num_snapshots_to_select: int = 100,
    output_subdir_name: str = "subsets",
    seed: Union[int, None] = None,
) -> None:
    """
    Write a random subset of snapshots for every .traj file in a directory.

    Each regular file in ``target_directory`` whose name ends in ``.traj`` is
    loaded in full, a random sample of its snapshots (without replacement) is
    drawn, and the sample is written to
    ``<target_directory>/<output_subdir_name>/<original name>_subset<ext>``.
    Only the top level of ``target_directory`` is scanned, so files already
    written to the output subdirectory are never picked up as inputs.

    Files are processed in sorted name order so that, together with ``seed``,
    a run is reproducible.

    Parameters:
        target_directory (str): The directory to scan for .traj files.
                                Defaults to the current directory.
        num_snapshots_to_select (int): The maximum number of random snapshots
                                       to select for each subset. If a trajectory
                                       has fewer snapshots, all available snapshots
                                       are selected (in random order). Must be
                                       non-negative.
        output_subdir_name (str): The name of the subdirectory within target_directory
                                  where the subset .traj files will be saved.
        seed (int | None): Seed for a private random generator. When None
                           (the default), the global ``random`` module state
                           is used, exactly as before.

    Returns:
        None. Progress and per-file errors are reported with ``print``; a
        failure on one file does not stop the remaining files.
    """
    # Reject a negative request once, up front, instead of letting
    # random.sample fail separately for every file.
    if num_snapshots_to_select < 0:
        print(
            "Error: num_snapshots_to_select must be non-negative, "
            f"got {num_snapshots_to_select}."
        )
        return

    print(f"Scanning directory: '{target_directory}' for .traj files...")

    # Construct the full path for the output subdirectory
    full_output_directory: str = os.path.join(target_directory, output_subdir_name)
    os.makedirs(full_output_directory, exist_ok=True)
    print(f"Output subsets will be saved in: '{full_output_directory}'")

    # Keep regular files ending in .traj; sort because os.listdir order is arbitrary.
    traj_files: List[str] = sorted(
        f for f in os.listdir(target_directory)
        if f.endswith(".traj") and os.path.isfile(os.path.join(target_directory, f))
    )

    if not traj_files:
        print(f"No .traj files found in '{target_directory}'.")
        return

    # A private generator keeps a seeded run from disturbing the global random state.
    rng = random.Random(seed) if seed is not None else random

    for traj_file_name in traj_files:
        input_filepath: str = os.path.join(target_directory, traj_file_name)

        try:
            # 1. Load the entire trajectory from the current input file
            all_data: List[Atoms] = read(input_filepath, index=':')
            total_snapshots: int = len(all_data)

            if total_snapshots == 0:
                print(f"Skipping '{input_filepath}': Contains no snapshots.")
                continue  # Move to the next file

            print(f"\nProcessing '{input_filepath}' (Total snapshots: {total_snapshots}).")

            # 2. Never ask for more snapshots than the file holds
            actual_num_to_sample: int = min(num_snapshots_to_select, total_snapshots)

            if actual_num_to_sample == 0:
                print(f"Warning: No snapshots to select from '{input_filepath}'.")
                continue  # Move to the next file

            # 3. Draw unique indices; the subset keeps this random order
            random_indices: List[int] = rng.sample(range(total_snapshots), actual_num_to_sample)

            # 4. Plain indexing works for any sample size, including one
            subset_data: List[Atoms] = [all_data[i] for i in random_indices]

            print(f"  Selected {len(subset_data)} random snapshots.")

            # 5. Construct the output filename
            name_without_ext, ext = os.path.splitext(traj_file_name)
            output_filename: str = f"{name_without_ext}_subset{ext}"
            output_filepath: str = os.path.join(full_output_directory, output_filename)

            # 6. Save the subset data to the new .traj file
            write(output_filepath, subset_data)
            print(f"  Subset saved to '{output_filepath}'.")

        except FileNotFoundError:
            print(f"Error: Input file '{input_filepath}' not found during processing.")
        except Exception as e:
            # Deliberately best-effort: report the problem and move on to the next file.
            print(f"An unexpected error occurred while processing '{input_filepath}': {e}")

    print("\nAll .traj files processed.")

In [None]:
# Sample at most 1000 random snapshots from every .traj file in the train split.
# NOTE(review): the output recorded beneath this cell refers to a different
# directory ('test_single') and 100 snapshots, so it predates this call; re-run
# the cell to refresh it.
process_all_traj_files(
    target_directory="databases/length_generalization_small/train",
    num_snapshots_to_select=1000,
    output_subdir_name="subset_results_simple",
)


Scanning directory: 'databases/length_generalization_small/test_single' for .traj files...
Output subsets will be saved in: 'databases/length_generalization_small/test_single/subset_results_simple'

Processing 'databases/length_generalization_small/test_single/CCCCCCCC_total.traj' (Total snapshots: 11001).
  Selected 100 random snapshots.
  Subset saved to 'databases/length_generalization_small/test_single/subset_results_simple/CCCCCCCC_total_subset.traj'.

All .traj files processed.


In [2]:
from gpaw import GPAW

from ase import Atoms
from ase.io import Trajectory

# Scan the separation of the two atoms in N2 along z, computing the potential
# energy and forces with GPAW at each separation and saving every
# configuration to 'binding_curve.traj'.
atoms = Atoms('N2', positions=[[0, 0, -1], [0, 0, 1]])
atoms.center(vacuum=3.0)

# LCAO-mode GPAW calculator with a dzp basis; its text log goes to 'gpaw.txt'.
calc = GPAW(mode='lcao', basis='dzp', txt='gpaw.txt')
atoms.calc = calc

step = 0.05
# round() rather than int(): float division can land just below a whole
# number, and truncation would then silently drop the last step.
nsteps = round(3 / step)

# The context manager closes the trajectory file even if a calculation raises,
# so the snapshots written so far are not left in an unclosed file.
with Trajectory('binding_curve.traj', 'w') as traj:
    for i in range(nsteps):
        # Separations start at 0.5 and grow by `step` each iteration.
        d = 0.5 + i * step
        # Place atom 1 at distance d from atom 0 along z, then re-centre the
        # pair in the cell with the same vacuum as the initial setup.
        atoms.positions[1, 2] = atoms.positions[0, 2] + d
        atoms.center(vacuum=3.0)
        e = atoms.get_potential_energy()
        f = atoms.get_forces()
        print('distance, energy', d, e)
        print('force', f)
        traj.write(atoms)

ModuleNotFoundError: No module named 'gpaw'

In [3]:
from ase.build import molecule
# Build the 'H2O' structure with ase.build.molecule, passing vacuum=3.0.
# Note: this rebinds the notebook-level name `atoms`.
atoms = molecule(
    'H2O',
    vacuum=3.0,
)

In [None]:
from ase.collections import g2
from ase.visualize import view

# Print the molecule names available in the g2 collection.
print(g2.names)

# Look up a single entry of the collection by its name.
atoms = g2['CH3CH2OH']

# Open a viewer for that entry, then one for the whole collection (162 systems).
# NOTE(review): in the output recorded for this cell both view() calls failed
# with "TclError: no display name and no $DISPLAY environment variable", i.e.
# the default viewer could not start in the environment where this was run.
view(atoms)
view(g2)

['PH3', 'P2', 'CH3CHO', 'H2COH', 'CS', 'OCHCHO', 'C3H9C', 'CH3COF', 'CH3CH2OCH3', 'HCOOH', 'HCCl3', 'HOCl', 'H2', 'SH2', 'C2H2', 'C4H4NH', 'CH3SCH3', 'SiH2_s3B1d', 'CH3SH', 'CH3CO', 'CO', 'ClF3', 'SiH4', 'C2H6CHOH', 'CH2NHCH2', 'isobutene', 'HCO', 'bicyclobutane', 'LiF', 'Si', 'C2H6', 'CN', 'ClNO', 'S', 'SiF4', 'H3CNH2', 'methylenecyclopropane', 'CH3CH2OH', 'F', 'NaCl', 'CH3Cl', 'CH3SiH3', 'AlF3', 'C2H3', 'ClF', 'PF3', 'PH2', 'CH3CN', 'cyclobutene', 'CH3ONO', 'SiH3', 'C3H6_D3h', 'CO2', 'NO', 'trans-butane', 'H2CCHCl', 'LiH', 'NH2', 'CH', 'CH2OCH2', 'C6H6', 'CH3CONH2', 'cyclobutane', 'H2CCHCN', 'butadiene', 'C', 'H2CO', 'CH3COOH', 'HCF3', 'CH3S', 'CS2', 'SiH2_s1A1d', 'C4H4S', 'N2H4', 'OH', 'CH3OCH3', 'C5H5N', 'H2O', 'HCl', 'CH2_s1A1d', 'CH3CH2SH', 'CH3NO2', 'Cl', 'Be', 'BCl3', 'C4H4O', 'Al', 'CH3O', 'CH3OH', 'C3H7Cl', 'isobutane', 'Na', 'CCl4', 'CH3CH2O', 'H2CCHF', 'C3H7', 'CH3', 'O3', 'P', 'C2H4', 'NCCN', 'S2', 'AlCl3', 'SiCl4', 'SiO', 'C3H4_D2d', 'H', 'COF2', '2-butyne', 'C2H5', 'BF3'

<Popen: returncode: None args: ['/home/amir/miniconda3/envs/KRMDT/bin/python...>

usage: ase [-h] [--version] [-T]
           {help,info,test,gui,db,run,band-structure,build,dimensionality,eos,ulm,find,nebplot,nomad-upload,nomad-get,convert,reciprocal,completion,diff,exec}
           ...
ase: error: TclError: no display name and no $DISPLAY environment variable
To get a full traceback, use: ase -T gui ...
usage: ase [-h] [--version] [-T]
           {help,info,test,gui,db,run,band-structure,build,dimensionality,eos,ulm,find,nebplot,nomad-upload,nomad-get,convert,reciprocal,completion,diff,exec}
           ...
ase: error: TclError: no display name and no $DISPLAY environment variable
To get a full traceback, use: ase -T gui ...
