In [1]:
import numpy as np
import glob

In [2]:
def read_xyz(file_path):
    """
    Reads an .xyz file and returns atom symbols and coordinates.
    
    Args:
        file_path (str): Path to the .xyz file.
    
    Returns:
        tuple: (list of atom symbols, NumPy array of coordinates).
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()
        num_atoms = int(lines[0].strip())
        # Skip the comment line (line 2)
        atoms = []
        coords = []
        for line in lines[2:2 + num_atoms]:
            parts = line.split()
            atoms.append(parts[0])
            coords.append([float(parts[1]), float(parts[2]), float(parts[3])])
        return atoms, np.array(coords)

def center_coordinates(coords):
    """
    Centers the coordinates by subtracting the centroid.
    
    Args:
        coords (np.ndarray): Array of shape (n_atoms, 3) with x, y, z coordinates.
    
    Returns:
        np.ndarray: Centered coordinates.
    """
    centroid = np.mean(coords, axis=0)
    return coords - centroid

def perform_pca(coords):
    """
    Performs PCA on the coordinates to find principal axes.
    
    Args:
        coords (np.ndarray): Centilidade coordinates.
    
    Returns:
        tuple: (eigenvalues, eigenvectors) sorted by eigenvalues in descending order.
    """
    cov_matrix = np.cov(coords.T)
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    # Sort eigenvalues and eigenvectors in descending order
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    return eigenvalues, eigenvectors

def construct_rotation_matrix(eigenvectors):
    """
    Constructs a rotation matrix to align the principal axes with standard axes.
    
    Args:
        eigenvectors (np.ndarray): Matrix with eigenvectors as columns.
    
    Returns:
        np.ndarray: Rotation matrix (transpose of eigenvectors).
    """
    return eigenvectors.T

def rotate_coordinates(coords, rotation_matrix):
    """
    Rotates the coordinates using the rotation matrix.
    
    Args:
        coords (np.ndarray): Centered coordinates.
        rotation_matrix (np.ndarray): Rotation matrix.
    
    Returns:
        np.ndarray: Rotated coordinates.
    """
    return np.dot(coords, rotation_matrix.T)

def write_xyz(file_path, atoms, coords):
    """
    Writes atom symbols and coordinates to a new .xyz file.
    
    Args:
        file_path (str): Path to the output .xyz file.
        atoms (list): List of atom symbols.
        coords (np.ndarray): Array of rotated coordinates.
    """
    with open(file_path, 'w') as f:
        f.write(f"{len(atoms)}\n")
        f.write("Molecule rotated to XY-plane\n")
        for atom, coord in zip(atoms, coords):
            f.write(f"{atom} {coord[0]:.6f} {coord[1]:.6f} {coord[2]:.6f}\n")

def process_file(input_path, output_path):
    """
    Processes a single .xyz file: reads, rotates, and writes the result.
    
    Args:
        input_path (str): Path to the input .xyz file.
        output_path (str): Path to the output .xyz file.
    """
    # Read the file
    atoms, coords = read_xyz(input_path)
    
    # Center the coordinates
    centered_coords = center_coordinates(coords)
    
    # Perform PCA
    eigenvalues, eigenvectors = perform_pca(centered_coords)
    
    # Construct rotation matrix
    rotation_matrix = construct_rotation_matrix(eigenvectors)
    
    # Rotate coordinates
    rotated_coords = rotate_coordinates(centered_coords, rotation_matrix)
    
    # Write to new file
    write_xyz(output_path, atoms, rotated_coords)

def main(directory):
    """
    Processes all .xyz files in the specified directory.
    
    Args:
        directory (str): Path to the directory containing .xyz files.
    """
    for input_path in glob.glob(f"{directory}/*.xyz"):
        output_path = input_path.replace(".xyz", "_rotated.xyz")
        print(f"Processing {input_path} -> {output_path}")
        process_file(input_path, output_path)

if __name__ == "__main__":
    # Replace with the actual path to your directory containing .xyz files
    main("path/to/your/directory")

In [3]:
import numpy as np
import glob
import os

def compute_planarity_ratio(atom_pos):
    """
    Compute the planarity ratio of a molecule based on its atomic positions.
    Returns a value between 0 and 1, where 1 is perfectly planar.
    """
    # Verify atom_pos is a 2D array with shape (n_atoms, 3)
    if len(atom_pos.shape) != 2 or atom_pos.shape[1] != 3:
        raise ValueError("atom_pos must be a 2D array with shape (n_atoms, 3)")
    
    # Center the coordinates around the origin
    centroid = np.mean(atom_pos, axis=0)
    centered_coords = atom_pos - centroid
    
    # Compute the covariance matrix
    cov_matrix = np.cov(centered_coords, rowvar=False)
    
    # Compute eigenvalues of the covariance matrix (ascending order)
    eigenvalues = np.linalg.eigh(cov_matrix)[0]
    
    # Sort eigenvalues in descending order
    eigenvalues = sorted(eigenvalues, reverse=True)
    
    # Compute planarity ratio: (sum of two largest eigenvalues) / total variance
    total_variance = sum(eigenvalues)
    if total_variance == 0:
        return 1.0  # All atoms at the same point (trivially planar)
    planarity_ratio = (eigenvalues[0] + eigenvalues[1]) / total_variance
    return planarity_ratio

def find_planar_molecules(directory, threshold=0.9):
    """
    Identify .npz files in the directory with molecules meeting the planarity threshold.
    Prints the file names that satisfy the condition.
    """
    # Get list of all .npz files in the directory
    npz_files = glob.glob(f"{directory}/*.npz")
    names = []  # To store file names, as in your snippet
    
    print(f"Checking for molecules at least {threshold*100}% planar:")
    count = 0
    for npz_file in npz_files:
        # Append the file name, matching your snippet
        names.append(os.path.basename(npz_file))
        
        # Load the .npz file and extract data, as per your example
        try:
            with np.load(npz_file) as data:
                atom_pos = data['atom_pos']
                # You can also access these if needed later:
                # atomic_numbers = data['atomic_numbers']
                # frequencies = data['frequencies']
                # spectrums = data['spectrums']
            
            # Compute planarity ratio
            ratio = compute_planarity_ratio(atom_pos)
            
            # Check if the molecule meets the planarity threshold
            if ratio >= threshold:
                print(os.path.basename(npz_file))
                count += 1
                
        except Exception as e:
            print(f"Error processing {os.path.basename(npz_file)}: {e}")

    print(f"Found {count} molecules meeting the planarity threshold.")

# Example usage
if __name__ == "__main__":
    # Replace with your directory containing .npz files
    directory = "/scratch/phys/sin/sethih1/data_files/balanced_group"
    
    # Set the planarity threshold (e.g., 0.9 for 90% planar)
    threshold = 1  # Adjust this as needed
    
    find_planar_molecules(directory, threshold)

Checking for molecules at least 100% planar:
24530.npz
123195.npz
142488.npz
115034.npz
945.npz
6350.npz
947.npz
62317.npz
140063.npz
67520.npz
948.npz
24529.npz
Found 12 molecules meeting the planarity threshold.


In [7]:
# Example usage
if __name__ == "__main__":
    # Replace with your directory containing .npz files
    directory = "/scratch/phys/sin/sethih1/data_files/first_group_images"
    
    # Set the planarity threshold (e.g., 0.9 for 90% planar)
    threshold = 1  # Adjust this as needed
    
    find_planar_molecules(directory, threshold)

Checking for molecules at least 100% planar:
160438_0.npz
104752_0.npz
62317_0.npz
123083_0.npz
11199_0.npz
6325_0.npz
24602_0.npz
Found 7 molecules meeting the planarity threshold.


In [8]:
# Example usage
if __name__ == "__main__":
    # Replace with your directory containing .npz files
    directory = "/scratch/phys/sin/sethih1/data_files/second_group_images"
    
    # Set the planarity threshold (e.g., 0.9 for 90% planar)
    threshold = 1  # Adjust this as needed
    
    find_planar_molecules(directory, threshold)

Checking for molecules at least 100% planar:
947_0.npz
280_0.npz
281_0.npz
67520_0.npz
123178_0.npz
24529_0.npz
139859_0.npz
138112_0.npz
783_0.npz
139505_0.npz
119388_0.npz
24823_0.npz
142488_0.npz
115034_0.npz
123195_0.npz
105142_0.npz
137677_0.npz
540_0.npz
6350_0.npz
948_0.npz
945_0.npz
136069_0.npz
140063_0.npz
768_0.npz
138825_0.npz
24530_0.npz
137676_0.npz
962_0.npz
9999_0.npz
Found 29 molecules meeting the planarity threshold.


In [9]:
# Example usage
if __name__ == "__main__":
    # Replace with your directory containing .npz files
    directory = "/scratch/phys/sin/sethih1/data_files/all_group_images_new"
    
    # Set the planarity threshold (e.g., 0.9 for 90% planar)
    threshold = 0.999  # Adjust this as needed
    
    find_planar_molecules(directory, threshold)

Checking for molecules at least 99.9% planar:
764.npz
33776.npz
19.npz
78847.npz
192303.npz
19914.npz
10435.npz
191078.npz
15875.npz
137676.npz
6846.npz
1491.npz
9222.npz
456.npz
7971.npz
95303.npz
15863.npz
27492.npz
92970.npz
68401.npz
67126.npz
69318.npz
136679.npz
96194.npz
10366.npz
151097.npz
68442.npz
4649.npz
214615.npz
133922.npz
237859.npz
9338.npz
7492.npz
14101.npz
96389.npz
137009.npz
123419.npz
126347.npz
153669.npz
72914.npz
8784.npz
202937.npz
11473.npz
141184.npz
77987.npz
88025.npz
1456.npz
24530.npz
66921.npz
9231.npz
14526.npz
119218.npz
12087.npz
74115.npz
6275.npz
164592.npz
214301.npz
162804.npz
249592.npz
230976.npz
247394.npz
88085.npz
1057.npz
187896.npz
10268.npz
83191.npz
21119.npz
84684.npz
6998.npz
247543.npz
162542.npz
219401.npz
220188.npz
209813.npz
169306.npz
11089.npz
9226.npz
26130.npz
136601.npz
90070.npz
87759.npz
24955.npz
68514.npz
20467.npz
79470.npz
228506.npz
69257.npz
72924.npz
202767.npz
124218.npz
110064.npz
139859.npz
72926.npz
72766.npz
9

In [6]:
import numpy as np
import glob
import os
import shutil  # For copying files

def compute_planarity_ratio(atom_pos):
    """
    Compute the planarity ratio of a molecule based on its atomic positions.
    Returns a value between 0 and 1, where 1 is perfectly planar.
    """
    # Verify atom_pos is a 2D array with shape (n_atoms, 3)
    if len(atom_pos.shape) != 2 or atom_pos.shape[1] != 3:
        raise ValueError("atom_pos must be a 2D array with shape (n_atoms, 3)")
    
    # Center the coordinates around the origin
    centroid = np.mean(atom_pos, axis=0)
    centered_coords = atom_pos - centroid
    
    # Compute the covariance matrix
    cov_matrix = np.cov(centered_coords, rowvar=False)
    
    # Compute eigenvalues of the covariance matrix (ascending order)
    eigenvalues = np.linalg.eigh(cov_matrix)[0]
    
    # Sort eigenvalues in descending order
    eigenvalues = sorted(eigenvalues, reverse=True)
    
    # Compute planarity ratio: (sum of two largest eigenvalues) / total variance
    total_variance = sum(eigenvalues)
    if total_variance == 0:
        return 1.0  # All atoms at the same point (trivially planar)
    planarity_ratio = (eigenvalues[0] + eigenvalues[1]) / total_variance
    return planarity_ratio

def copy_planar_molecules(source_directory, dest_directory, threshold=0.9):
    """
    Identify .npz files in the source_directory with molecules meeting the planarity threshold,
    and copy them to the dest_directory.
    """
    # Create the destination directory if it doesn't exist
    os.makedirs(dest_directory, exist_ok=True)
    
    # Get list of all .npz files in the source directory
    npz_files = glob.glob(os.path.join(source_directory, "*.npz"))
    
    print(f"Copying molecules at least {threshold*100}% planar from {source_directory} to {dest_directory}:")
    count = 0
    for npz_file in npz_files:
        try:
            with np.load(npz_file) as data:
                atom_pos = data['atom_pos']
            
            # Compute planarity ratio
            ratio = compute_planarity_ratio(atom_pos)
            
            # Check if the molecule meets the planarity threshold
            if ratio >= threshold:
                dest_path = os.path.join(dest_directory, os.path.basename(npz_file))
                shutil.copy2(npz_file, dest_path)
                print(f"Copied: {os.path.basename(npz_file)} (ratio: {ratio:.3f})")
                count += 1
                
        except Exception as e:
            print(f"Error processing {os.path.basename(npz_file)}: {e}")

    print(f"Copied {count} molecules meeting the planarity threshold.")

# Example usage
if __name__ == "__main__":
    # Replace these with your directories
    source_directory = "/scratch/phys/sin/sethih1/data_files/second_group_images_256"
    dest_directory = "/scratch/phys/sin/sethih1/data_files/planar_molecules_256"
    
    # Set the planarity threshold (e.g., 0.999 for 99.9% planar)
    threshold = 0.999  # Adjust this as needed
    
    copy_planar_molecules(source_directory, dest_directory, threshold)


Copying molecules at least 99.9% planar from /scratch/phys/sin/sethih1/data_files/second_group_images_256 to /scratch/phys/sin/sethih1/data_files/planar_molecules_256:
Copied: 764.npz (ratio: 1.000)
Copied: 33776.npz (ratio: 1.000)
Copied: 19.npz (ratio: 1.000)
Copied: 78847.npz (ratio: 1.000)
Copied: 10435.npz (ratio: 1.000)
Copied: 137676.npz (ratio: 1.000)
Copied: 1491.npz (ratio: 1.000)
Copied: 9222.npz (ratio: 1.000)
Copied: 7971.npz (ratio: 1.000)
Copied: 95303.npz (ratio: 1.000)
Copied: 15863.npz (ratio: 1.000)
Copied: 92970.npz (ratio: 1.000)
Copied: 67126.npz (ratio: 1.000)
Copied: 69318.npz (ratio: 1.000)
Copied: 136679.npz (ratio: 1.000)
Copied: 96194.npz (ratio: 1.000)
Copied: 151097.npz (ratio: 1.000)
Copied: 68442.npz (ratio: 1.000)
Copied: 133922.npz (ratio: 1.000)
Copied: 237859.npz (ratio: 0.999)
Copied: 9338.npz (ratio: 1.000)
Copied: 14101.npz (ratio: 1.000)
Copied: 96389.npz (ratio: 1.000)
Copied: 137009.npz (ratio: 1.000)
Copied: 123419.npz (ratio: 1.000)
Copied: 1