## Amino-acid polarization energy estimation based on fitted atomic polarizabilities

### Convert XYZ files with background charges into EF files containing total, parallel, and perpendicular electric-field descriptors

In [2]:
import numpy as np
import math
import os
from tqdm import tqdm


# Per-residue connectivity tables, keyed by three-letter residue name
# (plus the ACE / NME capping groups).
#
# Each value is a list of [a1, a2] pairs of 1-based atom indices, numbered in
# the order the atoms appear in the residue's .xyz file.  For every pair,
# process_xyz_to_ef evaluates the field at atom a1 and uses the unit vector
# pointing from a1 to a2 as the reference direction for the
# parallel/perpendicular decomposition.
#
# NOTE(review): the indices must match the atom ordering of the .xyz files
# being processed; nothing in this file verifies that.
AMINO_CONNECT = {
    "ALA": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,3],[10,9],[11,1],[12,9]],
    "ARG": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,8],[12,11],[13,11],[14,11],[15,14],[16,14],[17,16],[18,17],[19,17],[20,16],[21,20],[22,20],[23,3],[24,23],[25,1],[26,23]],
    "ASN": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,10],[12,10],[13,3],[14,13],[15,1],[16,13]],
    "ASP": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,3],[12,11],[13,1],[14,11]],
    "CYS": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,3],[11,10],[12,1],[13,10]],
    "GLN": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,10],[12,11],[13,11],[14,13],[15,13],[16,3],[17,16],[18,1],[19,16]],
    "GLU": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,8],[12,11],[13,11],[14,3],[15,14],[16,1],[17,14]],
    "GLY": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,3],[7,6],[8,1],[9,6]],
    "HID": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,9],[11,9],[12,11],[13,11],[14,13],[15,14],[16,3],[17,16],[18,1],[19,16]],
    "HIE": [],  # NOTE(review): no pairs entered -- presumably still to be filled in
    "ILE": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,7],[9,7],[10,7],[11,5],[12,11],[13,11],[14,11],[15,14],[16,14],[17,14],[18,3],[19,18],[20,1],[21,18]],
    "LEU": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,10],[12,10],[13,10],[14,8],[15,14],[16,14],[17,14],[18,3],[19,18],[20,1],[21,18]],
    "LYS": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,8],[12,11],[13,11],[14,11],[15,14],[16,14],[17,14],[18,17],[19,17],[20,17],[21,3],[22,21],[23,1],[24,21]],
    "MET": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,8],[11,8],[12,11],[13,12],[14,12],[15,12],[16,3],[17,16],[18,1],[19,16]],
    "PHE": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,9],[11,9],[12,11],[13,11],[14,13],[15,13],[16,15],[17,15],[18,17],[19,3],[20,19],[21,1],[22,19]],
    "PRO": [[1,15],[2,1],[3,2],[4,2],[5,2],[6,5],[7,5],[8,5],[9,8],[10,8],[11,8],[12,11],[13,11],[14,13],[15,1],[16,13]],
    "SER": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,3],[11,10],[12,1],[13,10]],
    "THR": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,7],[9,7],[10,7],[11,5],[12,11],[13,3],[14,13],[15,1],[16,13]],
    "TRP": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,9],[11,9],[12,11],[13,11],[14,13],[15,14],[16,14],[17,16],[18,16],[19,18],[20,18],[21,20],[22,20],[23,3],[24,23],[25,1],[26,23]],
    "TYR": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,5],[9,8],[10,9],[11,9],[12,11],[13,11],[14,13],[15,14],[16,13],[17,16],[18,16],[19,18],[20,3],[21,20],[22,1],[23,20]],
    "VAL": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,5],[7,5],[8,7],[9,7],[10,7],[11,5],[12,11],[13,11],[14,11],[15,3],[16,15],[17,1],[18,15]],
    "ACE": [[1,2],[2,5],[3,2],[4,2],[5,6],[6,5],[7,5]],
    "NME": [[1,2],[2,1],[3,1],[4,3],[5,3],[6,3],[7,1]],
}

def process_xyz_to_ef(xyz_file, ef_dir, ALA):
    """Process a single .xyz file and write the corresponding .ef file.

    The input file is read as follows:

    * line 1: the atom count ``n``;
    * line 2: skipped;
    * next ``n`` lines: atom records ``<label> x y z``; atoms are numbered
      from 1 in file order;
    * one further line is skipped;
    * every remaining line with at least four fields is taken as a
      background charge ``x y z q`` (shorter lines are ignored).

    For each ``[a1, a2]`` pair in ``ALA`` the field of all background charges
    is summed at atom ``a1`` as ``q * r / |r|**3``, where ``r`` is the
    charge-to-atom displacement scaled by ``ANG2AU``.  Charges located exactly
    on the atom are skipped.  The field is then split into its projection on
    the unit vector a1 -> a2 and the remainder perpendicular to it.

    Args:
        xyz_file: Path of the input .xyz file.
        ef_dir: Directory for the output file; created if it does not exist.
        ALA: Connectivity table, an iterable of ``[a1, a2]`` pairs of 1-based
            atom indices.  Despite the name, any residue's table is accepted.

    Returns:
        Path of the written file: ``ef_dir`` joined with the input's base name
        with its extension replaced by ``.ef``.

    Raises:
        ValueError: If the two atoms of a pair have identical coordinates (the
            bond direction is undefined), or if a numeric field cannot be
            parsed.
        KeyError: If a pair references an atom index not present in the file.
        IndexError: If the file is empty or an atom record has fewer than
            four fields.
    """
    with open(xyz_file, 'r') as f:
        lines = f.readlines()

    n_atoms = int(lines[0].strip())  # Number of atoms

    # 1-based atom index -> [x, y, z]; the leading label on each record is ignored.
    coord_dict = {}
    for atom_i, line in enumerate(lines[2:2 + n_atoms], start=1):
        parts = line.split()
        coord_dict[atom_i] = [float(parts[1]), float(parts[2]), float(parts[3])]

    # Background charges, parsed once into an (n_charges, 4) array so the field
    # sum below can be vectorised.  reshape keeps the 2-D shape when the file
    # contains no charges at all.
    charge_rows = []
    for line in lines[2 + n_atoms + 1:]:
        parts = line.split()
        if len(parts) >= 4:
            charge_rows.append([float(parts[0]), float(parts[1]), float(parts[2]), float(parts[3])])
    charges = np.array(charge_rows, dtype=float).reshape(-1, 4)
    charge_xyz = charges[:, :3]
    charge_q = charges[:, 3]

    # Scale factor applied to displacements before the field is evaluated
    # (presumably Angstrom -> Bohr, going by the name and the constant).
    ANG2AU = 1 / 0.5291772083

    rows = []
    for (a1, a2) in ALA:
        x1, y1, z1 = coord_dict[a1]
        x2, y2, z2 = coord_dict[a2]
        dx, dy, dz = x2 - x1, y2 - y1, z2 - z1
        length = math.sqrt(dx**2 + dy**2 + dz**2)
        if length == 0:
            # Without this guard the normalisation below fails with a bare
            # "float division by zero" that does not say which bond is at fault.
            raise ValueError(
                f"Atoms {a1} and {a2} have identical coordinates in {xyz_file}; "
                "bond direction is undefined."
            )
        bond_dir = np.array([dx / length, dy / length, dz / length])

        # Displacement from every charge to this atom, shape (n_charges, 3).
        r = (np.array([x1, y1, z1]) - charge_xyz) * ANG2AU
        r_mag = np.linalg.norm(r, axis=1)
        off_atom = r_mag != 0  # a charge sitting exactly on the atom is skipped
        e_total = (
            charge_q[off_atom, None] * r[off_atom] / r_mag[off_atom, None] ** 3
        ).sum(axis=0)

        e_paral = np.dot(e_total, bond_dir) * bond_dir
        e_verti = e_total - e_paral

        rows.append([
            a1, x1, y1, z1, *bond_dir, *e_total,
            np.linalg.norm(e_total), np.linalg.norm(e_paral), np.linalg.norm(e_verti),
        ])

    # Write the .ef file
    os.makedirs(ef_dir, exist_ok=True)
    base_name = os.path.basename(xyz_file)
    ef_file = os.path.join(ef_dir, os.path.splitext(base_name)[0] + ".ef")

    header = "atom_id  x  y  z  dx  dy  dz  Ex  Ey  Ez  |E|  |E_parallel|  |E_vertical|\n"
    with open(ef_file, "w") as f:
        f.write(header)
        for row in rows:
            # np.float64 subclasses float, so every numeric column takes the
            # float format and only the integer atom id takes the int format.
            f.write(" ".join(f"{val:12.6f}" if isinstance(val, float) else f"{val:4d}" for val in row) + "\n")

    return ef_file


def batch_process_xyz(base_dir, amino_name):
    """Look up the connectivity table by amino-acid name and batch-convert xyz files to ef files.

    Every ``*.xyz`` file in ``<base_dir>/<amino_name>/xyz`` is passed to
    ``process_xyz_to_ef`` and the result is written to
    ``<base_dir>/<amino_name>/ef`` (created if missing).  Progress and
    problems are reported with ``print``; a file that fails to convert is
    reported and skipped so the rest of the batch still runs.

    Args:
        base_dir: Root folder containing one sub-folder per residue.
        amino_name: Key into ``AMINO_CONNECT`` and name of the residue
            sub-folder.

    Returns:
        None.  Returns early, after printing a message, when the residue has
        no connectivity pairs or when no ``.xyz`` files are found.

    Raises:
        FileNotFoundError: If ``<base_dir>/<amino_name>/xyz`` does not exist.
    """
    # An empty table is treated like a missing one: converting with it would
    # only produce header-only .ef files.
    connectivity = AMINO_CONNECT.get(amino_name)
    if not connectivity:
        print(f"❌ Connectivity for amino acid {amino_name} is not defined. Please add it to the AMINO_CONNECT dictionary first.")
        return

    # Build amino-acid directory paths
    amino_dir = os.path.join(base_dir, amino_name)
    xyz_dir = os.path.join(amino_dir, "xyz")
    ef_dir = os.path.join(amino_dir, "ef")
    os.makedirs(ef_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    xyz_files = sorted(
        os.path.join(xyz_dir, f) for f in os.listdir(xyz_dir) if f.endswith(".xyz")
    )
    if not xyz_files:
        print(f"⚠️ No .xyz files found in {xyz_dir}.")
        return

    print(f"🔍 Processing amino acid {amino_name}: {len(xyz_files)} file(s)...\n")

    n_failed = 0
    for xyz_file in tqdm(xyz_files, desc=f"Processing {amino_name}"):
        try:
            process_xyz_to_ef(xyz_file, ef_dir, connectivity)
        except Exception as e:
            # Deliberately broad: one malformed file must not abort the batch.
            n_failed += 1
            print(f"❌ Failed to process file: {xyz_file}\n   Error: {e}\n")

    if n_failed:
        print(f"⚠️ {n_failed} of {len(xyz_files)} file(s) could not be processed.")

    print(f"\n✅ {amino_name} Finished. Results saved to: {ef_dir}\n")


In [None]:
# Dataset root, given relative to the working directory.
base_dir = "example"

# Convert each listed residue in turn; extend the tuple to process more.
for amino in ("ALA",):
    batch_process_xyz(base_dir, amino)
