### Script summary (alignment of antigen–antibody complexes on E2 chain)

- **Purpose:** Align all antigen–antibody complex structures based on the **antigen (E2) chain**.
- **Input:**  
  - `input.csv` → headerless CSV whose columns are, in order, `pdb_id, antigen_chain_id, heavy_chain_id, light_chain_id, antibody`.  
  - Renumbered PDBs located in `pdb_files/`, named `<pdbid>_<antigen>_<heavy>_<light>_renumbered.pdb`.
- **Reference:**  
  - The antigen chain of the **first entry in `input.csv`** is used as the reference for alignment.
- **Method:**  
  - Uses `tmtools.tm_align` to superimpose each structure’s antigen onto the reference.  
  - Applies the resulting transformation to the full antigen–antibody complex.
- **Output:**  
  - Aligned structures saved in `aligned_structures_E2/` as  
    `aligned_<pdbid>_<antigen>_<heavy>_<light>.pdb`.


In [3]:
import pandas as pd
from tmtools.io import get_structure, get_residue_data
from tmtools import tm_align
from tmtools.helpers import transform_structure
import os
from Bio.PDB import PDBParser, PDBIO
import numpy as np

try:
    from Bio.PDB.Polypeptide import protein_letters_3to1
except ImportError:
    from Bio.PDB import protein_letters_3to1
    
# Input CSV, directory holding the renumbered PDBs, and output directory
csv_file = 'input.csv'
pdb_dir = 'pdb_files'
aligned_dir = 'aligned_structures_E2'
# exist_ok=True lets the cell be re-run without failing on an existing folder
os.makedirs(aligned_dir, exist_ok=True)

# Read the CSV file. It has no header row, so the column names are supplied here.
# dtype=str keeps every identifier as text: with pandas' default type inference
# a column made only of digit chain IDs (e.g. "1") would be read as integers,
# which would not match the string chain IDs used for the chain lookup.
df = pd.read_csv(
    csv_file,
    header=None,
    names=['pdb_id', 'antigen_chain_id', 'heavy_chain_id', 'light_chain_id', 'antibody'],
    dtype=str,
)

# Initialize PDB parser
# NOTE(review): structures in this cell are loaded with tmtools' get_structure,
# so this parser appears unused here - confirm before removing.
parser = PDBParser()

# Function to extract antigen-chain C-alpha coordinates and sequence
def extract_Ag_data(structure, chain_id):
    """Collect C-alpha coordinates and a one-letter sequence for one chain.

    Only the first model (``structure[0]``) is read. Entries without a ``CA``
    atom are skipped, so the two returned sequences always have equal length.

    Args:
        structure: Parsed structure, indexable as ``structure[0][chain_id]``.
        chain_id: Identifier of the chain to extract (the antigen chain).

    Returns:
        ``(coords, seq)``: a list with the ``CA`` coordinate of every kept
        residue, and the matching one-letter sequence string.

    Raises:
        KeyError: If ``chain_id`` is not present in the first model.
    """
    # Force-field names for histidine protonation states (HSD/HSE/HSP and
    # HID/HIE/HIP) are not in the standard three-letter table.
    histidine_variants = {'HSD', 'HSE', 'HSP', 'HID', 'HIE', 'HIP'}

    chain = structure[0][chain_id]
    coords = []
    letters = []
    for res in chain:
        if 'CA' not in res.child_dict:
            continue
        resname = res.resname
        if resname in histidine_variants:
            letter = 'H'
        elif resname in protein_letters_3to1:
            letter = protein_letters_3to1[resname]
        elif res.id[0] != ' ':
            # Hetero entry with an unknown name, e.g. a calcium ion whose only
            # atom is also called "CA": not part of the polypeptide, so skip
            # it instead of failing on the table lookup.
            continue
        else:
            # Unrecognised residue name in a regular ATOM record: keep its
            # coordinate and mark the sequence position as unknown.
            letter = 'X'
        coords.append(res['CA'].coord)
        letters.append(letter)
    return coords, ''.join(letters)

# The antigen chain of the first CSV entry defines the reference for every alignment
first_row = df.iloc[0]
ref_file = f"{pdb_dir}/{first_row['pdb_id']}_{first_row['antigen_chain_id']}_{first_row['heavy_chain_id']}_{first_row['light_chain_id']}_renumbered.pdb"
ref_structure = get_structure(ref_file)
ref_antigen = extract_Ag_data(ref_structure, first_row['antigen_chain_id'])
ref_coords, ref_seq = ref_antigen

# One writer is reused for all output files
io = PDBIO()

# Superimpose each complex onto the reference via its antigen chain
# (the first row is the reference itself, so it is aligned onto itself)
for _, row in df.iterrows():
    target_file = f"{pdb_dir}/{row['pdb_id']}_{row['antigen_chain_id']}_{row['heavy_chain_id']}_{row['light_chain_id']}_renumbered.pdb"
    aligned_file = f"{aligned_dir}/aligned_{row['pdb_id']}_{row['antigen_chain_id']}_{row['heavy_chain_id']}_{row['light_chain_id']}.pdb"

    target_structure = get_structure(target_file)
    target_coords, target_seq = extract_Ag_data(target_structure, row['antigen_chain_id'])

    # The alignment is computed from the antigen chains only...
    result = tm_align(target_coords, ref_coords, target_seq, ref_seq)

    # ...and the result is then handed to transform_structure together with
    # the complete target structure, which is what gets written out
    aligned_target_struct = transform_structure(target_structure, result)
    io.set_structure(aligned_target_struct)
    io.save(aligned_file)

    # Per-structure report: file pair, both TM-score normalisations, RMSD
    report_lines = (
        f"Aligned {target_file} to {ref_file}",
        f"TM score normalized by chain 1: {result.tm_norm_chain1}",
        f"TM score normalized by chain 2: {result.tm_norm_chain2}",
        f"RMSD: {result.rmsd}",
    )
    for line in report_lines:
        print(line)

print("Alignment complete. Aligned structures are saved in:", aligned_dir)


Aligned pdb_files/4mwf_C_H_L_renumbered.pdb to pdb_files/4mwf_C_H_L_renumbered.pdb
TM score normalized by chain 1: 1.0
TM score normalized by chain 2: 1.0
RMSD: 3.2144549883632137e-06
Aligned pdb_files/4web_E_H_L_renumbered.pdb to pdb_files/4mwf_C_H_L_renumbered.pdb
TM score normalized by chain 1: 0.7843149368131582
TM score normalized by chain 2: 0.5843944354885263
RMSD: 2.697586046658922
Aligned pdb_files/6bkb_E_H_L_renumbered.pdb to pdb_files/4mwf_C_H_L_renumbered.pdb
TM score normalized by chain 1: 0.843444598660511
TM score normalized by chain 2: 0.7979111204957595
RMSD: 2.07448823947105
Aligned pdb_files/6bkc_E_H_L_renumbered.pdb to pdb_files/4mwf_C_H_L_renumbered.pdb
TM score normalized by chain 1: 0.8773003707924605
TM score normalized by chain 2: 0.7510102949755187
RMSD: 1.7974587247797096
Aligned pdb_files/6bkd_E_H_L_renumbered.pdb to pdb_files/4mwf_C_H_L_renumbered.pdb
TM score normalized by chain 1: 0.8482494096066797
TM score normalized by chain 2: 0.7612111074329138
RMSD: