### Script summary (PDB download → filter → renumber)

- Reads `input.csv` (no header row) with columns in this order: `pdb_id, antigen_chain, heavy_chain, light_chain, antibody name`.
- Downloads the PDB structure file for each row into the `pdb_files/` directory.
- Keeps only specified chains; excludes waters and heteroatoms.
- Saves the filtered file as `<pdbid>_<chains>_new.pdb`, then renumbers it via ANARCI `ImmunoPDB.py` (Chothia) to `<pdbid>_<chains>_renumbered.pdb`; the intermediate `_new.pdb` file and the downloaded `.ent` files are deleted afterwards.
- Requires: `biopython`, `pandas`, ANARCI installed and `ImmunoPDB.py` path valid.


In [2]:
from Bio.PDB import PDBList, PDBParser, Select, PDBIO
import os
import pandas as pd
import warnings
from Bio import BiopythonWarning
import subprocess

# Load the input CSV file. The file has no header row, so column names are
# supplied here in the order the columns appear.
#
# dtype=str keeps every cell as text: without it pandas infers column types,
# so a column holding only numeric-looking values (e.g. PDB IDs such as
# "1e12", or chain IDs such as "1") is parsed as float/int and the later
# ``.strip()`` calls fail.
# keep_default_na=False keeps empty cells as '' instead of NaN, so an
# omitted chain is simply filtered out downstream rather than crashing.
filename = 'input.csv'
data = pd.read_csv(
    filename,
    header=None,
    names=['pdb_id', 'antigen_chain', 'heavy_chain', 'light_chain', 'antibody'],
    dtype=str,
    keep_default_na=False,
)


warnings.simplefilter('ignore', BiopythonWarning)  # Suppress Biopython warnings

class ChainSelect(Select):
    """Selection filter that keeps only the requested chains and their
    standard residues when a structure is written out.

    Chains are kept when their ID is in ``chain_letters``; residues are kept
    when their hetero flag (``residue.id[0]``) is blank and their residue
    name is not ``HOH``.
    """

    def __init__(self, chain_letters):
        # Stored as a set so the per-chain membership test is a hash lookup.
        self.chain_letters = set(chain_letters)

    def accept_chain(self, chain):
        """Return True when the chain's ID is one of the requested IDs."""
        return chain.get_id() in self.chain_letters

    def accept_residue(self, residue):
        """Return True only for residues with a blank hetero flag that are
        not named ``HOH`` (i.e. drop heteroatoms/ligands/ions and waters)."""
        hetero_flag = residue.id[0]
        has_blank_flag = hetero_flag == ' '
        return has_blank_flag and residue.get_resname() != 'HOH'

def _normalize_cell(value):
    """Return a CSV cell as a stripped string, or '' when the cell is missing (None/NaN)."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ''
    return str(value).strip()


def download_and_filter_structures(data, pdb_dir):
    """Download, filter and renumber the structure described by each row of *data*.

    For every row the PDB entry is downloaded into *pdb_dir*, the listed
    antigen/heavy/light chains are written (via ``ChainSelect``) to
    ``<pdb_id>_<chains>_new.pdb`` and that file is passed to ``renumber_pdb``.
    After all rows are processed, ``remove_ent_files`` is called on *pdb_dir*.

    A failure in any single row (download/parse error, missing chain, empty
    row, write or renumbering failure) is recorded and the loop moves on to
    the next row, so one bad entry does not abort the whole batch.

    Args:
        data: DataFrame with the columns ``pdb_id``, ``antigen_chain``,
            ``heavy_chain`` and ``light_chain``.
        pdb_dir: Directory that receives downloaded and generated files.

    Returns:
        A list of human-readable error messages, one per failed row.
    """
    pdbl = PDBList()
    parser = PDBParser()
    errors = []

    for index, row in data.iterrows():
        # Cells may be NaN (empty/short rows) or non-string; normalise first
        # instead of calling .strip() directly on the raw value.
        pdb_id = _normalize_cell(row['pdb_id']).lower()
        if not pdb_id:
            errors.append(f"Row {index}: missing pdb_id")
            continue

        # NOTE(review): chain IDs are upper-cased exactly as before; PDB chain
        # IDs are case-sensitive, so entries that use lowercase chain IDs
        # cannot be matched - confirm this normalisation is intended.
        chains = [
            _normalize_cell(row[column]).upper()
            for column in ('antigen_chain', 'heavy_chain', 'light_chain')
        ]
        chains = [chain for chain in chains if chain]
        if not chains:
            # Without this guard an empty structure file would be written.
            errors.append(f"No chains specified for {pdb_id}")
            continue
        chains_str = '_'.join(chains)

        try:
            file_path = pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
            structure = parser.get_structure(pdb_id, file_path)
        except Exception as e:
            # Broad on purpose: download and parse failures surface as many
            # different exception types; the error is recorded, not swallowed.
            errors.append(f"Error with {pdb_id}: {str(e)}")
            continue

        # Check that all specified chains are present
        chain_ids = {chain.id for chain in structure.get_chains()}
        missing = [chain for chain in chains if chain not in chain_ids]
        if missing:
            errors.append(f"Missing chains in {pdb_id}: {missing}")
            continue

        # Save a new PDB file with the selected chains only, excluding
        # unwanted residues, then renumber the newly created file.
        output_path = os.path.join(pdb_dir, f"{pdb_id}_{chains_str}_new.pdb")
        try:
            io = PDBIO()
            io.set_structure(structure)
            io.save(output_path, select=ChainSelect(chains))
            renumber_pdb(output_path)
        except (subprocess.CalledProcessError, OSError) as e:
            # A failed write or renumbering run must not abort the remaining
            # rows or skip the .ent cleanup below.
            errors.append(f"Error processing {pdb_id} ({chains_str}): {e}")
            continue

    remove_ent_files(pdb_dir)

    return errors

def renumber_pdb(
    pdb_path,
    immunopdb_script='~/ANARCI/Example_scripts_and_sequences/ImmunoPDB.py',
    scheme='c',
    python_executable='python',
):
    """Renumber *pdb_path* with ANARCI's ``ImmunoPDB.py`` and delete the input.

    The renumbered structure is written next to the input: a trailing
    ``_new.pdb`` is replaced by ``_renumbered.pdb``; for any other file name
    ``_renumbered.pdb`` is appended to the name without its extension, so the
    output can never be the same path as the input.

    Args:
        pdb_path: Path of the PDB file to renumber. Passed to the script
            verbatim (no shell expansion is applied to it).
        immunopdb_script: Path of ``ImmunoPDB.py``; a leading ``~`` is expanded.
        scheme: Value passed to the script's ``-s`` option (default ``'c'``).
        python_executable: Interpreter used to run the script.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status. The input file is left in place in that case.
    """
    new_suffix = '_new.pdb'
    if pdb_path.endswith(new_suffix):
        output_path = pdb_path[:-len(new_suffix)] + '_renumbered.pdb'
    else:
        output_path = os.path.splitext(pdb_path)[0] + '_renumbered.pdb'

    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are passed through unchanged instead of being
    # re-interpreted by the shell.
    command = [
        python_executable,
        os.path.expanduser(immunopdb_script),
        '-i', pdb_path,
        '-o', output_path,
        '-s', scheme,
    ]
    subprocess.run(command, check=True)

    # Only reached when the script succeeded: drop the intermediate file.
    try:
        os.remove(pdb_path)
    except FileNotFoundError:
        # Already gone; nothing left to clean up.
        pass

def remove_ent_files(pdb_dir):
    """Delete every entry directly inside *pdb_dir* whose name ends with ``.ent``."""
    ent_names = [name for name in os.listdir(pdb_dir) if name.endswith(".ent")]
    for name in ent_names:
        os.remove(os.path.join(pdb_dir, name))

# Directory that receives the downloaded and generated structure files.
pdb_dir = 'pdb_files'

# exist_ok=True creates the directory when needed without a separate
# check-then-create step.
os.makedirs(pdb_dir, exist_ok=True)

errors = download_and_filter_structures(data, pdb_dir)

# Report every row that could not be processed.
for error in errors:
    print(error)


Downloading PDB structure '4mwf'...


  description='''


Downloading PDB structure '4web'...


  description='''


Downloading PDB structure '6bkb'...


  description='''


Downloading PDB structure '6bkc'...


  description='''


Downloading PDB structure '6bkd'...


  description='''


Downloading PDB structure '6meh'...


  description='''


Downloading PDB structure '6mei'...


  description='''


Downloading PDB structure '6mej'...


  description='''


Structure exists: 'pdb_files/pdb6mej.ent' 


  description='''


Downloading PDB structure '6mek'...


  description='''


Structure exists: 'pdb_files/pdb6mek.ent' 


  description='''


Downloading PDB structure '6urh'...


  description='''


Downloading PDB structure '6uyd'...


  description='''


Downloading PDB structure '6uyf'...


  description='''


Downloading PDB structure '6uyg'...


  description='''


Structure exists: 'pdb_files/pdb6uyg.ent' 


  description='''


Downloading PDB structure '6uym'...


  description='''


Downloading PDB structure '6wo3'...


  description='''


Downloading PDB structure '6wo4'...


  description='''


Downloading PDB structure '6wo5'...


  description='''


Structure exists: 'pdb_files/pdb6wo5.ent' 


  description='''


Downloading PDB structure '6woq'...


  description='''


Structure exists: 'pdb_files/pdb6woq.ent' 


  description='''


Downloading PDB structure '7jtf'...


  description='''


Downloading PDB structure '7jtg'...


  description='''


Downloading PDB structure '7mww'...


  description='''


Downloading PDB structure '7mwx'...


  description='''


Downloading PDB structure '7rfb'...


  description='''


Downloading PDB structure '7rfc'...


  description='''


Downloading PDB structure '7t6x'...


  description='''


Structure exists: 'pdb_files/pdb7t6x.ent' 


  description='''


Limiting hmmer search to species ['human', 'mouse'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Downloading PDB structure '8fsj'...


  description='''


Structure exists: 'pdb_files/pdb8fsj.ent' 


  description='''


Downloading PDB structure '8tgv'...


  description='''


Downloading PDB structure '8thz'...


  description='''


Structure exists: 'pdb_files/pdb8thz.ent' 


  description='''


Downloading PDB structure '8w0v'...


  description='''


Downloading PDB structure '8w0w'...


  description='''


Downloading PDB structure '8w0x'...


  description='''


Downloading PDB structure '8w0y'...


  description='''


Downloading PDB structure '8txq'...


  description='''
