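# This script takes a CSV of antibody sequences (columns: name, vh, vl) and renumbers the matching predicted PDB structures via the AbNum server.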

In [None]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from Bio import PDB
from Bio.PDB.Polypeptide import PPBuilder

def extract_sequence(chain):
    """Return the amino-acid sequence of ``chain`` as a plain string.

    The chain is split into polypeptide fragments with ``PPBuilder`` and the
    fragment sequences are concatenated in the order the builder returns
    them. A chain that yields no fragments gives an empty string.
    """
    fragments = PPBuilder().build_peptides(chain)
    return "".join(str(fragment.get_sequence()) for fragment in fragments)

def query_abnum(sequence, scheme="imgt", timeout=30):
    """Query the AbNum web service and return the numbered residues.

    Args:
        sequence: One-letter amino-acid sequence to number.
        scheme: Numbering scheme. Accepts ``"kabat"``, ``"chothia"`` or
            ``"martin"`` (case-insensitive), or a raw AbNum flag (``"-k"``,
            ``"-c"``, ``"-m"``). Any other value, including the default
            ``"imgt"``, is sent as ``"-c"``; that is the flag this function
            always sent before ``scheme`` was honoured, so existing callers
            see no change.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with one entry per non-blank response line, each entry being
        the whitespace-split tokens of that line (expected to be
        ``[position, residue]``).

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors/timeouts.
    """
    # NOTE(review): to my knowledge AbNum offers Kabat (-k), Chothia (-c) and
    # Martin (-m) numbering only, so the "imgt" default actually yields
    # Chothia numbering. Confirm against the AbNum documentation.
    scheme_flags = {"kabat": "-k", "chothia": "-c", "martin": "-m"}
    requested = str(scheme).strip().lower()
    if requested in scheme_flags.values():
        flag = requested
    else:
        flag = scheme_flags.get(requested, "-c")

    url = "http://www.bioinf.org.uk/abs/abnum/abnum.cgi"
    params = {
        "plain": 1,
        "scheme": flag,
        "aaseq": sequence
    }
    # Without a timeout a stalled server would block the whole batch forever.
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Failed to query AbNum server")

    # Skip blank lines so they do not turn into empty token lists.
    return [line.split() for line in response.text.splitlines() if line.strip()]

def build_mapping(original_chain, abnum_result):
    """Pair each standard amino-acid residue with its AbNum position label.

    Residues of ``original_chain`` that pass ``PDB.is_aa(..., standard=True)``
    are matched one-to-one, in order, with the rows of ``abnum_result``.

    Returns:
        dict mapping residue object -> position label (first item of each row).

    Raises:
        ValueError: If the number of kept residues differs from the number of
            AbNum rows (or a row does not hold exactly two items).
    """
    residues = []
    for candidate in original_chain:
        if PDB.is_aa(candidate, standard=True):
            residues.append(candidate)

    if len(residues) != len(abnum_result):
        raise ValueError(f"Length mismatch: {len(residues)} residues vs {len(abnum_result)} AbNum results")

    # The residue letter in each row is unpacked (to enforce the row shape)
    # but otherwise unused.
    return {residue: label for residue, (label, _letter) in zip(residues, abnum_result)}

def _sequences_overlap(first, second):
    """Return True if both sequences are non-empty and one contains the other."""
    # An empty string is a substring of everything, so it must never count
    # as a match.
    return bool(first) and bool(second) and (first in second or second in first)


def _parse_abnum_position(label):
    """Split an AbNum label such as ``H52A`` into ``(52, 'A')``.

    The first character (chain letter) is dropped, the leading digits form
    the residue number and the first non-digit character, if any, becomes the
    insertion code (``' '`` otherwise). Raises ValueError if no digits follow
    the chain letter.
    """
    position = label[1:]
    end = 0
    while end < len(position) and position[end].isdigit():
        end += 1
    insertion = position[end] if end < len(position) else " "
    return int(position[:end]), insertion


def renumber_structure(structure, vh_seq, vl_seq):
    """Renumber the VH/VL chains of ``structure`` in place using AbNum.

    Every chain whose extracted sequence (ignoring ``X``) contains, or is
    contained in, ``vh_seq`` or ``vl_seq`` is sent to AbNum and its residues
    receive the returned positions. Other chains keep their numbering and a
    warning is printed.

    Args:
        structure: Bio.PDB structure; mappings are built from model 0.
        vh_seq: Heavy-chain variable sequence (one-letter string).
        vl_seq: Light-chain variable sequence (one-letter string).

    Raises:
        ValueError: From ``build_mapping`` on a length mismatch, from an
            unparsable AbNum label, or from Biopython if a new id collides
            with a residue that is not being renumbered.
        Exception: Whatever ``query_abnum`` raises.
    """
    chains = list(structure.get_chains())
    chain_seq_dict = {chain.id: extract_sequence(chain) for chain in chains}

    references = (vh_seq.replace('X', ''), vl_seq.replace('X', ''))

    mappings = {}
    for chain_id, seq in chain_seq_dict.items():
        core = seq.replace('X', '')
        if any(_sequences_overlap(core, reference) for reference in references):
            abnum_result = query_abnum(seq, scheme="imgt")
            mappings[chain_id] = build_mapping(structure[0][chain_id], abnum_result)
        else:
            print(f"Warning: Chain {chain_id} does not match VH or VL, keeping original numbering.")

    # Apply renumbering
    for chain in chains:
        mapping = mappings.get(chain.id)
        if not mapping:
            continue

        # Resolve every target id before touching any residue: lookups in
        # ``mapping`` and label parsing both happen while ids are pristine,
        # so a bad label aborts before anything has been modified.
        planned = [
            (residue, _parse_abnum_position(mapping[residue]))
            for residue in chain
            if residue in mapping
        ]

        # Phase 1: park residues on unique placeholder ids. Assigning final
        # ids directly can fail because Biopython refuses an id that a
        # not-yet-renumbered sibling still holds.
        for index, (residue, _target) in enumerate(planned):
            residue.id = ("__renumber_tmp__", index, " ")

        # Phase 2: assign the real ids.
        for residue, (resnum, insertion) in planned:
            residue.id = (' ', resnum, insertion)


def renumber_pdb(input_pdb_path, vh_seq, vl_seq, output_pdb_path):
    """Read a PDB file, renumber its VH/VL chains and write the result.

    The file at ``input_pdb_path`` is parsed, renumbered with
    ``renumber_structure`` and saved to ``output_pdb_path``. Any exception
    raised while renumbering or saving is reported on stdout and swallowed,
    so one bad structure does not stop a batch run. Parsing errors are not
    caught.
    """
    pdb_reader = PDB.PDBParser(QUIET=True)
    pdb_writer = PDB.PDBIO()

    model = pdb_reader.get_structure("structure", input_pdb_path)

    try:
        renumber_structure(model, vh_seq, vl_seq)
        pdb_writer.set_structure(model)
        pdb_writer.save(output_pdb_path)
    except Exception as e:
        print(f"Error renumbering {input_pdb_path}: {e}")

# --- Load sequences ---
# The CSV is read for its 'name', 'vh' and 'vl' columns.
csv_path = "/home/eva/Structure_model_benchmark/antibody.csv"
df = pd.read_csv(csv_path)

# --- Define models and repeats ---
models = ["igfold", "immunebuilder"]
repeats = ["1", "2", "3"]

# Every (model, repeat) pair, in the order nested loops would visit them.
runs = [(model, rep) for model in models for rep in repeats]

# --- Setup output folders ---
for model, rep in runs:
    os.makedirs(f"renumbered_output/{model}_{rep}", exist_ok=True)

# --- Main renumbering loop ---
# One pass per antibody; predictions that do not exist on disk are skipped.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    name, vh_seq, vl_seq = row['name'], row['vh'], row['vl']

    for model, rep in runs:
        input_path = f"predictions_output/{model}_{rep}/{name}_{model}_{rep}.pdb"
        output_path = f"renumbered_output/{model}_{rep}/{name}_{model}_{rep}_renumbered.pdb"

        if not os.path.exists(input_path):
            continue
        renumber_pdb(input_path, vh_seq, vl_seq, output_path)

print("All PDB structures renumbered to IMGT format and saved.")


100%|██████████| 121/121 [00:15<00:00,  7.99it/s]

✅ All PDB structures renumbered to IMGT format and saved.



