In [7]:
import sys
import warnings
import tempfile
from pathlib import Path
import numpy as np
from Bio.Align import PairwiseAligner, substitution_matrices
from Bio.PDB import PDBParser, NeighborSearch, Select, Superimposer
from Bio.PDB.PDBIO import PDBIO
from Bio.SeqUtils import seq1
from scipy.spatial import cKDTree
from joblib import Parallel, delayed
from tqdm import tqdm
from loguru import logger

warnings.filterwarnings("ignore", category=UserWarning, module="MDAnalysis.core.universe")

logger.remove()
logger.add(sys.stdout, format="{message}", level="DEBUG")

class LigandSelect(Select):
    def __init__(self, ligand_residues, interacting_chains):
        self.ligand_residues = ligand_residues
        self.interacting_chains = interacting_chains
        self.ligand_ids = {(residue.get_parent().id, residue.id) for residue in ligand_residues}

    def accept_chain(self, chain):
        return chain.id in self.interacting_chains or any(
            chain.id == residue.get_parent().id for residue in self.ligand_residues
        )

    def accept_residue(self, residue):
        chain_id = residue.get_parent().id
        if (chain_id, residue.id) in self.ligand_ids:
            return True
        elif residue.id[0] == ' ':  # Protein residues
            return True
        else:
            return False


def align_sequences(seq1, seq2):
    blosum62 = substitution_matrices.load("BLOSUM62")
    aligner = PairwiseAligner()
    aligner.substitution_matrix = blosum62
    aligner.mode = 'global'
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -1

    alignments = aligner.align(seq1, seq2)
    aligned_seq1, aligned_seq2 = alignments[0].aligned[0], alignments[0].aligned[1]

    aligned_seq1_str, aligned_seq2_str = [], []
    for (start1, end1), (start2, end2) in zip(aligned_seq1, aligned_seq2):
        aligned_seq1_str.append(seq1[start1:end1])
        aligned_seq2_str.append(seq2[start2:end2])

    return ''.join(aligned_seq1_str), ''.join(aligned_seq2_str)

def calculate_aligned_rmsd(structure1_file, structure2_file, input_file_path, identity_threshold=0.9):
    parser = PDBParser(QUIET=True)
    structure1 = parser.get_structure('struct1', structure1_file)
    structure2 = parser.get_structure('struct2', structure2_file)

    residues1 = [res for res in structure1.get_residues() if res.id[0] == ' ']
    residues2 = [res for res in structure2.get_residues() if res.id[0] == ' ']

    seq1_str = ''.join([seq1(res.get_resname()) for res in residues1])
    seq2_str = ''.join([seq1(res.get_resname()) for res in residues2])

    if not seq1_str or not seq2_str:
        logger.warning(f"Empty sequence in file: {input_file_path}")
        return np.inf

    aligned_seq1, aligned_seq2 = align_sequences(seq1_str, seq2_str)

    logger.debug(f"Aligned sequences for file: {input_file_path}")
    logger.debug(f"Sequence 1: {aligned_seq1}")
    logger.debug(f"Sequence 2: {aligned_seq2}")

    aligned_length = sum(1 for a, b in zip(aligned_seq1, aligned_seq2) if a != '-' and b != '-')
    matches = sum(1 for a, b in zip(aligned_seq1, aligned_seq2) if a == b and a != '-' and b != '-')

    if aligned_length > 0:
        identity_ratio = matches / aligned_length
    else:
        identity_ratio = 0.0

    if identity_ratio >= identity_threshold:
        logger.debug(f"Sequences are {identity_ratio:.2%} identical. Skipping RMSD calculation.")
        return 0.0

    idx1, idx2 = 0, 0
    atom_pairs = []
    for a, b in zip(aligned_seq1, aligned_seq2):
        if a != '-' and b != '-':
            res1 = residues1[idx1]
            res2 = residues2[idx2]
            for atom_name in ['N', 'CA', 'C']:
                if atom_name in res1 and atom_name in res2:
                    atom_pairs.append((res1[atom_name], res2[atom_name]))
        if a != '-':
            idx1 += 1
        if b != '-':
            idx2 += 1

    if len(atom_pairs) == 0:
        logger.warning(f"No common atoms for RMSD comparison in file: {input_file_path}")
        return np.inf

    sup = Superimposer()
    sup.set_atoms([pair[0] for pair in atom_pairs], [pair[1] for pair in atom_pairs])
    rmsd_value = sup.rms
    logger.debug(f"RMSD value: {rmsd_value:.2f} for file: {input_file_path}")
    return rmsd_value

def get_aligned_sequences(alignment, seq1, seq2):
    aligned_seq1, aligned_seq2 = [], []
    last_end1, last_end2 = 0, 0

    for (start1, end1), (start2, end2) in zip(alignment.aligned[0], alignment.aligned[1]):
        if start1 > last_end1:
            aligned_seq1.append('-' * (start1 - last_end1))
            aligned_seq2.append(seq2[last_end2:start2])
        if start2 > last_end2:
            aligned_seq1.append(seq1[last_end1:start1])
            aligned_seq2.append('-' * (start2 - last_end2))
        aligned_seq1.append(seq1[start1:end1])
        aligned_seq2.append(seq2[start2:end2])
        last_end1 = end1
        last_end2 = end2

    if last_end1 < len(seq1):
        aligned_seq1.append(seq1[last_end1:])
        aligned_seq2.append('-' * (len(seq1) - last_end1))
    if last_end2 < len(seq2):
        aligned_seq1.append('-' * (len(seq2) - last_end2))
        aligned_seq2.append(seq2[last_end2:])

    return ''.join(aligned_seq1), ''.join(aligned_seq2)

def get_interacting_chains(ligand_atoms, protein_atoms, distance):
    ns = NeighborSearch(protein_atoms)
    interacting_atoms = []
    for atom in ligand_atoms:
        nearby_atoms = ns.search(atom.coord, distance, level='A')
        interacting_atoms.extend(nearby_atoms)
    interacting_chains = {atom.get_parent().get_parent().id for atom in interacting_atoms}

    
    if interacting_chains:
        ligand_residue = ligand_atoms[0].get_parent().get_resname()
        logger.debug(f"Ligand {ligand_residue} interacts with chains: {', '.join(interacting_chains)}")
    else:
        logger.debug("No interacting chains found for this ligand group.")
        
    return interacting_chains

def find_close_ligands(ligand_atoms, all_ligand_atoms, distance):
    ns = NeighborSearch(all_ligand_atoms)
    close_residues = set()
    for atom in ligand_atoms:
        nearby_atoms = ns.search(atom.coord, distance, level='A')
        for nearby_atom in nearby_atoms:
            residue = nearby_atom.get_parent()
            if residue not in ligand_atoms and residue.id[0] != ' ':
                close_residues.add(residue)
    return list(close_residues)

def save_structure_to_tempfile(structure, select):
    io = PDBIO()
    temp_file = tempfile.NamedTemporaryFile(suffix=".pdb", delete=False)
    io.set_structure(structure)
    io.save(temp_file.name, select=select)
    temp_file.close()
    return temp_file.name

def process_ligands(structure, interact_distance=4.5, ligand_ligand_distance=3.0):
    all_ligand_atoms = [atom for atom in structure.get_atoms() if atom.get_parent().id[0] != ' ']
    protein_atoms = [atom for atom in structure.get_atoms() if atom.get_parent().id[0] == ' ']
    ligand_residues = [residue for residue in structure.get_residues() if residue.id[0] != ' ']
    processed_ligands = set()
    ligand_groups = []

    for ligand in ligand_residues:
        ligand_id = (ligand.get_resname(), ligand.get_parent().id, ligand.id[1])
        if ligand_id in processed_ligands:
            continue
        ligand_atoms = list(ligand.get_atoms())
        interacting_chains = get_interacting_chains(ligand_atoms, protein_atoms, interact_distance)
        if not interacting_chains:
            continue
        close_ligands = find_close_ligands(ligand_atoms, all_ligand_atoms, ligand_ligand_distance)
        

        all_ligands_in_group = [ligand] + close_ligands
        
        for close_ligand in close_ligands:
            processed_ligands.add(
                (close_ligand.get_resname(), close_ligand.get_parent().id, close_ligand.id[1])
            )
        processed_ligands.add(ligand_id)
        ligand_groups.append({
            'ligands': all_ligands_in_group,
            'interacting_chains': interacting_chains
        })
        
    return ligand_groups

def fix_conect_format(conect_lines):
    fixed_conect_lines = []
    for line in conect_lines:
        if line.startswith("CONECT"):
            atom_numbers = line[6:].strip()
            if len(atom_numbers.replace(" ", "")) % 5 == 0:
                atom_numbers_fixed = " ".join([atom_numbers[i:i+5] for i in range(0, len(atom_numbers), 5)])
                fixed_line = f"CONECT {atom_numbers_fixed}\n"
            else:
                fixed_line = line
            fixed_conect_lines.append(fixed_line)
        else:
            fixed_conect_lines.append(line)
    return fixed_conect_lines

def filter_conect_lines(conect_lines, saved_atoms):
    """Filter out incorrect atom numbers in CONECT lines, keeping valid ones."""
    filtered_conect_lines = []
    for line in conect_lines:
        if line.startswith("CONECT"):
            atom_numbers = [line[i:i+5].strip() for i in range(6, len(line), 5)]
            valid_atom_numbers = [f"{int(atom_num):5d}" for atom_num in atom_numbers if atom_num.isdigit() and int(atom_num) in saved_atoms]
            
            if valid_atom_numbers:
                filtered_conect_lines.append(f"CONECT{''.join(valid_atom_numbers)}\n")
    return filtered_conect_lines

def save_pocket_structure(structure, pocket_info, output_dir, saved_structures,
                          original_lines, conect_lines, input_file_path,
                          rmsd_threshold=2.0, identity_threshold=0.95):
    ligands = pocket_info['ligands']
    interacting_chains = pocket_info['interacting_chains']
    input_filename = Path(input_file_path).stem

    io = PDBIO()
    io.set_structure(structure)

    select = LigandSelect(ligands, interacting_chains)
    ligand_names = "_".join(sorted({lig.get_resname() for lig in ligands}))
    chains_str = "_".join(sorted(interacting_chains))
    output_file = output_dir / f"{input_filename}_{ligand_names}_chains_{chains_str}.pdb"

    old_atom_serials = {a.get_serial_number(): a for a in structure.get_atoms() if select.accept_atom(a)}
    temp_structure_file = save_structure_to_tempfile(structure, select)

    lig_signature = tuple(sorted((lig.get_resname(), lig.get_parent().id, lig.id[1]) for lig in ligands))

    similar_found = False
    for saved in saved_structures:
        if lig_signature != saved['lig_signature']:
            continue
        rmsd_value = calculate_aligned_rmsd(saved['temp_file'], temp_structure_file,
                                            input_file_path, identity_threshold=identity_threshold)
        if rmsd_value < rmsd_threshold:
            logger.debug(f"Found similar structure (RMSD: {rmsd_value:.2f}), skipping file: {output_file}")
            similar_found = True
            break
    if similar_found:
        return

    parser = PDBParser(QUIET=True)
    new_structure = parser.get_structure('new_struct', temp_structure_file)
    new_atom_serials = {a.get_serial_number(): a for a in new_structure.get_atoms() if select.accept_atom(a)}
    serial_map = {old: new for (old, _), (new, _) in zip(old_atom_serials.items(), new_atom_serials.items())}

    updated_conect_lines = []
    for line in conect_lines:
        if line.startswith("CONECT"):
            nums = [line[i:i+5].strip() for i in range(6, len(line), 5)]
            new_nums = []
            for n in nums:
                if n.isdigit():
                    n_old = int(n)
                    n_new = serial_map.get(n_old, n_old)
                    new_nums.append(f"{n_new:5d}")
                else:
                    new_nums.append(n)
            updated_conect_lines.append(f"CONECT{''.join(new_nums)}\n")

    filtered_conect_lines = filter_conect_lines(updated_conect_lines, serial_map.keys())

    with open(output_file, 'w') as f_out:
        f_out.writelines(original_lines)
        io.save(f_out, select=select)

    with open(output_file, 'r+') as f_out:
        lines = f_out.readlines()
        if lines[-1].strip() == "END":
            lines = lines[:-1]
        f_out.seek(0)
        f_out.writelines(lines)
        # f_out.writelines(filtered_conect_lines)  # при необходимости восстановить CONECT
        f_out.write("END\n")

    saved_structures.append({
        'output_file': output_file,
        'temp_file': temp_structure_file,
        'lig_signature': lig_signature
    })

    logger.debug(f"Saved pocket structure to file: {output_file}")


def load_original_lines(file_path):
    conect_lines = []
    other_lines = []
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith("CONECT"):
                conect_lines.append(line)
            elif not line.startswith(("ATOM", "HETATM", "END", "MASTER", "TER")):
                other_lines.append(line)
    return other_lines, conect_lines

def process_pdb_file(input_file_path, output_dir_path, interact_distance=4.5, ligand_ligand_distance=3.0, rmsd_threshold=2.0, identity_threshold=0.95):
    output_dir = Path(output_dir_path)
    output_dir.mkdir(exist_ok=True)

    original_lines, conect_lines = load_original_lines(input_file_path)

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('pdb_structure', input_file_path)

    ligand_groups = process_ligands(structure, interact_distance, ligand_ligand_distance)

    saved_structures = []
    logger.debug(f"Processing {len(ligand_groups)} ligand groups in file: {input_file_path}")
    
    for pocket_info in ligand_groups:
        save_pocket_structure(
            structure, pocket_info, output_dir, saved_structures,
            original_lines, conect_lines, input_file_path, rmsd_threshold, identity_threshold
        )



In [8]:
def analyze_protein(input_file_path, output_dir_path='separated_complexes', interact_distance=4.5, ligand_ligand_distance=1, rmsd_threshold=2, identity_threshold=0.98):
    logger.debug(f"Starting analysis of protein: {input_file_path}")
    process_pdb_file(input_file_path, output_dir_path, interact_distance, ligand_ligand_distance, rmsd_threshold, identity_threshold)
    logger.debug("Protein analysis completed")

#analyze_protein('/home/nikolenko/work/Projects/LPCE/lpce/tests/bioml/1qk0_bioml_1.pdb', 'separated_complexes')
analyze_protein('/home/nikolenko/work/Projects/LPCE/lpce/tests/test_data/1gc4.pdb', 'separated_complexes')

Starting analysis of protein: /home/nikolenko/work/Projects/LPCE/lpce/tests/test_data/1gc4.pdb
Ligand ASP interacts with chains: B, A
Ligand PLP interacts with chains: B, A
Ligand ASP interacts with chains: B, A
Ligand PLP interacts with chains: B, A
Ligand ASP interacts with chains: C, D
Ligand PLP interacts with chains: C, D
Ligand ASP interacts with chains: C, D
Ligand PLP interacts with chains: C, D
Processing 8 ligand groups in file: /home/nikolenko/work/Projects/LPCE/lpce/tests/test_data/1gc4.pdb
Saved pocket structure to file: separated_complexes/1gc4_ASP_chains_A_B.pdb
Saved pocket structure to file: separated_complexes/1gc4_PLP_chains_A_B.pdb
Saved pocket structure to file: separated_complexes/1gc4_ASP_chains_A_B.pdb
Saved pocket structure to file: separated_complexes/1gc4_PLP_chains_A_B.pdb
Saved pocket structure to file: separated_complexes/1gc4_ASP_chains_C_D.pdb
Saved pocket structure to file: separated_complexes/1gc4_PLP_chains_C_D.pdb
Saved pocket structure to file: sepa

In [2]:
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/1gc4.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/1a2f.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/1bbp.pdb', 'separated_complexes')
#analyze_protein('../lpce/tests/test_data/1xxa.pdb', 'separated_complexes')
#analyze_protein('../lpce/tests/test_data/5ivq.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/19gs.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/1bru.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/processed/1a28.pdb', 'separated_complexes')
analyze_protein('../lpce/tests/processed/1a2f.pdb', 'separated_complexes')
analyze_protein('../lpce/tests/processed/1ahy.pdb', 'separated_complexes')
analyze_protein('../lpce/tests/processed/1axr.pdb', 'separated_complexes')
#analyze_protein('/mnt/ligandpro/db/LPCE/bioml/1ct9_bioml_1.pdb', 'separated_complexes')

Starting analysis of protein: ../lpce/tests/processed/1a00.pdb


FileNotFoundError: [Errno 2] No such file or directory: '../lpce/tests/processed/1a00.pdb'