In [2]:
import subprocess
import sys
from pathlib import Path
from loguru import logger
from tqdm import tqdm
import tempfile

logger.remove()
logger.add(sys.stdout, format="{message}", level="INFO")

def run_foldseek(
    input_dir,
    tmscore_threshold=0.5,
    fident_threshold=0.9,
    n_jobs=224,
    foldseek_bin="/home/nikolenko/work/soft/foldseek/bin/foldseek",
    prostt5_path="/home/nikolenko/work/soft/prostt5_model",
    foldseek_log="foldseek_output.log",
):
    """
    Runs Foldseek (``createdb`` followed by an all-vs-all ``easy-search``)
    and groups the hits whose identity reaches ``fident_threshold``.

    All Foldseek artefacts (database, alignment file, scratch files) live in a
    temporary directory that is removed when the function returns. The combined
    stdout/stderr of both Foldseek steps is written to ``foldseek_log``.

    :param input_dir: Directory containing input PDB files.
    :param tmscore_threshold: TM-score threshold for Foldseek.
    :param fident_threshold: Identity threshold for grouping results.
    :param n_jobs: Number of threads for Foldseek.
    :param foldseek_bin: Path to the Foldseek executable.
    :param prostt5_path: Value passed to Foldseek's ``--prostt5-model`` option.
    :param foldseek_log: File that receives the raw Foldseek output
        (truncated at the start of every call).
    :return: List of sets of structure names. Each set is a query together
        with all targets that matched it at or above ``fident_threshold``.
        A query is skipped when it already belongs to an earlier set, but the
        sets are not merged transitively, so two sets may share members.
    :raises RuntimeError: If either Foldseek step exits with a non-zero code.
    """
    identical_pairs = {}

    with tempfile.TemporaryDirectory() as tmp_dir:
        output_dir = Path(tmp_dir)
        db_path = output_dir / "foldseek_db"
        aln_output = output_dir / "alignment_results.m8"

        logger.info("Starting run_foldseek")
        logger.info(f"Input directory: {input_dir}")
        logger.info(f"Foldseek binary: {foldseek_bin}")

        # NOTE: Popen never raises CalledProcessError, so failures are reported
        # exclusively through the RuntimeError raised on a non-zero exit code.

        # Step 1: build the Foldseek database.
        with open(foldseek_log, "w") as log_file:
            logger.info("Creating Foldseek database...")
            createdb_command = [
                str(foldseek_bin),
                "createdb",
                str(input_dir),
                str(db_path),
                "--threads", str(n_jobs),
            ]
            # Context managers guarantee the pipe and the progress bar are
            # released even if reading the output fails half-way.
            with subprocess.Popen(
                createdb_command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            ) as process:
                logger.info("createdb process started")

                # Assumes Foldseek's textual progress bar is 65 '=' wide.
                total_equals_createdb = 65
                with tqdm(
                    total=total_equals_createdb,
                    desc="Creating Database",
                    unit="chars",
                ) as pbar_createdb:
                    for line in process.stdout:
                        log_file.write(line)
                        if line.startswith("[") and "]" in line:
                            # Mirror the number of '=' printed so far.
                            progress_bar = line[line.find("["):line.find("]") + 1]
                            pbar_createdb.n = progress_bar.count("=")
                            pbar_createdb.refresh()
                    process.wait()

            if process.returncode != 0:
                logger.error("Foldseek exited with an error during 'createdb'")
                raise RuntimeError("Foldseek 'createdb' exited with an error.")
            logger.info(f"Database created at {db_path}")

        # Step 2: all-vs-all search of the database against itself.
        with open(foldseek_log, "a") as log_file:
            logger.info("Starting easy-search...")
            command = [
                str(foldseek_bin),
                "easy-search",
                str(db_path),
                str(db_path),
                str(aln_output),
                str(output_dir),
                "--format-output", "query,target,fident,evalue",
                "--max-seqs", "10000",
                "--alignment-type", "2",
                "--tmscore-threshold", str(tmscore_threshold),
                "--threads", str(n_jobs),
                "--prostt5-model", str(prostt5_path),
            ]

            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            ) as process:
                logger.info("easy-search process started")

                # Assumes easy-search prints five 65-character progress bars.
                total_equals_easysearch = 5 * 65
                with tqdm(
                    total=total_equals_easysearch,
                    desc="Foldseek Progress",
                    unit="chars",
                ) as pbar_easysearch:
                    # The bar is redrawn on a single line, so the output is
                    # consumed one character at a time to see each '='.
                    while True:
                        char = process.stdout.read(1)
                        if not char:
                            break
                        log_file.write(char)
                        if char == "=":
                            pbar_easysearch.update(1)
                    process.wait()

            if process.returncode != 0:
                logger.error("Foldseek exited with an error in easy-search")
                raise RuntimeError("Foldseek exited with an error.")

        # Collect, per query, every target at or above the identity threshold.
        logger.info("Reading Foldseek alignment results")
        with open(aln_output, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank lines carry no hit; skip them instead of crashing.
                    continue
                query, target, fident, _evalue = fields[:4]
                if float(fident) >= fident_threshold:
                    identical_pairs.setdefault(query, {query}).add(target)

        logger.info("Grouping identical structures")
        groups = []
        visited = set()
        for key, group in identical_pairs.items():
            # Only the query itself is checked against `visited`; members of
            # the group may already appear in an earlier group.
            if key not in visited:
                groups.append(group)
                visited.update(group)

        logger.info(f"run_foldseek completed, found {len(groups)} groups of identical structures")
        return groups


def find_identical_in_directory_foldseek(input_dir, tmscore_threshold=0.5, fident_threshold=0.9, n_jobs=224):
    """
    Convenience wrapper around ``run_foldseek`` that logs the search settings
    before the run and the number of groups found after it.

    :param input_dir: Directory with input PDB files (converted to ``Path``).
    :param tmscore_threshold: TM-score threshold for Foldseek.
    :param fident_threshold: Identity threshold for grouping results.
    :param n_jobs: Number of threads for Foldseek.
    :return: The groups returned by ``run_foldseek``, unchanged.
    """
    input_dir = Path(input_dir)

    # Announce what is about to run.
    logger.info(f"Searching for identical structures in {input_dir} with Foldseek")
    logger.info(f"tmscore_threshold = {tmscore_threshold}, fident_threshold = {fident_threshold}, n_jobs = {n_jobs}")

    groups = run_foldseek(
        input_dir=input_dir,
        tmscore_threshold=tmscore_threshold,
        fident_threshold=fident_threshold,
        n_jobs=n_jobs,
    )

    # Report the outcome and hand the groups back untouched.
    logger.info(f"Search completed. Found {len(groups)} groups of identical structures.")
    return groups


In [4]:
# Run the Foldseek duplicate search on the test structures.
identical_groups = find_identical_in_directory_foldseek(
    "../lpce/tests/separated",
    tmscore_threshold=0.8,
    fident_threshold=0.8,
    n_jobs=112,
)
# Alternative invocation on another directory (disabled):
#identical_groups = find_identical_in_directory_foldseek("/mnt/ligandpro/db/LPCE/separated", tmscore_threshold=0.8, fident_threshold=0.8, n_jobs=64)

Searching for identical structures in ../lpce/tests/separated with Foldseek
tmscore_threshold = 0.8, fident_threshold = 0.8, n_jobs = 112
Starting run_foldseek
Input directory: ../lpce/tests/separated
Foldseek binary: /home/nikolenko/work/soft/foldseek/bin/foldseek
Creating Foldseek database...
createdb process started


Creating Database: 100%|██████████| 65/65 [00:00<00:00, 353.10chars/s]

Database created at /tmp/tmpdvi1dr6e/foldseek_db
Starting easy-search...
easy-search process started



Foldseek Progress: 100%|██████████| 325/325 [00:01<00:00, 199.54chars/s]

Reading Foldseek alignment results
Grouping identical structures
run_foldseek completed, found 121 groups of identical structures
Search completed. Found 121 groups of identical structures.





In [5]:
identical_groups

[{'4a22_bioml_1_TD4_chains_A_B_processed_A',
  '4a22_bioml_1_TD4_chains_A_B_processed_B'},
 {'5a2d_bioml_1_CHT_chains_A_processed',
  '5a2d_bioml_2_ETX_chains_D_processed'},
 {'4a1o_bioml_1_AMZ_chains_A_B_processed',
  '4a1o_bioml_1_AMZ_chains_A_B_processed_A',
  '4a1o_bioml_1_AMZ_chains_A_B_processed_B'},
 {'5a4k_bioml_1_FAD_chains_B_C_processed',
  '5a4k_bioml_1_FAD_chains_B_C_processed_B',
  '5a4k_bioml_1_FAD_chains_B_C_processed_C'},
 {'5a14_bioml_1_LQ5_chains_A_processed'},
 {'8a2v_bioml_1_FMN_chains_A_processed',
  '8a4e_bioml_1_FMA_chains_A_processed',
  '8a4e_bioml_1_FMN_chains_A_processed'},
 {'8a4g_bioml_1_CD_chains_A_processed'},
 {'1a47_bioml_1_ADH_chains_A_processed'},
 {'1a4g_bioml_1_ZMR_chains_A_processed'},
 {'4a2u_bioml_1_CMP_chains_A_B_processed_A',
  '4a2u_bioml_1_CMP_chains_A_B_processed_B'},
 {'7a1u_bioml_1_FUA_chains_A_processed'},
 {'7a1j_bioml_1_ZN_chains_A_processed',
  '7a1o_bioml_1_ZN_chains_A_B_processed',
  '7a1o_bioml_1_ZN_chains_A_B_processed_A',
  '7a1o_

In [1]:
import json
# Requires `identical_groups` from the Foldseek search cell to exist in the
# current kernel session. Each group is written as a sorted list (sets are not
# JSON-serializable) so the file content is reproducible between runs instead
# of depending on set iteration order.
identical_groups_json_compatible = [sorted(group) for group in identical_groups]
with open("../data/identical_groups.json", "w") as f:
    json.dump(identical_groups_json_compatible, f)


NameError: name 'identical_groups' is not defined

In [5]:
import os
import json
from collections import defaultdict
from loguru import logger

def extract_het_and_chain_identifiers(filename: str) -> tuple[str, str] | tuple[None, None]:
    try:
        chain_tag = "_chains_"
        processed_tag = "_processed"
        
        start_bioml = filename.index("bioml_") + len("bioml_")

        pos_underscore = filename.index('_', start_bioml)

        start_ligand = pos_underscore + 1
        
        chain_start = filename.index(chain_tag, start_ligand)
        ligand_part = filename[start_ligand:chain_start]
        #logger.info(f"ligand_part: {ligand_part}")

        chain_identifier_start = chain_start + len(chain_tag)
        processed_pos = filename.index(processed_tag, chain_identifier_start)
        chain_identifier = filename[chain_identifier_start:processed_pos]
        
        return (ligand_part, chain_identifier)
    except ValueError:
        return None, None


def get_resolution_from_pdb(pdb_file_path):
    """
    Read the resolution from the first ``REMARK   2 RESOLUTION.`` line of a
    PDB file.

    If one of the whitespace-separated tokens on that line contains
    ``ANGSTROM``, the token before it is taken as the value; otherwise the
    first token after ``RESOLUTION.`` is used.

    :param pdb_file_path: Path of the PDB file to inspect.
    :return: The resolution as ``float``, or ``None`` when the file has no such
        line, the value is not numeric, or reading fails (failures are logged).
    """
    marker = 'REMARK   2 RESOLUTION.'
    try:
        # Only the first matching record matters; stop reading once found.
        with open(pdb_file_path, 'r') as handle:
            record = next((row for row in handle if row.startswith(marker)), None)
        if record is None:
            return None

        tokens = record.strip().split()
        unit_index = next(
            (pos for pos, token in enumerate(tokens) if 'ANGSTROM' in token.upper()),
            None,
        )
        if unit_index is not None:
            raw_value = tokens[unit_index - 1]
        else:
            # No unit on the line: fall back to the text after the marker.
            raw_value = record.strip().split('RESOLUTION.')[-1].split()[0]

        try:
            return float(raw_value)
        except ValueError:
            # Non-numeric value: no usable resolution.
            return None
    except Exception as e:
        logger.error(f"Ошибка при чтении разрешения из {pdb_file_path}: {e}")
        return None

def remove_duplicate_groups(groups):
    """
    Drop every group that is equal to, or a subset of, a group already kept.

    Groups are visited from largest to smallest (ties broken by their sorted
    members), so a smaller group is always compared against the larger ones.

    :param groups: Iterable of collections of sortable, hashable items.
    :return: List of sets, in the order they were kept.
    """
    ordered = [sorted(members) for members in groups]
    ordered.sort(key=lambda members: (-len(members), members))

    kept = []
    for members in ordered:
        candidate = set(members)
        # Keep the candidate only if no retained group already covers it.
        if not any(candidate <= existing for existing in kept):
            kept.append(candidate)
    return kept

def process_groups_with_resolution(identical_groups, pdb_directory):
    """
    Reduce every group of identical structures to one representative per
    (ligand, chain) combination, preferring the lowest resolution value.

    Names are first normalised by cutting everything after ``_processed``;
    groups that then become subsets of other groups are dropped. Inside each
    remaining group the members are bucketed by the pair returned from
    ``extract_het_and_chain_identifiers`` and the member with the smallest
    resolution (read from ``<pdb_directory>/<name>.pdb``) is selected. When no
    member of a bucket has a readable resolution, its first member is used.

    Members are processed in sorted order so that ties and the fallback choice
    are reproducible; the result is used to decide which files are deleted.

    :param identical_groups: Iterable of collections of structure names.
    :param pdb_directory: Directory holding the ``.pdb`` files.
    :return: List of sets with the selected structure names, one set per group.
    """
    processed_groups = [
        {file_name.split("_processed")[0] + "_processed" for file_name in group}
        for group in identical_groups
    ]

    unique_groups = remove_duplicate_groups(processed_groups)
    final_groups = []

    for group in unique_groups:
        het_chain_groups = defaultdict(list)
        # Sorted iteration: set order varies between interpreter runs.
        for file_name in sorted(group):
            # NOTE(review): names that cannot be parsed all share the key
            # (None, None) and are therefore reduced to a single entry --
            # confirm this is intended.
            key = extract_het_and_chain_identifiers(file_name)
            het_chain_groups[key].append(file_name)

        selected_structures = []
        for structures in het_chain_groups.values():
            best_structure = None
            best_resolution = None
            for structure in structures:
                pdb_file_path = os.path.join(pdb_directory, f"{structure}.pdb")
                if not os.path.exists(pdb_file_path):
                    logger.warning(f"Файл {pdb_file_path} не найден.")
                    continue
                resolution = get_resolution_from_pdb(pdb_file_path)
                if resolution is None:
                    logger.warning(f"Разрешение не найдено для {structure}")
                    continue
                # Strict '<' keeps the first (alphabetically smallest) on ties.
                if best_resolution is None or resolution < best_resolution:
                    best_resolution = resolution
                    best_structure = structure
            if best_structure:
                selected_structures.append(best_structure)
            else:
                # No usable resolution in this bucket: keep its first member.
                selected_structures.append(structures[0])
        final_groups.append(set(selected_structures))
    return final_groups

def remove_similar_structures(pickle_path, pdb_directory):
    """
    Delete redundant ``.pdb`` files from ``pdb_directory``.

    The groups of identical structures are loaded from a JSON file, reduced to
    their representatives with ``process_groups_with_resolution``, and every
    ``.pdb`` file in the directory whose name is not one of the representatives
    is removed. A summary is logged at the end.

    NOTE(review): files that are not mentioned in any group are deleted as
    well -- confirm the groups file always covers the whole directory.

    :param pickle_path: Path to the JSON file with the groups (despite the
        parameter name, the file is read with ``json``).
    :param pdb_directory: Directory whose ``.pdb`` files are filtered in place.
    :return: None.
    """
    # Context manager so the file handle is closed deterministically.
    with open(pickle_path) as groups_file:
        identical_groups = json.load(groups_file)

    initial_count = sum(len(group) for group in identical_groups)
    final_groups = process_groups_with_resolution(identical_groups, pdb_directory)

    total_files = {file + ".pdb" for group in final_groups for file in group}

    final_count = len(total_files)
    deleted_files = 0

    for filename in os.listdir(pdb_directory):
        # Remove the file if its .pdb name is not among the selected ones.
        if filename.endswith(".pdb") and filename not in total_files:
            os.remove(os.path.join(pdb_directory, filename))
            deleted_files += 1

    # Guard against an empty run (nothing kept, nothing deleted).
    total_considered = deleted_files + final_count
    deleted_percentage = (deleted_files / total_considered) * 100 if total_considered else 0.0
    logger.info(f"На входе было {initial_count} структур.")
    logger.info(f"Осталось {final_count} уникальных структур.")
    logger.info(f"Удалено {deleted_files} файлов, что составляет {deleted_percentage:.2f}% от общего числа.")

# Inputs for the clean-up run: the saved groups and the directory to filter.
identical_groups_path = "../data/identical_groups.json"
pdb_directory = "../lpce/tests/separated"

# WARNING: this call deletes .pdb files from `pdb_directory` in place.
remove_similar_structures(
    pickle_path=identical_groups_path,
    pdb_directory=pdb_directory,
)

Файл ../lpce/tests/separated/212l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/249l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/246l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/187l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/183l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/242l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/257l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/184l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/244l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/186l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/248l_bioml_1_HED_chains_A_processed.pdb не найден.
Файл ../lpce/tests/separated/148l_bioml_1_MUB_chains_E_S_processed.pdb не найден.
Файл ../lpce/tests/separated/2a41_biom