In [1]:

import subprocess
import sys
from pathlib import Path

from loguru import logger
from hydra import compose, initialize

# Импорт нужных функций
from lpce.cleanup.remove_dna_rna import remove_dna_rna_from_directory
from lpce.cleanup.remove_multiple_models import remove_multiple_models_from_directory
from lpce.cleanup.remove_water import remove_water_from_directory
from lpce.cleanup.remove_junk_ligands import remove_junk_ligands_from_directory
from lpce.extraction.parse_dict import extract_and_save_complexes_with_ligands
from lpce.cleanup.remove_empty_structures import remove_unused_pdb_files
from lpce.pdb_manipulations.protein_ligand_separator import protein_ligand_separator
from lpce.utils.clean_names import clean_multiple_paths
from lpce.pdb_manipulations.remove_not_buried_ligands import remove_not_buried_ligands
from lpce.pdb_manipulations.clash_ligands import split_overlapping_ligands
from lpce.pdb_manipulations.split2file import create_final_files
from lpce.cleanup.filter_ligands import filter_ligands

In [2]:

# Hydra configuration: `config_name` selects the config to compose from the
# `config_path` directory. The resulting `cfg` is passed to every pipeline stage below.
config_name = "PDBBind"
config_path = "lpce/config"

with initialize(config_path=config_path, version_base=None):
    cfg = compose(config_name=config_name)

# Drop the currently registered loguru sinks and install a single stdout sink that prints
# bare messages at INFO level, so stage summaries appear directly under their cells.
logger.remove()
# logger.add() returns the new handler id; the trailing semicolon keeps the notebook from
# echoing that number as the cell's output.
logger.add(sys.stdout, format="{message}", level="INFO");

5

In [3]:
!rm /mnt/ligandpro/db/LPCE_PDBBind/processed/*
!rm -r /mnt/ligandpro/db/LPCE_PDBBind/ligands/*
!rm /mnt/ligandpro/db/LPCE_PDBBind/bioml/*
!rm /mnt/ligandpro/db/LPCE_PDBBind/separated/*
!rm /mnt/ligandpro/db/LPCE_PDBBind/final/*

rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/ligands/*': No such file or directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/10gs_bioml_1_UNK_chains_A_B': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/11gs_bioml_1_UNK_chains_A_B': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/13gs_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/16pk_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/184l_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/185l_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/186l_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/187l_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove '/mnt/ligandpro/db/LPCE_PDBBind/final/188l_bioml_1_UNL_chains_A': Is a directory
rm: cannot remove

In [4]:
from joblib import Parallel, delayed
from tqdm import tqdm

# Directories taken from the Hydra config: per-entry folders are read from `input_path`,
# merged complexes are written to `processed_path`.
input_path = Path(cfg.paths.raw_dir)
processed_path = Path(cfg.paths.processed_dir)
# Create the output directory together with any missing parents; no error if it exists.
processed_path.mkdir(exist_ok=True, parents=True)

def process_subdir(subdir: Path) -> None:
    """Merge one entry folder (protein PDB + ligand SDF) into a single PDB file.

    The folder name is used as the PDB ID. ``<id>_ligand.sdf`` is converted to PDB with
    the ``obabel`` command-line tool and appended to ``<id>_protein_processed.pdb``; the
    result is written to ``processed_path/<id>.pdb`` and terminated with an ``END`` line.

    The entry is skipped silently (no output file is written) when ``subdir`` is not a
    directory, when either input file is missing, or when the conversion leaves no
    non-empty output file. The temporary ligand PDB is removed in every case.

    Args:
        subdir: Folder holding the files of a single entry.

    Raises:
        FileNotFoundError: Propagated from ``subprocess.run`` if ``obabel`` is not on PATH.
    """
    if not subdir.is_dir():
        return
    pdb_id: str = subdir.name  # folder name == PDB ID
    protein_pdb: Path = subdir / f"{pdb_id}_protein_processed.pdb"
    ligand_sdf: Path = subdir / f"{pdb_id}_ligand.sdf"
    if not (protein_pdb.exists() and ligand_sdf.exists()):
        return

    # The temporary file lives in the output directory, which later stages scan with
    # glob("*.pdb"); it therefore must never outlive this call.
    ligand_pdb: Path = processed_path / f"{pdb_id}_ligand_temp.pdb"
    try:
        # SDF -> PDB conversion (`-O` output file; `-d` deletes hydrogens per the
        # Open Babel CLI options). The exit status is not inspected: success is judged
        # by the output file below.
        subprocess.run(
            ["obabel", str(ligand_sdf), "-O", str(ligand_pdb), "-d"],
            capture_output=True,
            text=True,
            shell=False,
        )

        # A missing or zero-byte output means the conversion failed; skip this entry.
        if not (ligand_pdb.exists() and ligand_pdb.stat().st_size > 0):
            return

        final_pdb: Path = processed_path / f"{pdb_id}.pdb"
        with open(final_pdb, "w") as out_f:
            # Each part is stripped and re-terminated so the two files join on a clean
            # line boundary regardless of trailing blank lines.
            with open(protein_pdb) as prot_f:
                out_f.write(prot_f.read().strip() + "\n")
            with open(ligand_pdb) as lig_f:
                out_f.write(lig_f.read().strip() + "\n")
            out_f.write("END\n")
    finally:
        # Also runs on the early return above and on errors, so no `*_ligand_temp.pdb`
        # is left behind in the processed directory.
        ligand_pdb.unlink(missing_ok=True)

# Only directories are candidate entries; loose files in the raw directory are ignored.
subdirs = [entry for entry in input_path.iterdir() if entry.is_dir()]
# One delayed merge call per entry folder. tqdm wraps the task source, so the progress bar
# advances as tasks are pulled by joblib.
merge_tasks = (
    delayed(process_subdir)(entry_dir)
    for entry_dir in tqdm(subdirs, desc="Обработка PDB файлов")
)
# Run with 56 joblib workers; process_subdir returns None, so the result list is discarded.
_ = Parallel(n_jobs=56)(merge_tasks)


Обработка PDB файлов: 100%|██████████| 19119/19119 [00:07<00:00, 2543.53it/s]


In [5]:
# DNA/RNA cleanup stage (helper from lpce.cleanup.remove_dna_rna, driven only by `cfg`).
# The return value is kept in `dna_rna`; no later cell visible here reads it.
dna_rna = remove_dna_rna_from_directory(cfg)

Removing DNA/RNA: 100%|██████████| 19119/19119 [00:01<00:00, 19036.61it/s]



Total files analyzed: 19,119
Files removed: 0
Files retained: 19,119
Percentage of files retained: 100.00%


In [6]:
# Multi-model cleanup stage (helper from lpce.cleanup.remove_multiple_models).
# The return value is kept in `models`; no later cell visible here reads it.
models = remove_multiple_models_from_directory(cfg)


Total PDB files to analyze: 19119


Removing multiple models: 100%|██████████| 19119/19119 [00:00<00:00, 84939.98it/s]


Total files analyzed: 19,119
Files retained after removal: 19,119
Files removed: 0
Percentage of files retained: 100.00%


In [7]:
# Water removal stage; the return value is deliberately discarded.
# NOTE(review): presumably rewrites the files in cfg.paths.processed_dir in place — confirm.
_ = remove_water_from_directory(cfg)


Found 19119 PDB files in /mnt/ligandpro/db/LPCE_PDBBind/processed


Removing water: 100%|██████████| 19119/19119 [00:50<00:00, 380.11file/s]

Total structures processed: 19119
Successfully processed: 19119
Failed to process: 0





In [8]:
# Junk-ligand removal stage; the return value is deliberately discarded.
# NOTE(review): which residues count as "junk" is defined inside
# lpce.cleanup.remove_junk_ligands — not visible from this notebook.
_ = remove_junk_ligands_from_directory(cfg)


Found 19119 PDB files in /mnt/ligandpro/db/LPCE_PDBBind/processed


Removing junk ligands: 100%|██████████| 19119/19119 [00:00<00:00, 30635.17file/s]


Total structures processed: 19119
Successfully processed: 19119
Failed to process: 0
Total ligands removed: 0


In [9]:
# convert_pdb_to_smiles_sdf(cfg)

In [10]:
# Extract the protein-ligand complexes from the cleaned structures.
# NOTE(review): presumably this writes the grouped-complexes / site-info data that
# filter_ligands and the later stages consume — confirm in lpce.extraction.parse_dict.
extract_and_save_complexes_with_ligands(cfg)


Starting to process 19119 PDB files...


Processing files: 100%|██████████| 19119/19119 [00:20<00:00, 937.35file/s] 


Completed processing of 19119 PDB files.
Total complexes: 17828
Complexes saved to data_PDBBind/grouped_complexes.json
Total grouped complexes: 17828


100%|██████████| 19119/19119 [00:00<00:00, 19633.67it/s]


Complexes saved to data_PDBBind/site_info.json
Total pdb sites info: 0


In [11]:
# Ligand filtering stage; the return value is deliberately discarded.
_ = filter_ligands(cfg)


Total proteins analyzed: 17,828
Total ligands analyzed: 20,949
Proteins with site info available: 0
Relevant ligands found in sites: 0
Ligands removed during filtering: 0
Percentage of ligands removed: 0.0%
Ligands remaining after filtering: 20,949


In [12]:
# Remove PDB files that are no longer referenced after extraction/filtering (per the helper's
# name — exact criteria live in lpce.cleanup.remove_empty_structures).
# The return value is kept in `unused`; no later cell visible here reads it.
unused = remove_unused_pdb_files(cfg)


Total PDB files in directory: 19,119
Filtered PDB files to keep: 17,828
PDB files removed: 1,291


In [13]:
import shutil
from pathlib import Path
from loguru import logger
from tqdm import tqdm

def bioml_split_dummy(cfg):
    """Stand-in for a bioassembly split: copy every processed PDB file unchanged.

    Every ``*.pdb`` directly inside ``cfg.paths.processed_dir`` is copied into
    ``cfg.paths.bioml_dir`` under the name ``<stem>_bioml_1.pdb``. The destination
    directory is created when missing. File contents are not modified.

    Args:
        cfg: Config object exposing ``paths.processed_dir`` and ``paths.bioml_dir``.
    """
    src_dir = Path(cfg.paths.processed_dir)
    dst_dir = Path(cfg.paths.bioml_dir)
    dst_dir.mkdir(parents=True, exist_ok=True)

    sources = list(src_dir.glob("*.pdb"))
    logger.info(f"Найдено {len(sources)} файлов в {src_dir}")

    for src in tqdm(sources, desc="Копирование файлов"):
        # The `_bioml_1` suffix is presumably the naming later stages expect for the
        # first (here: only) assembly — confirm against the real bioassembly splitter.
        shutil.copy(src, dst_dir / f"{src.stem}_bioml_1.pdb")

    logger.info(f"Готово! Все файлы скопированы в {dst_dir}")

# Run the pass-through step: each processed structure is copied to the bioml directory as
# `<stem>_bioml_1.pdb`.
bioml_split_dummy(cfg)


Найдено 17828 файлов в /mnt/ligandpro/db/LPCE_PDBBind/processed


Копирование файлов: 100%|██████████| 17828/17828 [00:05<00:00, 2973.23it/s]

Готово! Все файлы скопированы в /mnt/ligandpro/db/LPCE_PDBBind/bioml





In [14]:
# Protein/ligand separation stage (helper from lpce.pdb_manipulations.protein_ligand_separator).
# NOTE(review): presumably reads the bioml directory filled by the previous cell — confirm.
protein_ligand_separator(cfg)

Separating ligand pockets in PDB files: 100%|██████████| 17828/17828 [01:07<00:00, 263.18it/s]



Total PDB files found: 17828
Total similar structures skipped: 3067
Structures remaining after filtering: 14656
Total structures SAVED: 17723


In [15]:
# Name clean-up stage (helper from lpce.utils.clean_names).
# NOTE(review): which paths are touched and how names are rewritten is not visible here.
clean_multiple_paths(cfg)

In [16]:
# Remove ligands that are not buried (per the helper's name; the burial criterion lives in
# lpce.pdb_manipulations.remove_not_buried_ligands).
# The return value is kept in `not_buried`; no later cell visible here reads it.
not_buried = remove_not_buried_ligands(cfg)

100%|██████████| 17723/17723 [00:33<00:00, 528.85it/s]



Total buried ligands: 17695
Total not buried ligands: 28
Deleted 28 files, which is 0.16% of the total.


In [17]:
# Split overlapping (clashing) ligands into separate files.
# Left as the cell's last expression so the returned summary is displayed below the cell.
split_overlapping_ligands(cfg)




100%|██████████| 17695/17695 [00:16<00:00, 1069.62it/s]


Total files processed: 17695
Files with split ligands: 1
Created new files: 2


{'split_files': ['4x6h_bioml_1_UNL_chains_A_processed.pdb',
  '4x6h_bioml_1_UNL_chains_A_processed.pdb']}

In [18]:
# Final stage: write the final output files (helper from lpce.pdb_manipulations.split2file).
# NOTE(review): presumably populates the final/ directory cleared at the top of the notebook — confirm.
create_final_files(cfg)


Found 17694 PDB files to process.


Processing PDB files: 100%|██████████| 17694/17694 [01:38<00:00, 179.83it/s]


Processing completed.
