In [1]:

import subprocess
import sys
import os
from pathlib import Path

from loguru import logger
from hydra import compose, initialize

# Импорт нужных функций
from lpce.cleanup.remove_dna_rna import remove_dna_rna_from_directory
from lpce.cleanup.remove_multiple_models import remove_multiple_models_from_directory
from lpce.cleanup.remove_water import remove_water_from_directory
from lpce.cleanup.remove_junk_ligands import remove_junk_ligands_from_directory
from lpce.extraction.convert_pdb_to_smiles_sdf import convert_pdb_to_smiles_sdf
from lpce.extraction.parse_dict import extract_and_save_complexes_with_ligands
from lpce.cleanup.remove_empty_structures import remove_unused_pdb_files
from lpce.pdb_manipulations.split_bioml import bioml_split
from lpce.pdb_manipulations.protein_ligand_separator import protein_ligand_separator
from lpce.utils.clean_names import clean_multiple_paths
from lpce.pdb_manipulations.remove_not_buried_ligands import remove_not_buried_ligands
from lpce.pdb_manipulations.clash_ligands import split_overlapping_ligands
from lpce.pdb_manipulations.add_h_to_ligands import add_h_to_ligands
from lpce.pdb_manipulations.split2file import create_final_files
from lpce.cleanup.filter_ligands import filter_ligands
from lpce.utils.utils import save_removed_files_to_json
from lpce.utils.send_email import send_email_notification

In [2]:

config_name = "BindingMOAD"
config_path = "lpce/config"

with initialize(config_path=config_path, version_base=None):
    cfg = compose(config_name=config_name)

logger.remove()
logger.add(sys.stdout, format="{message}", level="INFO")

5

In [3]:
!rm /mnt/ligandpro/db/LPCE_BindingMOAD/processed/*
!rm /mnt/ligandpro/db/LPCE_BindingMOAD/bioml/*
!rm /mnt/ligandpro/db/LPCE_BindingMOAD/separated/*
!rm /mnt/ligandpro/db/LPCE_BindingMOAD/final/*

/bin/bash: line 1: /usr/bin/rm: Argument list too long
/bin/bash: line 1: /usr/bin/rm: Argument list too long
/bin/bash: line 1: /usr/bin/rm: Argument list too long
rm: cannot remove '/mnt/ligandpro/db/LPCE_BindingMOAD/final/*': No such file or directory


In [4]:
from pathlib import Path
from tqdm import tqdm
from joblib import Parallel, delayed

# Пути из конфига
raw_dir = Path(cfg.paths.raw_dir)
processed_dir = Path(cfg.paths.processed_dir)

# Каталоги с белками и лигандами
protein_dir = raw_dir / "pdb_protein"
ligand_dir = raw_dir / "pdb_superligand"

# Создаём выходной каталог, если его нет
processed_dir.mkdir(parents=True, exist_ok=True)

def process_protein_file(protein_file: Path) -> None:
    # Пропускаем скрытые файлы
    if protein_file.name.startswith("."):
        return

    # Обрабатываем только файлы с '_1_protein'
    if "_1_protein" not in protein_file.name:
        return

    # Извлекаем полный идентификатор, например "10gs_1"
    full_id = protein_file.name.split("_protein")[0]
    # Если идентификатор не заканчивается на '_1', пропускаем
    if not full_id.endswith("_1"):
        return

    # Для итогового файла убираем суффикс '_1' -> "10gs"
    base_id = full_id[:-2]

    # Ищем ligand-файлы по полному идентификатору, например "10gs_1_superlig_*.pdb"
    ligand_files = sorted([
        lf for lf in ligand_dir.glob(f"{full_id}_superlig_*.pdb")
        if not lf.name.startswith(".")
    ])

    try:
        combined_file = processed_dir / f"{base_id}.pdb"
        with open(combined_file, "w", encoding="utf-8") as outfile:
            # Записываем содержимое белка
            with open(protein_file, "r", encoding="utf-8") as pf:
                outfile.write(pf.read().strip() + "\n")
            # Дописываем лиганд-файлы, если они найдены
            for ligand_file in ligand_files:
                with open(ligand_file, "r", encoding="utf-8") as lf:
                    outfile.write(lf.read().strip() + "\n")
            outfile.write("END\n")
    except Exception as e:
        print(f"Ошибка при обработке {protein_file}: {e}")

# Получаем только файлы с '_1_protein', пропуская скрытые
protein_files = [
    pf for pf in protein_dir.glob("*.pdb")
    if not pf.name.startswith(".") and "_1_protein" in pf.name
]
print(f"Найдено белковых файлов: {len(protein_files)}")

# Параллельная обработка
Parallel(n_jobs=cfg.n_jobs)(
    delayed(process_protein_file)(pf) for pf in tqdm(protein_files, desc="Объединение белков и лигандов")
)


Найдено белковых файлов: 49226


Объединение белков и лигандов: 100%|██████████| 49226/49226 [03:09<00:00, 260.01it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [5]:
dna_rna = remove_dna_rna_from_directory(cfg)

Removing DNA/RNA: 100%|██████████| 132883/132883 [00:05<00:00, 24193.80it/s]



Total files analyzed: 132,883
Files removed: 0
Files retained: 132,883
Percentage of files retained: 100.00%


In [6]:
models = remove_multiple_models_from_directory(cfg)


Total PDB files to analyze: 132883


Removing multiple models: 100%|██████████| 132883/132883 [00:01<00:00, 124544.00it/s]


Total files analyzed: 132,883
Files retained after removal: 132,860
Files removed: 23
Percentage of files retained: 99.98%


In [7]:
_ = remove_water_from_directory(cfg)


Found 132860 PDB files in /mnt/ligandpro/db/LPCE_BindingMOAD/processed


Removing water: 100%|██████████| 132860/132860 [05:59<00:00, 369.23file/s]

Total structures processed: 132860
Successfully processed: 132860
Failed to process: 0





In [8]:
_ = remove_junk_ligands_from_directory(cfg)


Found 132860 PDB files in /mnt/ligandpro/db/LPCE_BindingMOAD/processed


Removing junk ligands: 100%|██████████| 132860/132860 [00:05<00:00, 25088.63file/s]


Total structures processed: 132860
Successfully processed: 132860
Failed to process: 0
Total ligands removed: 0


In [9]:
# convert_pdb_to_smiles_sdf(cfg)

In [10]:
pwd

'/home/nikolenko/work/Projects/LPCE'

In [11]:
_ =filter_ligands(cfg)


Total proteins analyzed: 6,110
Total ligands analyzed: 17,422
Proteins with site info available: 0
Relevant ligands found in sites: 0
Ligands removed during filtering: 0
Percentage of ligands removed: 0.0%
Ligands remaining after filtering: 17,422


In [12]:
unused = remove_unused_pdb_files(cfg)

Skipping unexpected file format: 4e3g_1.pdb
Skipping unexpected file format: 4s2a_1.pdb
Skipping unexpected file format: 3efw_2.pdb
Skipping unexpected file format: 2bkx_2.pdb
Skipping unexpected file format: 2z8g_1.pdb
Skipping unexpected file format: 2nm1_1.pdb
Skipping unexpected file format: 3u3z_1.pdb
Skipping unexpected file format: 4xj3_1.pdb
Skipping unexpected file format: 5j4v_0.pdb
Skipping unexpected file format: 2f1b_0.pdb
Skipping unexpected file format: 4hzm_0.pdb
Skipping unexpected file format: 6q0u_2.pdb
Skipping unexpected file format: 4qn7_0.pdb
Skipping unexpected file format: 3qs8_0.pdb
Skipping unexpected file format: 6uqd_1.pdb
Skipping unexpected file format: 4ply_2.pdb
Skipping unexpected file format: 6rxr_3.pdb
Skipping unexpected file format: 2wpw_2.pdb
Skipping unexpected file format: 4oz1_1.pdb
Skipping unexpected file format: 4jat_1.pdb
Skipping unexpected file format: 1tjj_1.pdb
Skipping unexpected file format: 4hzw_1.pdb
Skipping unexpected file format:

In [None]:
import shutil
from pathlib import Path
from loguru import logger
from tqdm import tqdm

def bioml_split_dummy(cfg):
    input_dir = Path(cfg.paths.processed_dir)
    output_dir = Path(cfg.paths.bioml_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    pdb_files = list(input_dir.glob("*.pdb"))

    logger.info(f"Найдено {len(pdb_files)} файлов в {input_dir}")

    for pdb_file in tqdm(pdb_files, desc="Копирование файлов"):
        new_filename = pdb_file.stem + "_bioml_1.pdb"  # Добавляем `_bioml_1.pdb`
        new_path = output_dir / new_filename

        shutil.copy(pdb_file, new_path)

    logger.info(f"Готово! Все файлы скопированы в {output_dir}")

bioml_split_dummy(cfg)


Найдено 83657 файлов в /mnt/ligandpro/db/LPCE_BindingMOAD/processed


Копирование файлов: 100%|██████████| 83657/83657 [03:03<00:00, 456.27it/s]

Готово! Все файлы скопированы в /mnt/ligandpro/db/LPCE_BindingMOAD/bioml





In [14]:
# %% Импорт необходимых модулей и определение функции
from pathlib import Path

def remove_empty_files(directory: Path) -> None:
    """Удаляет все пустые файлы с расширением .pdb в указанной директории."""
    for pdb_file in directory.glob("*.pdb"):
        if pdb_file.stat().st_size == 0:
            print(f"Удаляю пустой файл: {pdb_file}")
            pdb_file.unlink()

directory = Path("/mnt/ligandpro/db/LPCE_BindingMOAD/bioml")

if not directory.is_dir():
    print("Указанный путь не является директорией")
else:
    remove_empty_files(directory)


Удаляю пустой файл: /mnt/ligandpro/db/LPCE_BindingMOAD/bioml/._4dhl_2_bioml_1.pdb


In [15]:
protein_ligand_separator(cfg)

Separating ligand pockets in PDB files: 100%|██████████| 83656/83656 [08:49<00:00, 158.00it/s]



Total PDB files found: 83656
Total similar structures skipped: 120511
Structures remaining after filtering: -57885
Total structures SAVED: 62626


In [16]:
clean_multiple_paths(cfg)

In [17]:
not_buried = remove_not_buried_ligands(cfg)

100%|██████████| 62621/62621 [02:42<00:00, 385.71it/s]



Total buried ligands: 62160
Total not buried ligands: 461
Deleted 461 files, which is 0.74% of the total.


In [18]:
split_overlapping_ligands(cfg)




100%|██████████| 62160/62160 [00:43<00:00, 1415.99it/s]


Total files processed: 62160
Files with split ligands: 50
Created new files: 101


{'split_files': ['1xbz_2_bioml_1_MG_chains_A_processed.pdb',
  '1xbz_2_bioml_1_MG_chains_A_processed.pdb',
  '1gnn_1_bioml_1_U0E_chains_A_B_processed.pdb',
  '1gnn_1_bioml_1_U0E_chains_A_B_processed.pdb',
  '5fv9_2_bioml_1_UDP_chains_B_processed.pdb',
  '5fv9_2_bioml_1_Y6W_chains_B_processed.pdb',
  '1htf_1_bioml_1_G26_chains_A_B_processed.pdb',
  '1htf_1_bioml_1_G26_chains_A_B_processed.pdb',
  '5fv9_4_bioml_1_Y6W_chains_F_processed.pdb',
  '5fv9_4_bioml_1_UDP_chains_F_processed.pdb',
  '1gt1_1_bioml_1_ANC_chains_A_processed.pdb',
  '1gt1_1_bioml_1_PRZ_chains_A_processed.pdb',
  '6atr_2_bioml_1_GSN_chains_B_processed.pdb',
  '6atr_2_bioml_1_BWS_chains_B_processed.pdb',
  '2var_1_bioml_1_ANP_chains_B_processed.pdb',
  '2var_1_bioml_1_AMP_chains_B_processed.pdb',
  '1hm0_2_bioml_1_CA_chains_A_B_C_processed.pdb',
  '1hm0_2_bioml_1_CA_chains_A_B_C_processed.pdb',
  '1hm0_2_bioml_1_CA_chains_A_B_C_processed.pdb',
  '4cq5_1_bioml_1_TCA_chains_B_C_D_processed.pdb',
  '4cq5_1_bioml_1_TCA_chai

In [19]:
create_final_files(cfg)


Found 62156 PDB files to process.


Processing PDB files: 100%|██████████| 62156/62156 [02:43<00:00, 380.11it/s]


Processing completed.
