In [None]:
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm
from pdbfixer import PDBFixer
from openmm.app import PDBFile

def fix_pdb_file(input_file, output_file):
    """Repair a single PDB structure in-process with PDBFixer and save it.

    The repair steps run in a fixed order: non-standard residues are found
    and replaced, missing residues and missing atoms are detected, missing
    atoms are added, and hydrogens are added with the value 7.0 (the pH).

    Args:
        input_file: Path of the PDB file to read.
        output_file: Path the repaired structure is written to.
    """
    pdb_fixer = PDBFixer(filename=str(input_file))

    # Residue-level repairs.
    pdb_fixer.findNonstandardResidues()
    pdb_fixer.replaceNonstandardResidues()
    pdb_fixer.findMissingResidues()

    # Atom-level repairs; the "find" call precedes the "add" call.
    pdb_fixer.findMissingAtoms()
    pdb_fixer.addMissingAtoms()
    pdb_fixer.addMissingHydrogens(7.0)

    with open(output_file, 'w') as handle:
        PDBFile.writeFile(pdb_fixer.topology, pdb_fixer.positions, handle)

def fix_pdb_folder(input_dir, output_dir):
    """Fix every ``*.pdb`` file directly inside ``input_dir`` in parallel.

    Each file is passed to ``fix_pdb_file`` and the result is written to
    ``output_dir`` under the same file name. ``output_dir`` is created
    (including parents) if it does not exist.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdb`` files.
        output_dir: Directory that receives the fixed files.
    """
    source_dir = Path(input_dir)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    pdb_files = list(source_dir.glob("*.pdb"))

    # n_jobs=-1 is joblib's setting for using all available CPUs.
    runner = Parallel(n_jobs=-1)
    jobs = (
        delayed(fix_pdb_file)(src, target_dir / src.name)
        for src in tqdm(pdb_files, desc="Processing PDB files")
    )
    runner(jobs)

# Notebook driver: fix every *.pdb file found in input_dir and write the
# results to output_dir (a relative path, so it is resolved against the
# current working directory).
input_dir = '/mnt/ligandpro/db/LPCE/separated/'
output_dir = 'output_folder_fixed'
fix_pdb_folder(input_dir, output_dir)

In [None]:
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from loguru import logger
import shutil
import subprocess


def fix_pdb_file_cli(input_file, output_file):
    """Fix a single PDB file by invoking the ``pdbfixer`` command-line tool.

    The tool is asked to replace non-standard residues, add missing residues
    and all missing atoms, keep all heterogens, and use pH 7.0.

    Args:
        input_file: Path of the PDB file to fix.
        output_file: Path the fixed structure is written to.

    Returns:
        True if ``pdbfixer`` exited with status 0, False if it exited with a
        non-zero status (the failure is logged together with the exit code
        and the last line the tool wrote to stderr).

    Raises:
        FileNotFoundError: If the ``pdbfixer`` executable cannot be found.
    """
    command = [
        "pdbfixer",
        str(input_file),
        "--output=" + str(output_file),
        "--replace-nonstandard",
        "--add-residues",
        "--add-atoms=all",
        "--keep-heterogens=all",
        "--ph=7.0"
    ]
    try:
        # stderr is captured rather than discarded so that the reason for a
        # failure can be reported; undecodable bytes are replaced so that a
        # strange error message can never raise UnicodeDecodeError here.
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
    except subprocess.CalledProcessError as exc:
        stderr_lines = (exc.stderr or "").strip().splitlines()
        # The last stderr line is normally the exception message itself.
        reason = stderr_lines[-1] if stderr_lines else "no stderr output"
        logger.error(
            f"Error processing file: {input_file} "
            f"(exit code {exc.returncode}): {reason}"
        )
        return False
    return True

def process_file(pdb_file, output_path, error_path):
    """Fix one PDB file and keep a copy of the input if fixing fails.

    Args:
        pdb_file: ``Path`` of the PDB file to fix.
        output_path: Directory (``Path``) that receives the fixed file.
        error_path: Directory (``Path``) that receives a copy of the original
            file when fixing fails.

    Returns:
        True if the file was fixed successfully, False otherwise. The caller
        relies on this value to count successes and failures.
    """
    output_file = output_path / pdb_file.name
    succeeded = fix_pdb_file_cli(pdb_file, output_file)
    if not succeeded:
        # Preserve the problematic input so it can be inspected later.
        shutil.copy(pdb_file, error_path / pdb_file.name)
    return succeeded

def fix_pdb_folder(cfg, max_workers=112):
    """Fix all ``*.pdb`` files of a directory in parallel and log a summary.

    Files are read from ``cfg.paths.separated_dir`` (non-recursively) and the
    results are written to ``cfg.paths.separated_fixed``. Inputs that could
    not be fixed are copied into an ``errors`` sub-directory of the output
    directory by ``process_file``.

    Args:
        cfg: Configuration object exposing ``paths.separated_dir`` and
            ``paths.separated_fixed``.
        max_workers: Maximum number of worker processes. Defaults to 112,
            the value that was previously hard-coded.
    """
    input_path = Path(cfg.paths.separated_dir)
    output_path = Path(cfg.paths.separated_fixed)
    error_path = output_path / "errors"
    output_path.mkdir(parents=True, exist_ok=True)
    error_path.mkdir(exist_ok=True)

    pdb_files = list(input_path.glob("*.pdb"))
    total_files = len(pdb_files)
    success_count = 0
    fail_count = 0

    with tqdm(total=total_files, desc="Processing PDB files") as pbar:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_file, pdb_file, output_path, error_path)
                for pdb_file in pdb_files
            ]
            for future in as_completed(futures):
                # result() re-raises any exception raised inside the worker.
                if future.result():
                    success_count += 1
                else:
                    fail_count += 1
                pbar.update(1)

    def percent(count):
        # Guard against an empty input directory (no division by zero).
        return (count / total_files) * 100 if total_files else 0.0

    logger.info("======== Fixing PDB files ========")
    logger.info(f"Total files: {total_files}")
    logger.info(f"Successfully processed: {success_count} ({percent(success_count):.2f}%)")
    logger.info(f"Failed to process: {fail_count} ({percent(fail_count):.2f}%)")


