In [3]:
import datetime
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from Bio.PDB.parse_pdb_header import parse_pdb_header
from joblib import Parallel, delayed
import subprocess
from get_header_seq import get_seq_


In [4]:
# Root directory of the dataset; every immediate sub-directory is one entry.
base_path = Path("/mnt/ligandpro/db/LPCE/final")
# Entry ids are the sub-directory names (plain files under the root are skipped).
pdb_ids = [entry.name for entry in filter(Path.is_dir, base_path.iterdir())]


In [5]:
# Per-entry header metadata, each dict keyed by entry id.
header_dict, resolution_dict, date_dict = {}, {}, {}

for pdb_id in tqdm(pdb_ids):
    pdb_file = base_path / pdb_id / f"{pdb_id}.pdb"
    # Values recorded when the header cannot be parsed (or the date is malformed).
    head, resolution, released = "", None, None
    try:
        info = parse_pdb_header(str(pdb_file))
        raw_date = info.get("release_date", "")
        # Assigned in one step so a failure leaves all three defaults in place.
        head, resolution, released = (
            info.get("head", ""),
            info.get("resolution", None),
            datetime.datetime.strptime(raw_date, '%Y-%m-%d') if raw_date else None,
        )
    except Exception as e:
        print(f"Error parsing header for {pdb_id}: {e}")
    header_dict[pdb_id] = head
    resolution_dict[pdb_id] = resolution
    date_dict[pdb_id] = released


100%|██████████| 39150/39150 [03:54<00:00, 167.14it/s]


In [6]:
# Assemble the header metadata into one table indexed by entry id.
metadata_columns = {
    'header': header_dict,
    'resolution': resolution_dict,
    'date': date_dict,
}
df = pd.DataFrame(
    {column: pd.Series(values) for column, values in metadata_columns.items()}
)


In [7]:
def get_smiles(file_path):
    """Convert a ligand file to a SMILES string using the ``obabel`` CLI.

    Runs ``obabel <file_path> -osmi --quiet`` and extracts the SMILES from the
    first non-empty line of its standard output (the text before the first tab).

    Args:
        file_path: Path of the ligand file to convert (``str`` or ``Path``).

    Returns:
        The SMILES string, or ``""`` when obabel exits with a non-zero status,
        produces no output, or cannot be executed at all.
    """
    try:
        result = subprocess.run(
            ["obabel", str(file_path), "-osmi", "--quiet"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception:
        # Best-effort conversion: a missing binary or undecodable output yields "".
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
        return ""

    if result.returncode != 0:
        return ""

    # Only the first record is used; restricting to one line keeps a multi-record
    # output from leaking embedded newlines into the returned value.
    for line in result.stdout.splitlines():
        if line.strip():
            return line.split('\t')[0].strip()
    return ""

def process_pdb_id(pdb_id):
    """Return the SMILES of one ligand file found in an entry's directory.

    Looks in ``base_path / pdb_id`` for ``*.mol2`` files first and falls back
    to ``*.sdf`` files. When several files match, the lexicographically first
    path is used so the choice does not depend on directory listing order.

    Args:
        pdb_id: Name of the entry sub-directory under ``base_path``.

    Returns:
        Whatever ``get_smiles`` returns for the chosen file, or ``""`` when the
        directory contains neither a mol2 nor an sdf file.
    """
    ligand_dir = base_path / pdb_id
    # mol2 takes precedence over sdf; the sdf glob only runs when no mol2 exists.
    for pattern in ("*.mol2", "*.sdf"):
        candidates = sorted(ligand_dir.glob(pattern))
        if candidates:
            return get_smiles(candidates[0])
    return ""

def _smiles_for_entry(pdb_id):
    """Pair an entry id with the SMILES extracted for that entry."""
    return pdb_id, process_pdb_id(pdb_id)


# Run the per-entry extraction on all available cores (n_jobs=-1).
smiles_jobs = (delayed(_smiles_for_entry)(pdb_id) for pdb_id in tqdm(pdb_ids))
smiles_list = Parallel(n_jobs=-1)(smiles_jobs)
smiles_dict = dict(smiles_list)
# Aligned on the entry-id index of the existing table.
df['smiles'] = pd.Series(smiles_dict)


100%|██████████| 39150/39150 [00:17<00:00, 2218.47it/s]


In [8]:
def get_sequence(pdb_file_path):
    """Return the distinct sequences of a PDB file as a single value.

    Calls ``get_seq_`` on the file and de-duplicates the values it returns,
    keeping them in first-seen order so the result is identical on every run.

    Args:
        pdb_file_path: Path of the PDB file, passed through to ``get_seq_``.
            NOTE(review): ``get_seq_`` is assumed to return a mapping
            (presumably chain id -> sequence); only its ``.values()`` is used.

    Returns:
        The single distinct sequence when there is exactly one, otherwise the
        distinct sequences joined with ``":"`` (``""`` when there are none).
    """
    seqs = list(get_seq_(pdb_file_path).values())
    # dict.fromkeys de-duplicates while preserving order; a set would make the
    # joined order vary between interpreter runs because str hashes are randomized.
    unique_seqs = list(dict.fromkeys(seqs))
    if len(unique_seqs) == 1:
        return unique_seqs[0]
    return ":".join(unique_seqs)

# Sequence string per entry id; entries whose sequence cannot be extracted are
# reported and stored as "" so the column still covers every entry.
sequence_dict = {}
for pdb_id in tqdm(pdb_ids):
    pdb_file = base_path / pdb_id / f"{pdb_id}.pdb"
    try:
        sequence = get_sequence(str(pdb_file))
    except Exception as e:
        # Report the failure instead of hiding it; catching Exception (not a bare
        # except) also lets a kernel interrupt stop this long-running loop.
        print(f"Error extracting sequence for {pdb_id}: {e}")
        sequence = ""
    sequence_dict[pdb_id] = sequence
df['seq'] = pd.Series(sequence_dict)


100%|██████████| 39150/39150 [01:54<00:00, 340.63it/s]


In [9]:
# Label the index and put the columns in their final export order.
column_order = ['header', 'smiles', 'seq', 'resolution', 'date']
df = df.rename_axis('pdb_id')[column_order]


In [10]:
# Show the assembled table as the cell output for a visual sanity check.
df

Unnamed: 0_level_0,header,smiles,seq,resolution,date
pdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2uw1_bioml_1_FE_chains_A,oxidoreductase,[Fe],MQVTHSMPPQKLEIFKSLDDWARNNVLIHLKSVEKSWQPQDYLPDP...,1.95,2007-05-08
7zhz_bioml_1_BG6_chains_B,oxidoreductase,[C@@H]1([C@@H]([C@H]([C@@H]([C@H](O1)COP(=O)(O...,MSEEQSHADQDAYVADVDGILDVLRAQVLERKPDDIFQFISKSALS...,2.50,2022-12-14
1kjy_bioml_1_GDP_chains_A_B,signaling protein,P(=O)(O)(O)O[P@@](=O)(O)OC[C@H]1O[C@H]([C@@H](...,GAREVKLLLLGAGESGKSTIVKQMKIIHEAGYSEEECKQYKAVVYS...,2.70,2002-05-08
2c4f_bioml_1_GIL_chains_H,hydrolase,c1(cccc(c1)OC1=[N]=C(C(=C([C@H]1F)N(C(C)C)C(C)...,IVGGKVCPKGECPWQVLLLVNGAQLCGGTLINTIWVVSAAHCFDKI...,1.72,2006-10-18
8hwe_bioml_1_ATP_chains_C_D,viral protein,P(=O)(O)(O)O[P@@](=O)(O)O[P@](=O)(O)OC[C@H]1O[...,MDAAIRGNDVIFVLKTIGVPSACRQNEDPRFVEAFKCDELERYIDN...,3.30,2024-01-10
...,...,...,...,...,...
6h8t_bioml_1_TXY_chains_A,metal binding protein,N[C@H](C=O)CC1=CC=C([C@H](C1)O)O,IPEYVDWRQKGAVTPVKNQGSCGSCWAFSAVVTIEGIIKIRTGNLN...,2.10,2018-09-12
2vmq_bioml_1_GLY_chains_A,transferase,NCC(=O)[O-],MKYLPQQDPQVFAAIEQERKRQHAKIELIASENFVSRAVMEAQGSV...,1.67,2008-12-16
6kq1_bioml_1_HEC_chains_A,electron transport,[Fe@]123[N]4=C5C=c6n3c(=CC3=[N]2C(=Cc2n1c(C=C4...,QDGEALFKSKPCAACHSIDAKMVGPALKEVAAKYAGQEGAADLLAG...,1.57,2020-08-19
4g1v_bioml_1_HEM_chains_A,oxidoreductase,C1=c2c(c(c3=CC4=[N]5C(=Cc6c(c(c7C=C8C(=C(C1=[N...,MLAEKTRSIIKATVPVLEQQGTVITRTFYKNMLTEHTELLNIFNRT...,2.10,2012-11-14


In [11]:
# Write the table to info.csv; the path is relative, so the file is created in
# the kernel's current working directory.
df.to_csv('info.csv')
