In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def select_non_redundant(df):
    """
    Keep the first occurrence of every (PDBID, ligand SMILES) pair.

    The SMILES column is 'Ligand SMILES' when the frame has it, otherwise
    'smiles'. Rows are selected by position, so frames with repeated index
    labels (e.g. the result of ``pd.concat``) are handled correctly; the
    original row order and index labels are preserved.

    :param df: DataFrame with a 'PDBID' column and a 'Ligand SMILES' or 'smiles' column
    :return: DataFrame holding only the first row of each (PDBID, SMILES) pair
    :raises KeyError: if a non-empty frame lacks 'PDBID' or both SMILES columns
    """
    # Nothing to deduplicate; return an empty selection without requiring
    # the key columns to exist.
    if len(df) == 0:
        return df.iloc[:0, :]

    # The column choice is the same for every row, so decide it once.
    smiles_col = 'Ligand SMILES' if 'Ligand SMILES' in df.columns else 'smiles'

    # duplicated() is vectorised and treats missing SMILES values as equal to
    # each other, which a hash-based set of (pdbid, NaN) tuples does not guarantee.
    is_repeat = df.duplicated(subset=['PDBID', smiles_col], keep='first')

    # A plain boolean array selects by position, independent of index labels.
    return df.loc[~is_repeat.to_numpy(), :]

# Read the small-molecule and polymer HiQBind metadata tables, stack them,
# and derive the non-redundant selection from the stacked frame.
hiq_sm, hiq_poly = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../figshare/hiqbind_sm_metadata.csv',
        '../figshare/hiqbind_poly_metadata.csv',
    )
)
hiq_df = pd.concat((hiq_sm, hiq_poly))
hiq_df_nr = select_non_redundant(hiq_df)


# (printed caption, table) pairs shared by both summaries below
hiq_tables = (
    ("- HiQBind-sm:", hiq_sm),
    ("- HiQBind-poly:", hiq_poly),
    ("- HiQBind:", hiq_df),
)

print("Number of unique PDBIDs:")
for caption, table in hiq_tables:
    print(caption, len(table['PDBID'].unique()))

print("Number of structures:")
for caption, table in hiq_tables:
    print(caption, table.shape[0])

Number of unique PDBIDs:
- HiQBind-sm: 17725
- HiQBind-poly: 438
- HiQBind: 18160
Number of structures:
- HiQBind-sm: 31572
- HiQBind-poly: 703
- HiQBind: 32275


## PDBBind

In [17]:
import os, glob
import pandas
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
RDLogger.DisableLog('rdApp.*')

from tqdm import tqdm


def parse_pdbbind_metadata(index='../raw/index/INDEX_general_PL.2020'):
    """
    Read the PDB ID and binding-affinity fields from a PDBBind index file.

    Lines that are blank or start with '#' (after stripping surrounding
    whitespace) are skipped. Every other line is split on whitespace; field 0
    is stored as 'PDBID' and field 3 as 'Binding Affinity'. The affinity is
    kept as the raw string, so callers cast it as needed. The remaining
    fields of the line are not extracted.

    :param index: Path to the PDBBind index file
    :return: DataFrame with columns 'PDBID' and 'Binding Affinity', one row per
        data line (zero rows, same columns, if the file has no data lines)
    :raises IndexError: if a data line has fewer than four whitespace-separated fields
    """
    columns = ["PDBID", "Binding Affinity"]
    records = []
    with open(index) as f:
        for line in f:
            line = line.strip()
            # A raw line always carries at least its newline, so emptiness
            # can only be detected after stripping.
            if not line or line.startswith('#'):
                continue
            content = line.split()
            records.append({
                "PDBID": content[0],
                "Binding Affinity": content[3],
            })
    # Naming the columns explicitly keeps them present even when no data
    # line was found, so callers can still select 'PDBID'.
    return pd.DataFrame(records, columns=columns)


def get_molecule_properties(mol):
    """
    Evaluate a fixed set of RDKit descriptors for one molecule.

    :param mol: RDKit molecule object
    :return: Dictionary mapping each 'Ligand ...' property name to its value,
        in the order the descriptors are listed below
    """
    # (output key, callable taking the molecule) pairs; evaluated top to bottom
    descriptor_table = (
        ('Ligand SMILES', Chem.MolToSmiles),
        ('Ligand MW', Descriptors.MolWt),
        ('Ligand LogP', Crippen.MolLogP),
        ('Ligand TPSA', rdMolDescriptors.CalcTPSA),
        ('Ligand NumRotBond', rdMolDescriptors.CalcNumRotatableBonds),
        ('Ligand NumHeavyAtoms', lambda m: m.GetNumHeavyAtoms()),
        ('Ligand NumHDon', rdMolDescriptors.CalcNumHBD),
        ('Ligand NumHAcc', rdMolDescriptors.CalcNumHBA),
        ('Ligand QED', Descriptors.qed),
    )
    return {key: compute(mol) for key, compute in descriptor_table}

# Parse the PDBBind index file and key the table by PDB ID so that values can
# be looked up per entry.
pdbbind_index_file = '../raw_data_pdbbind/index/INDEX_general_PL_data.2020'
pdbbind_data = parse_pdbbind_metadata(pdbbind_index_file)
pdbbind_data = pdbbind_data.set_index("PDBID")

# The parser returns the affinity as text: cast to float, then flip the sign.
# NOTE(review): presumably the index file stores -log(Kd/Ki/IC50), making the
# negated value the log affinity that is plotted later — confirm.
affinity_values = pdbbind_data['Binding Affinity'].astype(float)
pdbbind_data['Log Binding Affinity'] = -affinity_values
pdbbind_data

Unnamed: 0_level_0,Binding Affinity,Log Binding Affinity
PDBID,Unnamed: 1_level_1,Unnamed: 2_level_1
3zzf,0.40,-0.40
3gww,0.45,-0.45
1w8l,0.49,-0.49
3fqa,0.49,-0.49
1zsb,0.60,-0.60
...,...,...
7cpa,13.96,-13.96
2xuf,14.39,-14.39
1avd,15.00,-15.00
2xui,15.00,-15.00


In [None]:
# Compute ligand descriptors for every PDBBind entry whose ligand mol2 file
# RDKit can turn into a molecule.
pdbbind_props = {}
for pdbid in tqdm(pdbbind_data.index):
    mol2 = f'../raw_data_pdbbind/PDBBind-v2020/{pdbid}/{pdbid}_ligand.mol2'
    mol = Chem.MolFromMol2File(mol2)
    # Entries for which no molecule could be built are left out of the table.
    if mol is None:
        continue
    info = get_molecule_properties(mol)
    info['Log Binding Affinity'] = pdbbind_data.loc[pdbid, 'Log Binding Affinity']
    pdbbind_props[pdbid] = info

# orient='index' builds one row per PDB ID directly. Building one column per
# PDB ID and transposing would leave every column as dtype object, because
# each pre-transpose column mixes the SMILES string with the numeric values.
pdbbind_df = pd.DataFrame.from_dict(pdbbind_props, orient='index')

## Plot

In [None]:
from matplotlib.ticker import MaxNLocator

plt.rcParams['font.size'] = 10

# (DataFrame column, x-axis label) for each panel, in row-major panel order
props = [
    ('Log Binding Affinity', 'Log Kd/Ki/IC50'),
    ('Ligand MW', 'Molecular Weight'),
    ('Ligand LogP', 'cLogP'),
    ('Ligand TPSA', 'TPSA'),
    ('Ligand NumRotBond', '# Rotatable Bonds'),
    ('Ligand NumHeavyAtoms', '# Heavy Atoms'),
    ('Ligand NumHDon', '# H-Bond Donors'),
    ('Ligand NumHAcc', '# H-Bond Acceptors'),
    ('Ligand QED', 'QED')
]

# Per-property (low, high) interval, used both to clip the KDE and as x-axis limits
ranges = {
    "Log Binding Affinity": (-15, 0),
    "Ligand MW": (0, 1500),
    "Ligand LogP": (-20, 10),
    "Ligand TPSA": (0, 400),
    "Ligand NumRotBond": (0, 30),
    "Ligand NumHeavyAtoms": (0, 100),
    "Ligand NumHDon": (0, 15),
    "Ligand NumHAcc": (0, 30),
    "Ligand QED": (0.0, 1.0),
}

# KDE settings shared by both datasets so the two curves stay comparable
kde_style = dict(fill=True, bw_adjust=1.5, lw=1.5, gridsize=100)

fig, axes = plt.subplots(3, 3, figsize=(10, 6), constrained_layout=False)

# One panel per property
for i, (ax, (prop, xlabel)) in enumerate(zip(axes.flatten(), props)):
    clip = ranges[prop]

    # Only the first panel carries legend labels; the others pass an empty label
    sns.kdeplot(pdbbind_df[prop], ax=ax, label="PDBBind" if i == 0 else "", clip=clip, color="#19CAAD", **kde_style)
    sns.kdeplot(hiq_df_nr[prop], ax=ax, label="HiQBind" if i == 0 else "", clip=clip, color='#F4606C', **kde_style)

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(None)
    ax.set_xlim(clip[0], clip[1])
    ax.xaxis.set_major_locator(MaxNLocator(nbins=5))  # Controls max number of x-axis ticks
    ax.yaxis.set_major_locator(MaxNLocator(nbins=4))
    ax.tick_params(direction="in")

    for spine in ax.spines.values():
        spine.set_linewidth(1)

# Set one common y-axis label
axes[1, 0].set_ylabel('Density', fontsize=12)

# Manual margins; the bottom margin leaves room for the figure-level legend
fig.subplots_adjust(
    left=0.1, right=0.98, top=0.95, bottom=0.15,
    hspace=0.50, wspace=0.25
)
# Add a global legend
fig.legend(labels=["PDBBind", "HiQBind"], loc="lower center", fontsize=10, frameon=True, ncol=2, bbox_to_anchor=(0.5, 0.0))
fig.savefig('fig3_dis.pdf', pad_inches=0.10)


## PDBBind-Opt vs PDBBind

In [34]:
# Read the small-molecule and polymer PDBBind-Opt metadata tables, stack them,
# and derive the non-redundant selection from the stacked frame.
pdbbind_opt_sm, pdbbind_opt_poly = (
    pd.read_csv(csv_path)
    for csv_path in (
        'pdbbind_opt_sm_metadata.csv',
        'pdbbind_opt_poly_metadata.csv',
    )
)
pdbbind_opt_df = pd.concat((pdbbind_opt_sm, pdbbind_opt_poly))
pdbbind_opt_df_nr = select_non_redundant(pdbbind_opt_df)


# The unique-ID summary reads the combined count from the non-redundant frame,
# while the structure count uses the full stacked frame.
unique_id_tables = (
    ("- PDBBind-opt-sm:", pdbbind_opt_sm),
    ("- PDBBind-opt-poly:", pdbbind_opt_poly),
    ("- PDBBind-opt:", pdbbind_opt_df_nr),
)
structure_tables = (
    ("- PDBBind-opt-sm:", pdbbind_opt_sm),
    ("- PDBBind-opt-poly:", pdbbind_opt_poly),
    ("- PDBBind-opt:", pdbbind_opt_df),
)

print("Number of unique PDBIDs:")
for caption, table in unique_id_tables:
    print(caption, len(table['PDBID'].unique()))

print("Number of structures:")
for caption, table in structure_tables:
    print(caption, table.shape[0])

Number of unique PDBIDs:
- PDBBind-opt-sm: 14915
- PDBBind-opt-poly: 746
- PDBBind-opt: 15661
Number of structures:
- PDBBind-opt-sm: 26626
- PDBBind-opt-poly: 1131
- PDBBind-opt: 27757


In [21]:
# Directory holding the PDBBind index files, written relative to the notebook
# so the cell does not depend on one user's scratch directory.
# NOTE(review): assumes the notebook sits one level below the directory that
# contains raw_data_pdbbind/ — confirm before relying on it.
pdbbind_index_dir = '../raw_data_pdbbind/index'

pdbbind_opt_ids = pdbbind_opt_df['PDBID'].unique().tolist()
core = parse_pdbbind_metadata(f'{pdbbind_index_dir}/INDEX_demo_PL_data.2021')['PDBID'].tolist()
refined = parse_pdbbind_metadata(f'{pdbbind_index_dir}/INDEX_refined_data.2020')['PDBID'].tolist()

# Number of PDB IDs from each index file that also occur in PDBBind-Opt
print(len(set(core).intersection(pdbbind_opt_ids)))
print(len(set(refined).intersection(pdbbind_opt_ids)))
core

279
4969


['4tmn',
 '5tmn',
 '1ydr',
 '1ydt',
 '1bcu',
 '1a30',
 '1bzc',
 '1qf1',
 '1qkt',
 '1c5z',
 '1k1i',
 '1e66',
 '1g2k',
 '1eby',
 '1gpn',
 '1h22',
 '1gpk',
 '1h23',
 '1p1n',
 '1q8u',
 '1p1q',
 '1q8t',
 '1o3f',
 '1oyt',
 '1nc1',
 '1o0h',
 '1nc3',
 '1owh',
 '1ps3',
 '1nvq',
 '1mq6',
 '1lpg',
 '1uto',
 '1pxn',
 '1r5y',
 '1sqa',
 '1o5b',
 '1s38',
 '1u1b',
 '2c3i',
 '1yc1',
 '2brb',
 '1z9g',
 '1syi',
 '2al5',
 '1w4o',
 '2br1',
 '1z95',
 '1y6r',
 '2iwx',
 '2cbv',
 '2j78',
 '2j7h',
 '2cet',
 '1z6e',
 '2hb1',
 '2fvd',
 '2v7a',
 '2v00',
 '2fxs',
 '2p15',
 '2pog',
 '3bgz',
 '2qe4',
 '1vso',
 '3cj4',
 '2zb1',
 '3coy',
 '3coz',
 '3f3e',
 '2vw5',
 '2vvn',
 '3f3d',
 '2qbp',
 '2qbq',
 '2qbr',
 '3b65',
 '2qnq',
 '3e93',
 '2zcq',
 '3b5r',
 '2vkm',
 '3e92',
 '2zcr',
 '3d4z',
 '2p4y',
 '3bv9',
 '2zda',
 '3b68',
 '3e5a',
 '3f3a',
 '2r9w',
 '3f3c',
 '3jvs',
 '3gnw',
 '2w4x',
 '2wn9',
 '2weg',
 '2wnc',
 '3ivg',
 '3kr8',
 '2wer',
 '2zy1',
 '3jya',
 '3g0w',
 '3ebp',
 '3g31',
 '3ehy',
 '3fcq',
 '3jvr',
 '3fur',
 

In [38]:
# Write the combined HiQBind metadata ordered by PDB ID. `index` is a boolean
# option: False leaves the row index out of the CSV explicitly instead of
# relying on None being falsy.
hiq_df.sort_values('PDBID').to_csv('../figshare/hiqbind_metadata.csv', index=False)