In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def select_non_redundant(df):
    """
    Keep only the first row seen for every distinct (PDBID, SMILES) pair.

    The SMILES value is taken from the row's 'Ligand SMILES' field when the
    row has one, and from its 'smiles' field otherwise.

    :param df: DataFrame with a 'PDBID' column and one of the SMILES columns
    :return: Positional selection (``iloc``) of the first-seen rows, in their
        original order
    """
    seen_pairs = set()
    keep_positions = []
    for position, (_, record) in enumerate(df.iterrows()):
        smiles_field = 'Ligand SMILES' if 'Ligand SMILES' in record else 'smiles'
        pair = (record['PDBID'], record[smiles_field])
        if pair in seen_pairs:
            # An earlier row already represents this structure/ligand pair.
            continue
        seen_pairs.add(pair)
        keep_positions.append(position)
    return df.iloc[keep_positions, :]

# Load the two HiQBind metadata tables, stack them, and derive the
# non-redundant (PDBID, SMILES) subset used for the distribution plots.
hiq_sm = pd.read_csv('../figshare/hiqbind_sm_metadata.csv')
hiq_poly = pd.read_csv('../figshare/hiqbind_poly_metadata.csv')
hiq_df = pd.concat([hiq_sm, hiq_poly])
hiq_df_nr = select_non_redundant(hiq_df)

# (caption, table) pairs reported in both summaries below.
hiq_tables = (
    ("- HiQBind-sm:", hiq_sm),
    ("- HiQBind-poly:", hiq_poly),
    ("- HiQBind:", hiq_df),
)

print("Number of unique PDBIDs:")
for caption, table in hiq_tables:
    print(caption, len(table['PDBID'].unique()))

print("Number of structures:")
for caption, table in hiq_tables:
    print(caption, table.shape[0])

## PDBBind

In [27]:
import os, glob
import pandas
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
RDLogger.DisableLog('rdApp.*')

from tqdm import tqdm


def parse_pdbbind_metadata(index='../raw/index/INDEX_general_PL.2020'):
    """
    Parse a PDBBind index file into a table of PDB IDs and binding affinities.

    Comment lines (first field starting with '#') and blank lines are skipped.
    Every other line is split on whitespace; the first field is stored as
    "PDBID" and the fourth field as "Binding Affinity". Both are kept as the
    raw strings found in the file.

    :param index: Path to the PDBBind index file
    :return: DataFrame with columns "PDBID" and "Binding Affinity", one row
        per data line. The two columns are present even when the file holds
        no data lines.
    :raises ValueError: If a data line has fewer than four fields
    """
    columns = ["PDBID", "Binding Affinity"]
    data = []
    with open(index) as f:
        for lineno, line in enumerate(f, start=1):
            content = line.split()
            # `line` still carries its trailing newline, so a blank line has to
            # be detected from the split fields rather than from `if line:`.
            if not content or content[0].startswith('#'):
                continue
            if len(content) < 4:
                raise ValueError(
                    f"{index}:{lineno}: expected at least 4 fields, got {len(content)}"
                )
            data.append({
                "PDBID": content[0],
                "Binding Affinity": content[3],
            })
    # Passing `columns` keeps the schema stable for an index without data rows.
    return pd.DataFrame(data, columns=columns)


def get_molecule_properties(mol):
    """
    Evaluate a fixed set of RDKit descriptors for one molecule.

    :param mol: RDKit molecule object
    :return: Dictionary mapping each 'Ligand ...' property name to the value
        returned by the corresponding RDKit call, in the order listed below
    """
    # (property name, callable taking the molecule) in output order.
    descriptor_table = (
        ('Ligand SMILES', Chem.MolToSmiles),
        ('Ligand MW', Descriptors.MolWt),
        ('Ligand LogP', Crippen.MolLogP),
        ('Ligand TPSA', rdMolDescriptors.CalcTPSA),
        ('Ligand NumRotBond', rdMolDescriptors.CalcNumRotatableBonds),
        ('Ligand NumHeavyAtoms', lambda m: m.GetNumHeavyAtoms()),
        ('Ligand NumHDon', rdMolDescriptors.CalcNumHBD),
        ('Ligand NumHAcc', rdMolDescriptors.CalcNumHBA),
        ('Ligand QED', Descriptors.qed),
    )
    return {name: compute(mol) for name, compute in descriptor_table}

# Read the PDBBind index, key it by PDB ID, and store the negated affinity
# column (converted to float) under 'Log Binding Affinity'.
pdbbind_data = parse_pdbbind_metadata('../raw_data_pdbbind/index/INDEX_general_PL_data.2020')
pdbbind_data = pdbbind_data.set_index("PDBID")
affinity_values = pdbbind_data['Binding Affinity'].astype(float)
pdbbind_data['Log Binding Affinity'] = -affinity_values
pdbbind_data

Unnamed: 0_level_0,Binding Affinity,Log Binding Affinity
PDBID,Unnamed: 1_level_1,Unnamed: 2_level_1
3zzf,0.40,-0.40
3gww,0.45,-0.45
1w8l,0.49,-0.49
3fqa,0.49,-0.49
1zsb,0.60,-0.60
...,...,...
7cpa,13.96,-13.96
2xuf,14.39,-14.39
1avd,15.00,-15.00
2xui,15.00,-15.00


In [None]:
# Compute ligand descriptors for every PDBBind entry whose mol2 file RDKit
# can parse, keyed by PDB ID.
pdbbind_props = {}
for pdbid in tqdm(pdbbind_data.index):
    mol2 = f'../raw_data_pdbbind/PDBBind-v2020/{pdbid}/{pdbid}_ligand.mol2'
    mol = Chem.MolFromMol2File(mol2)
    if mol is None:
        # RDKit returned no molecule for this ligand file; leave the entry out.
        continue
    info = get_molecule_properties(mol)
    info['Log Binding Affinity'] = pdbbind_data.loc[pdbid, 'Log Binding Affinity']
    pdbbind_props[pdbid] = info

# One row per PDB ID, one column per property. orient='index' builds the frame
# column by column, so numeric descriptors keep numeric dtypes. The previous
# pd.DataFrame(pdbbind_props).T mixed the SMILES strings with the numbers
# before transposing, which left every column as object dtype.
pdbbind_df = pd.DataFrame.from_dict(pdbbind_props, orient='index')

## Plot

In [None]:
from matplotlib.ticker import MaxNLocator

plt.rcParams['font.size'] = 10

# (column name, x-axis label) for each of the nine panels, in plotting order.
props = [
    ('Log Binding Affinity', 'Log Kd/Ki/IC50'),
    ('Ligand MW', 'Molecular Weight'),
    ('Ligand LogP', 'cLogP'),
    ('Ligand TPSA', 'TPSA'),
    ('Ligand NumRotBond', '# Rotatable Bonds'),
    ('Ligand NumHeavyAtoms', '# Heavy Atoms'),
    ('Ligand NumHDon', '# H-Bond Donors'),
    # The label previously started with a stray space (' # H-Bond Acceptors').
    ('Ligand NumHAcc', '# H-Bond Acceptors'),
    ('Ligand QED', 'QED')
]

# Per-column (min, max): used both as the KDE `clip` range and as the x-limits.
ranges = {
    "Log Binding Affinity": (-15, 0),
    "Ligand MW": (0, 1500),
    "Ligand LogP": (-20, 10),
    "Ligand TPSA": (0, 400),
    "Ligand NumRotBond": (0, 30),
    "Ligand NumHeavyAtoms": (0, 100),
    "Ligand NumHDon": (0, 15),
    "Ligand NumHAcc": (0, 30),
    "Ligand QED": (0.0, 1.0),
}

# Layout is set manually with subplots_adjust below, so constrained layout is off.
fig, axes = plt.subplots(3, 3, figsize=(10, 6), constrained_layout=False)

# Iterate through each property and plot it
for i, ax in enumerate(axes.flatten()):
    prop, xlabel = props[i]

    # Plot with custom labels for legend: only the first panel names its curves.
    clip = ranges[prop]
    sns.kdeplot(pdbbind_df[prop], fill=True, ax=ax, bw_adjust=1.5, label="PDBBind" if i == 0 else "", clip=clip, color="#19CAAD", lw=1.5, gridsize=100)
    sns.kdeplot(hiq_df_nr[prop], fill=True, ax=ax, bw_adjust=1.5, label="HiQBind" if i == 0 else "", clip=clip, color='#F4606C', lw=1.5, gridsize=100)

    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(None)
    ax.set_xlim(clip[0], clip[1])
    ax.xaxis.set_major_locator(MaxNLocator(nbins=5))  # Controls max number of x-axis ticks
    ax.yaxis.set_major_locator(MaxNLocator(nbins=4))
    ax.tick_params(direction="in")

    for spine in ax.spines.values():
        spine.set_linewidth(1)

# Set one common y-axis label
axes[1, 0].set_ylabel('Density', fontsize=12)

# Leave room at the bottom (bottom=0.15) for the figure-level legend.
fig.subplots_adjust(
    left=0.1, right=0.98, top=0.95, bottom=0.15,
    hspace=0.50, wspace=0.25
)
# Add a global legend
fig.legend(labels=["PDBBind", "HiQBind"], loc="lower center", fontsize=10, frameon=True, ncol=2, bbox_to_anchor=(0.5, 0.0))
# NOTE(review): per the matplotlib docs pad_inches applies when
# bbox_inches='tight'; confirm whether tight cropping was intended here.
fig.savefig('fig3_dis.pdf', pad_inches=0.10)


## PDBBind-Opt vs PDBBind

In [None]:
# Load the two PDBBind-opt metadata tables, stack them, and derive the
# non-redundant (PDBID, SMILES) subset.
pdbbind_opt_sm = pd.read_csv('pdbbind_opt_sm_metadata.csv')
pdbbind_opt_poly = pd.read_csv('pdbbind_opt_poly_metadata.csv')
pdbbind_opt_df = pd.concat([pdbbind_opt_sm, pdbbind_opt_poly])
pdbbind_opt_df_nr = select_non_redundant(pdbbind_opt_df)

print("Number of unique PDBIDs:")
for caption, table in (
    ("- PDBBind-opt-sm:", pdbbind_opt_sm),
    ("- PDBBind-opt-poly:", pdbbind_opt_poly),
    ("- PDBBind-opt:", pdbbind_opt_df_nr),
):
    print(caption, len(table['PDBID'].unique()))

print("Number of structures:")
for caption, table in (
    ("- PDBBind-opt-sm:", pdbbind_opt_sm),
    ("- PDBBind-opt-poly:", pdbbind_opt_poly),
    ("- PDBBind-opt:", pdbbind_opt_df),
):
    print(caption, table.shape[0])

In [None]:
# Directory holding the PDBBind index files. The default is the original
# cluster-specific location; set the PDBBIND_INDEX_DIR environment variable to
# run this cell against a copy stored elsewhere.
pdbbind_index_dir = os.environ.get(
    'PDBBIND_INDEX_DIR',
    '/pscratch/sd/e/eric6/PDBBind-Opt/raw_data_pdbbind/index',
)

pdbbind_opt_ids = pdbbind_opt_df['PDBID'].unique().tolist()
core = parse_pdbbind_metadata(os.path.join(pdbbind_index_dir, 'INDEX_demo_PL_data.2021'))['PDBID'].tolist()
refined = parse_pdbbind_metadata(os.path.join(pdbbind_index_dir, 'INDEX_refined_data.2020'))['PDBID'].tolist()

# Number of PDB IDs each index file shares with PDBBind-opt (core, then refined).
print(len(set(core).intersection(pdbbind_opt_ids)))
print(len(set(refined).intersection(pdbbind_opt_ids)))
core

In [None]:
# Write the combined HiQBind metadata ordered by PDB ID, without an index column.
hiq_df_sorted = hiq_df.sort_values('PDBID')
hiq_df_sorted.to_csv('../figshare/hiqbind_metadata.csv', index=None)

## Years

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

plt.rcParams.update({'font.size': 12, 'font.family': 'Arial'})

# Keep one row per PDB ID, then tally how many PDB IDs fall in each 'Year'.
df = pd.read_csv('../figshare/hiqbind_metadata.csv').drop_duplicates(subset=['PDBID'])
count = defaultdict(int)
for year, subdf in df.groupby('Year'):
    count[year] = len(subdf)

years, numbers = list(count), list(count.values())

# Bar chart of PDB IDs per year, with dashed horizontal grid lines drawn
# behind the bars.
fig, ax = plt.subplots(figsize=(6, 4), constrained_layout=True)
ax.set_axisbelow(True)
ax.grid(True, linestyle='--', color='grey', axis='y')
ax.bar(years, numbers, color='blue')
ax.set_xlabel("Year")
ax.set_ylabel("Number of PDB IDs")
fig.savefig('year.pdf')

In [34]:
# PDB IDs present both in the PDBBind index table and in the de-duplicated
# HiQBind metadata loaded for the year plot.
shared_pdbids = set(pdbbind_data.index).intersection(df['PDBID'].unique())
print("HiQBind overlapps with PDBbind v2020:", len(shared_pdbids))

HiQBind overlapps with PDBbind v2020: 11615


In [37]:
# Show the HiQBind entries (one row per PDB ID) whose 'Year' value is later than 2019.
df.query('Year > 2019')

Unnamed: 0,PDBID,Resolution,Year,Ligand Name,Ligand Chain,Ligand Residue Number,Binding Affinity Measurement,Binding Affinity Sign,Binding Affinity Value,Binding Affinity Unit,...,Protein UniProtName,Ligand SMILES,Ligand MW,Ligand LogP,Ligand TPSA,Ligand NumRotBond,Ligand NumHeavyAtoms,Ligand NumHDon,Ligand NumHAcc,Ligand QED
9888,3ix2,2.1,2021,AC2,A,302,kd,=,12589.00,nM,...,"Purine nucleoside phosphorylase,Purine nucleos...",Nc1nc2c(ncn2COCCO)c(=O)[nH]1,225.208,-1.33180,119.05,4,16,3,7,0.554386
16378,4jdf,1.49,2021,SPD,A,401,kd,=,6.40,mM,...,Putrescine-binding periplasmic protein PotF,[NH3+]CCCC[NH2+]CCC[NH3+],148.274,-2.79610,71.89,7,10,3,0,0.326519
24518,5qtv,2.2,2020,QLS,A,301,ki,=,1.10,nM,...,Coagulation factor XI,COC(=O)Nc1ccc2c(c1)N[C@@H](C(F)(F)F)CCCC[C@H](...,630.031,5.67160,151.74,5,44,4,9,0.206126
24519,5qtw,2.12,2020,QLM,A,301,ki,=,0.46,nM,...,Coagulation factor XI,COC(=O)Nc1ccc2c(c1)N[C@@H](C(=O)OC)CCCC[C@H](N...,620.070,4.28230,178.04,6,44,4,11,0.179741
24520,5qtx,2.07,2020,QLD,A,301,ki,=,0.22,nM,...,"Coagulation factor XI,Coagulation factor XI",CCOC(=O)[C@H]1CCCC[C@H](NC(=O)/C=C/c2cc(Cl)ccc...,634.097,4.67240,178.04,7,45,4,11,0.165022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32245,7vnp,NMR,2021,7YV,A,1108,ec50,=,1600.00,nM,...,Potassium voltage-gated channel subfamily KQT ...,Cc1cc(C)c(NC(=O)[C@H]2C[C@@H]3CC[C@H]2C3)c(C)c1,257.377,3.98656,29.10,2,19,1,1,0.852540
32249,7vnq,NMR,2021,7YV,G,1101,ec50,=,1600.00,nM,...,Potassium voltage-gated channel subfamily KQT ...,Cc1cc(C)c(NC(=O)[C@H]2C[C@@H]3CC[C@H]2C3)c(C)c1,257.377,3.98656,29.10,2,19,1,1,0.852540
32253,7vnr,NMR,2021,7YV,G,1101,ec50,=,1600.00,nM,...,Potassium voltage-gated channel subfamily KQT ...,Cc1cc(C)c(NC(=O)[C@H]2C[C@@H]3CC[C@H]2C3)c(C)c1,257.377,3.98656,29.10,2,19,1,1,0.852540
32257,7vsi,NMR,2021,7R3,A,703,ic50,=,3.10,nM,...,Sodium/glucose cotransporter 2,OC[C@H]1O[C@@H](c2ccc(Cl)c(Cc3ccc(O[C@H]4CCOC4...,450.915,1.61340,108.61,6,31,4,7,0.529157
