In [1]:
import pandas as pd
from Bio import PDB
import numpy as np
parser=PDB.PDBParser()

from rdkit import Chem
info=pd.read_csv("info_with_new_split.csv",index_col=0)

In [3]:
from scipy.spatial.distance import cdist
from Bio import PDB
from rdkit.Chem import Descriptors
parser=PDB.PDBParser(QUIET=True)

def get_minimum_dist(pdbid, folder="v2020-other-PL"):
    pocket=parser.get_structure(pdbid,f"{folder}/{pdbid}/{pdbid}_pocket.pdb")[0]
    needed_residues=[res for res in pocket.get_residues() if res.get_resname() != "HOH"]
    needed_atoms_with_H=[atom for res in needed_residues for atom in res.get_atoms() ]
    needed_atoms_no_H=[atom for atom in needed_atoms_with_H if atom.element != "H"]
    pocket_coords_with_H=np.array([atom.get_coord() for atom in needed_atoms_with_H])
    pocket_coords_no_H=np.array([atom.get_coord() for atom in needed_atoms_no_H])
    pocket_elements_with_H=[atom.element for atom in needed_atoms_with_H]
    pocket_elements_no_H=[atom.element for atom in needed_atoms_no_H]

    try:
        ligand=Chem.MolFromMol2File(f"{folder}/{pdbid}/{pdbid}_ligand.mol2", removeHs=False)

    except:
        try:
            ligand=Chem.SDMolSupplier(f"{folder}/{pdbid}/{pdbid}_ligand.sdf",sanitize=False)
        except:
            ligand=None

    if ligand is not None:
        ligand_coords_with_H=ligand.GetConformer().GetPositions()
        ligand_elements_with_H=[atom.GetSymbol() for atom in ligand.GetAtoms()]

        ligand_no_H=Chem.RemoveHs(ligand)
        ligand_coords_no_H=ligand_no_H.GetConformer().GetPositions()
        ligand_elements_no_H=[atom.GetSymbol() for atom in ligand_no_H.GetAtoms()]
    else:
        return None
    
    distances_all_no_H=cdist(pocket_coords_no_H,ligand_coords_no_H)
    min_dist_no_H=distances_all_no_H.min()
    index_x,index_y=np.unravel_index(distances_all_no_H.argmin(), distances_all_no_H.shape)
    min_dist_pocket_element=pocket_elements_no_H[index_x]
    min_dist_ligand_element=ligand_elements_no_H[index_y]

    distances_pocket_no_H_ligand_with_H=cdist(pocket_coords_no_H,ligand_coords_with_H)
    min_dist_pocket_no_H_ligand_with_H=distances_pocket_no_H_ligand_with_H.min()
    index_x,index_y=np.unravel_index(distances_pocket_no_H_ligand_with_H.argmin(), distances_pocket_no_H_ligand_with_H.shape)
    min_dist_pocket_no_H_ligand_with_H_pocket_element=pocket_elements_no_H[index_x]
    min_dist_pocket_no_H_ligand_with_H_ligand_element=ligand_elements_with_H[index_y]

    distances_pocket_with_H_ligand_no_H=cdist(pocket_coords_with_H,ligand_coords_no_H)
    min_dist_pocket_with_H_ligand_no_H=distances_pocket_with_H_ligand_no_H.min()
    index_x,index_y=np.unravel_index(distances_pocket_with_H_ligand_no_H.argmin(), distances_pocket_with_H_ligand_no_H.shape)
    min_dist_pocket_with_H_ligand_no_H_pocket_element=pocket_elements_with_H[index_x]
    min_dist_pocket_with_H_ligand_no_H_ligand_element=ligand_elements_no_H[index_y]

    return ((min_dist_no_H,min_dist_pocket_element,min_dist_ligand_element),
            (min_dist_pocket_no_H_ligand_with_H,min_dist_pocket_no_H_ligand_with_H_pocket_element,min_dist_pocket_no_H_ligand_with_H_ligand_element),
            (min_dist_pocket_with_H_ligand_no_H,min_dist_pocket_with_H_ligand_no_H_pocket_element,min_dist_pocket_with_H_ligand_no_H_ligand_element))
    


def get_pocket_stats(pdbid, folder="v2020-other-PL", remove_Hs=False):
    pocket=parser.get_structure(pdbid,f"{folder}/{pdbid}/{pdbid}_pocket.pdb")[0]
    needed_residues=[res for res in pocket.get_residues() if res.get_resname() != "HOH"]
    needed_atoms=[atom for res in needed_residues for atom in res.get_atoms() ]
    if remove_Hs:
        needed_atoms=[atom for atom in needed_atoms if atom.element != "H"]
    pocket_coords=np.array([atom.get_coord() for atom in needed_atoms])
    pocket_elements=set([atom.element for atom in needed_atoms])
    # pocket=Chem.MolFromPDBFile(f"{folder}/{pdbid}/{pdbid}_pocket.pdb",sanitize=False, removeHs=remove_Hs)
    # if remove_Hs:
    #     try:
    #         pocket=Chem.RemoveHs(pocket)
    #     except:
    #         return None
    # pocket_coords=pocket.GetConformer().GetPositions()
    # pocket_elements=set([atom.GetSymbol() for atom in pocket.GetAtoms()])
    try:
        ligand=Chem.MolFromMol2File(f"{folder}/{pdbid}/{pdbid}_ligand.mol2", removeHs=remove_Hs)

    except:
        try:
            ligand=Chem.SDMolSupplier(f"{folder}/{pdbid}/{pdbid}_ligand.sdf",sanitize=False, removeHs=remove_Hs)
        except:
            ligand=None

    if ligand is not None:
        ligand_coords=ligand.GetConformer().GetPositions()
        ligand_elements=set([atom.GetSymbol() for atom in ligand.GetAtoms()])
    else:
        return None
    
    distances=cdist(pocket_coords,ligand_coords)
    min_dist=distances.min()
    return min_dist,pocket_elements,ligand_elements

get_pocket_stats("3n9s","refined-set", remove_Hs=True)


(1.915857469465719, {'C', 'N', 'O', 'S', 'ZN'}, {'C', 'N', 'O', 'P'})

In [88]:
from tqdm import tqdm
from rdkit.Chem import Descriptors

# create record on the minimum distance between protein and ligand

min_dist_record=[]
processed_pdbids=[]
counter=0
prot_element_counter={}
lig_element_counter={}

for selected_index in tqdm(info.index):
    category=info.loc[selected_index,"category"]
    if category == "general":
        folder="v2020-other-PL"
    elif category == "refined":
        folder="refined-set"
    elif category == "core":
        folder="CASF-2016/coreset"
    pdb_stats=get_pocket_stats(selected_index,folder,remove_Hs=True)
    if pdb_stats is None:
        continue
    counter+=1
    processed_pdbids.append(selected_index)
    min_dist,qed,pocket_elements,ligand_elements=pdb_stats
    min_dist_record.append(min_dist)
    for element in pocket_elements:
        if element not in prot_element_counter:
            prot_element_counter[element]=1
        else:
            prot_element_counter[element]+=1
    for element in ligand_elements:
        if element not in lig_element_counter:
            lig_element_counter[element]=1
        else:
            lig_element_counter[element]+=1

min_dist_record=np.array(min_dist_record)

 11%|█         | 2120/19443 [00:13<01:55, 149.75it/s][12:44:25] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 5
 12%|█▏        | 2322/19443 [00:15<02:06, 135.49it/s][12:44:25] sanitise [12:44:25] 1ksn_ligand: [12:44:27] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5
 17%|█▋        | 3355/19443 [00:21<02:07, 126.18it/s][12:44:27] sanitise [12:44:27] 1sl3_ligand: [12:44:33] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
 24%|██▎       | 4583/19443 [00:29<01:22, 180.53it/s][12:44:41] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 5
 29%|██▊       | 5578/19443 [00:36<01:21, 170.26it/s][12:44:48] Explicit valence for atom # 12 C, 5, is greater than permitted
 30%|███       | 5836/19443 [00:37<01:21, 167.07it/s][12:44:48] sanitise [12:44:48] 3fxz_ligand: [12:44:49] Unusual charge on atom 13 number of radical electrons set to zero
 30%|███       | 5872/19443 [00:37<01:19, 171.42it/s][12:44:50] Unusual charge on atom 30 number of radical electrons set to zero
 30%|███       | 5890/

## CL1

In [167]:
# create CL1 definitions
CL1={}
for selected_index in tqdm(info.index):
    category=info.loc[selected_index,"category"]
    if category == "general":
        folder="v2020-other-PL"
    elif category == "refined":
        folder="refined-set"
    elif category == "core":
        folder="CASF-2016/coreset"
    pdb_stats=get_pocket_stats(selected_index,folder,remove_Hs=True)
    if pdb_stats is None:
        CL1[selected_index]=False
        continue
    
    min_dist,pocket_elements,ligand_elements=pdb_stats
    if type(info.loc[selected_index,"smiles"]) != str:
        CL1[selected_index]=False
        continue
    try:
        qed=Descriptors.qed(Chem.MolFromSmiles(info.loc[selected_index,"smiles"]))
    except:
        qed=0
    if min_dist < 1.75:   # criterion 1
        CL1[selected_index]=False
        continue

    if qed < 0.2:   # criterion 2
        CL1[selected_index]=False
        continue

    if np.any([element.upper() not in ["C","S","O","N","ZN","CA","CO","MG","NI","MN","FE","NA","K","SE","P","CD"] 
               for element in pocket_elements]):
        CL1[selected_index]=False
        continue

    if np.any([element.upper() not in ["C","S","O","N","CL", "P", "F", "BR", "I", "B"] 
               for element in ligand_elements]):
        CL1[selected_index]=False
        continue

    CL1[selected_index]=True


 11%|█         | 2113/19443 [00:16<02:36, 110.65it/s][15:29:01] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 5
 12%|█▏        | 2332/19443 [00:17<02:00, 142.48it/s][15:29:01] sanitise [15:29:01] 1ksn_ligand: [15:29:03] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 5
 17%|█▋        | 3356/19443 [00:26<01:55, 138.96it/s][15:29:03] sanitise [15:29:03] 1sl3_ligand: [15:29:11] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
 24%|██▎       | 4592/19443 [00:35<01:45, 141.42it/s][15:29:20] Can't kekulize mol.  Unkekulized atoms: 0 2 3 4 5
 29%|██▊       | 5587/19443 [00:43<01:36, 144.24it/s][15:29:28] Explicit valence for atom # 12 C, 5, is greater than permitted
 29%|██▉       | 5694/19443 [00:44<02:07, 107.42it/s][15:29:28] sanitise [15:29:28] 3fxz_ligand: [15:29:29] Explicit valence for atom # 34 N, 4, is greater than permitted
 30%|███       | 5840/19443 [00:45<01:39, 136.88it/s][15:29:30] Unusual charge on atom 13 number of radical electrons set to zero
[15:29:30] Unusual charge

In [174]:
CL1=pd.Series.from_dict(CL1,orient="index")
info["CL1"]=CL1

## CL2

In [176]:
# recognize Kd=/Ki= cases from general
general_index=pd.read_fwf("/home/jerry/data/pdbbind/index/INDEX_general_PL_data.2020",
                     skiprows=6,
                     header=None,
                     colspecs=[[0,4],[4,10],[10,16],[16,23],[23,39]])
general_index.columns = ["pdbid", "resolution", "year", "value", "kd/ki"]
general_index.index=general_index.pdbid

refined_index=pd.read_fwf("/home/jerry/data/pdbbind/index/INDEX_refined_data.2020", \
                        skiprows=6,
                     header=None,
                     colspecs=[[0,4],[4,10],[10,16],[16,23],[23,39]])
refined_index.columns = ["pdbid", "resolution", "year", "value", "kd/ki"]
refined_index.index=refined_index.pdbid

In [180]:
info=info.join(general_index[["kd/ki","value"]])

In [184]:
CL2=general_index["kd/ki"].str.startswith("Kd=")|general_index["kd/ki"].str.startswith("Ki=")

In [188]:
CL2=general_index["kd/ki"].str.startswith("Kd=")|general_index["kd/ki"].str.startswith("Ki=")
CL2=CL1[0]&CL2

In [191]:
info["CL2"]=CL2

## CL3

In [192]:
info["CL3"]=info["CL1"]&((info["category"]!="general"))

In [193]:
len(info), info["CL1"].sum(), info["CL2"].sum(), info["CL3"].sum()

(19443, 14324, 7985, 4404)

In [200]:
info.drop(columns=["new_CL1","OLD_CL1"]).to_csv("final_info.csv")

# Statistics

In [210]:

type_order=["hydrolase",   
"transferase"  ,
"other" ,
"transcription",
"lyase"        ,
"transport"    ,
"oxidoreductase",
"ligase"       ,
"isomerase"    ,
"chaperone"    ,
"membrane"     ,
"viral"        ,
"metal_containing",
]
summary=info[info.CL3].pivot_table(index="type", columns="new_split", values="seq", aggfunc="count").sort_index(key=lambda x: [type_order.index(y) for y in x])[["train","val","test"]]

In [211]:
summary

new_split,train,val,test
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hydrolase,873.0,56.0,411.0
transferase,355.0,237.0,257.0
other,542.0,91.0,58.0
transcription,122.0,15.0,70.0
lyase,85.0,39.0,308.0
transport,114.0,20.0,26.0
oxidoreductase,64.0,6.0,29.0
ligase,37.0,6.0,31.0
isomerase,41.0,18.0,21.0
chaperone,,27.0,91.0


In [134]:
for elem in prot_element_counter:
    print(elem,prot_element_counter[elem],"%.2f"%(prot_element_counter[elem]/counter))

C 19398 1.00
S 16375 0.84
O 19398 1.00
N 19398 1.00
ZN 1842 0.09
CA 400 0.02
CO 66 0.00
MG 858 0.04
NI 79 0.00
MN 356 0.02
FE 104 0.01
NA 138 0.01
K 78 0.00
SE 107 0.01
HG 6 0.00
CU 16 0.00
CS 1 0.00
P 105 0.01
CD 22 0.00
SR 1 0.00
GA 1 0.00
RB 1 0.00


# Covalent vs non-covalent binders
according to CovBindersInPDB (https://yzhang.hpc.nyu.edu/CovBinderInPDB/)

In [3]:
cov=pd.read_csv("CovBinderInPDB_2022Q4_AllRecords.csv")[["pdb_id","binder_id_in_adduct"]]

In [7]:
cov_pdbids=set(cov.pdb_id)

In [8]:
covalent_in_pdbbind=[item for item in info.index if item.upper() in cov_pdbids]

In [10]:
false_covalent=["3n3g",
"3kku",
"1hvy",
"4zam",
"1f28",
"4hva",
"4do3",
"4ch8",
"2kce",
"1juj",
"4ch2",
"6avi",
"5mi7",
"6hhp",
"3dla",
"2qnx",
"5mi6",
"1jr1",
"5mi5",
"6hmt",
"4gz3",
"1dva",
"1jtq",
"1jut",
"5vfd",
"2qo1"
]

In [11]:
true_covalent_in_pdbbind=[item.lower() for item in covalent_in_pdbbind if item.lower() not in false_covalent]

In [15]:
info["covalent"]=False
for item in true_covalent_in_pdbbind:
    info.loc[item,"covalent"]=True

In [17]:
info.covalent.value_counts()

False    18550
True       893
Name: covalent, dtype: int64

In [18]:
info.to_csv("final_info_with_covalency.csv")

In [24]:
from rdkit import Chem
from Bio.PDB import PDBParser
import numpy as np
import os

# three letter codes for all amino acids
excluded_residue_names = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU',
 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP',
 'TYR', 'VAL', 'HOH']

parser=PDBParser()

def download_pdb(pdbid, position):
    os.system("wget -O {}/{}.pdb https://files.rcsb.org/view/{}.pdb > /dev/null".format(position, pdbid, pdbid))

def get_pdbid_matched_ligand_name(pdbid, category, tmp_folder="tmp"):
    if category=="general":
        ligand_file = "v2020-other-PL/"+pdbid+"/"+pdbid+"_ligand.mol2"
    elif category in ["refined", "core"]:
        ligand_file = "refined-set/"+pdbid+"/"+pdbid+"_ligand.mol2"
    mol = Chem.MolFromMol2File(ligand_file, sanitize=False)
    mol_center = mol.GetConformer().GetPositions().mean(axis=0)

    # if not os.path.exists(tmp_folder+"/"+pdbid+".pdb"):
    #     download_pdb(pdbid, tmp_folder)
    pdb=parser.get_structure(pdbid, tmp_folder+"/"+pdbid+".pdb")
    resnames = []
    centers = []
    for residue in pdb.get_residues():
        resname = residue.get_resname()
        if resname not in excluded_residue_names:
            resnames.append(resname)
            centers.append(np.mean([atom.get_coord() for atom in residue.get_atoms()], axis=0))

    distances = np.linalg.norm(np.array(centers) - mol_center, axis=1)
    min_dist = distances.min()
    return resnames[np.argmin(distances)], min_dist


In [32]:
errors = []

for item in covalent_in_pdbbind:
    item_category = info.loc[item,"category"]
    matched_id,min_dist=get_pdbid_matched_ligand_name(item,item_category)
    
    should_be_ligand = list(cov[cov.pdb_id==item.upper()]["binder_id_in_adduct"].unique())
    if (matched_id not in should_be_ligand) or min_dist>1:
        errors.append((item,matched_id,should_be_ligand,min_dist))
    

