In [1]:
from Bio import PDB
from Bio.PDB import PDBIO, Select
import pandas as pd
from copy import copy

# Shared PDB parser instance, reused by process_pdb() for every entry.
# QUIET=True is passed to silence parser warnings (per the Biopython PDBParser docs).
parser = PDB.PDBParser(QUIET=True)

In [2]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.AllChem import Compute2DCoords

import numpy as np
import subprocess
import shutil
import os

In [3]:
from Bio import Align
from Bio.SubsMat import MatrixInfo as matlist
import numpy as np
from tqdm import tqdm
from Bio.PDB.StructureBuilder import StructureBuilder

def Needleman_Wunsch_alignment(seq1,seq2):
    '''
    Globally align seq1 and seq2 (Needleman-Wunsch) with Biopython's PairwiseAligner.

    Scoring: BLOSUM62 substitution matrix, gap open -10, gap extend -0.5.

    Any "-" placeholder already present in seq1 is removed before aligning,
    because it is not a residue and the alignment may fail on it. The
    placeholders are not restored in the output: a column that would hold
    "-" in both sequences carries no information and is never returned.

    Parameters
    ----------
    seq1, seq2 : str
        One-letter sequences. seq1 may contain "-" characters.

    Returns
    -------
    (str, str)
        The two gapped sequences of the best-scoring alignment, equal in length.
    '''
    # Strip placeholders up front; str.replace is a no-op when there are none.
    seq1=seq1.replace("-","")

    aligner=Align.PairwiseAligner()
    aligner.open_gap_score=-10
    aligner.extend_gap_score=-0.5
    aligner.substitution_matrix=matlist.blosum62
    alignment=aligner.align(seq1,seq2)[0]

    # NOTE(review): this relies on str(alignment) printing the gapped target on
    # line 0 and the gapped query on line 2 (match line in between) -- confirm
    # this still holds for the installed Biopython version.
    alignment_info=str(alignment).split("\n")
    return alignment_info[0],alignment_info[2]

        
def calc_seq_align_identity(seq1,seq2):
    """Return the sequence identity between seq1 and seq2 over their overlapping region.

    Returns
    -------
    1
        when one sequence is a substring of the other.
    -1
        when the sequences cannot be compared meaningfully: one of them is
        empty, the alignment has no residue-residue column, or fewer than 60%
        of the columns in the overlapping region are residue-residue pairs.
    float
        otherwise, identical pairs / aligned pairs within the overlapping region.
    """
    # An empty string is a substring of everything; without this guard an empty
    # chain would be reported as a perfect match.
    if not seq1 or not seq2:
        return -1
    if seq1 in seq2 or seq2 in seq1:
        return 1

    aseq1,aseq2=Needleman_Wunsch_alignment(seq1,seq2)
    assert len(aseq1)==len(aseq2)

    columns = list(zip(aseq1, aseq2))
    # Indices of columns where both sequences carry a residue.
    paired = [i for i, (a, b) in enumerate(columns) if a != "-" and b != "-"]
    if not paired:
        # Nothing aligned at all (previously an IndexError while trimming).
        return -1

    # Overlapping region: drop leading/trailing columns that contain a gap.
    core = columns[paired[0]:paired[-1] + 1]
    aligned_count = len(paired)
    same_count = sum(1 for a, b in core if a != "-" and a == b)

    # Too many internal gaps: treat the pair as not comparable.
    if aligned_count / len(core) < 0.6:
        return -1

    return same_count/aligned_count

def get_min_dist_between_lig_and_chain(ligand, chain):
    """Return the smallest distance between any ligand atom and any CA atom of the chain.

    Parameters
    ----------
    ligand
        Object exposing ``GetConformer().GetPositions()`` (an N x 3 coordinate array).
    chain
        Iterable of residues; only residues accepted by ``PDB.Polypeptide.is_aa``
        that actually contain a "CA" atom are used.

    Returns
    -------
    float
        The minimum pairwise distance, or ``inf`` when the chain has no usable
        CA atom or the ligand has no atoms, so a distance cut-off rejects it.
    """
    ligand_coords = np.asarray(ligand.GetConformer().GetPositions())
    # Incomplete residues may lack a CA atom; skip them instead of raising KeyError.
    ca_coords = [
        res["CA"].get_coord()
        for res in chain
        if PDB.Polypeptide.is_aa(res) and "CA" in res
    ]
    if len(ca_coords) == 0 or len(ligand_coords) == 0:
        return float("inf")

    chain_CA_coords = np.asarray(ca_coords)
    # Broadcast to an (n_ligand_atoms, n_CA) distance matrix and take its minimum.
    pairwise = np.linalg.norm(ligand_coords[:, None, :] - chain_CA_coords[None, :, :], axis=-1)
    return np.min(pairwise)


class AAOnly(Select):
    """PDBIO selector: write amino-acid residues only, one alternate location per atom."""

    def accept_residue(self, residue):
        """Accept the residue only if ``PDB.Polypeptide.is_aa`` recognises it."""
        return PDB.Polypeptide.is_aa(residue)

    def accept_atom(self, atom):
        """Accept ordered atoms and altloc "A" atoms.

        Side effect: the altloc of every accepted atom is reset to a blank.
        """
        keep = (not atom.is_disordered()) or atom.get_altloc() == "A"
        if keep:
            atom.set_altloc(" ")
        return keep

class AltlocSelect(Select):
    """PDBIO selector: drop disordered atoms unless their alternate location is 'A'."""

    def accept_atom(self, atom):
        """Return False only for a disordered atom whose altloc is not 'A'."""
        if atom.is_disordered() and atom.get_altloc() != 'A':
            return False
        return True
    



def save_multiple_chains(list_of_chains, save_addr, selector=None):
    """Write several chains into a single PDB file.

    The chains are added to model 0 of a freshly built structure, which is
    then saved with PDBIO.

    Parameters
    ----------
    list_of_chains
        Chain objects to write, in output order.
    save_addr
        Destination path (or handle) passed to ``PDBIO.save``.
    selector
        Optional ``Select`` instance filtering what is written. ``None``
        (the default) writes everything.
    """
    # PDBIO.save calls methods on the selector, so None cannot be forwarded;
    # fall back to the accept-everything base selector.
    if selector is None:
        selector = Select()

    sb = StructureBuilder()
    sb.init_structure("pdb")
    sb.init_seg(" ")
    sb.init_model(0)

    model = sb.structure[0]
    for chain in list_of_chains:
        model.add(chain)

    io = PDB.PDBIO()
    io.set_structure(sb.structure)
    io.save(save_addr, select=selector)


def write_alignment(seq1, seq2, out_file, line_width=150):
    """Align two sequences and write the alignment to a text file.

    The alignment is written in chunks of ``line_width`` columns. Each chunk
    is one line for seq1, one line for seq2 and a blank separator line.
    Mismatching residue pairs are wrapped in ANSI red escape codes; gaps and
    identical pairs are written as they are.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences passed to ``Needleman_Wunsch_alignment``.
    out_file : str
        Path of the text file to (over)write.
    line_width : int, optional
        Alignment columns per output line (default 150).

    Raises
    ------
    ValueError
        If ``line_width`` is not positive.
    """
    if line_width <= 0:
        raise ValueError("line_width must be a positive integer")

    aseq1, aseq2 = Needleman_Wunsch_alignment(seq1, seq2)
    red, reset = '\033[31m', '\033[0m'

    final_lines = []
    # Iterate chunk by chunk so the last (possibly only) partial chunk is
    # written too, and no empty chunk is emitted at the start.
    for start in range(0, len(aseq1), line_width):
        seq1_parts = []
        seq2_parts = []
        chunk = zip(aseq1[start:start + line_width], aseq2[start:start + line_width])
        for res1, res2 in chunk:
            is_mismatch = res1 != "-" and res2 != "-" and res1 != res2
            if is_mismatch:
                seq1_parts.append(red + res1 + reset)
                seq2_parts.append(red + res2 + reset)
            else:
                seq1_parts.append(res1)
                seq2_parts.append(res2)
        final_lines.append("".join(seq1_parts) + "\n")
        final_lines.append("".join(seq2_parts) + "\n")
        final_lines.append("\n")

    with open(out_file, "w") as f:
        f.writelines(final_lines)


In [4]:

# Residue names that process_pdb() never treats as candidate ligands.
ignored_list = ["HOH", "PO4"]
# Default cut-off for the minimum ligand-atom / chain-CA distance below which a chain is kept
# (units presumably Angstrom, as in PDB coordinates -- TODO confirm).
min_dist_threshold = 5

In [5]:
# Load the prepared table; the first CSV column (PDB ids) becomes the index.
# process_pdb() reads the "smiles" and "seq" columns from this frame.
df=pd.read_csv("prepared.csv",index_col=0)

In [6]:
# Preview the loaded table (index: pdbid; columns: value, smiles, seq).
df

Unnamed: 0_level_0,value,smiles,seq
pdbid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6TRX,14,NC(CC(=O)N1CCN(CC1)C(c1ccc(F)cc1)c1ccc(F)cc1)C...,MWKRSEQMKIKSGKCNMAAAMETEQLGVEIFETADCEENIESQDRP...
7LQZ,65,COc1cc(CC(=O)OCC2=C[C@H]3[C@H]4OC5(Cc6ccccc6)O...,MKKWSSTDLGAAADPLQKDTCPDPLDGDPNSRPPPAKPQLSTAKSR...
7T6S,44,CC(C)c1c(NC(=O)Nc2ccc(Cl)cc2)c(=O)n(-c2ccccc2)n1C,METNFSTPLNEYEEVSYESAGYTVLRILPLVVLGVTFVLGVLGNGL...
7SHV,0.13,OCCn1cc(c(n1)-c1ccncc1)-c1ccc2C(CCc2c1)N=O,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...
6LR4,28,CC(C)[C@H](O)C(=O)N[C@@H](C)C(=O)N[C@H]1c2cccc...,MTELPAPLSYFQNAQMSEDNHLSNTVRSQNDNRERQEHNDRRSLGH...
...,...,...,...
7LR0,20,COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O,MKKWSSTDLGAAADPLQKDTCPDPLDGDPNSRPPPAKPQLSTAKSR...
7EW2,5.9,CCCCCCCCc1ccc(CC[C@](N)(CO)COP(O)(O)=O)cc1,MATALPPRLQPVRGNETLREHYQYVGKLAGRLKEASEGSTLTTVLF...
5SIJ,1.8,Cc1cnc(C)c2nc(CCc3nc(cn3C)-c3ccccc3)nn12,MRIEERKSQHLTGLTDEKVKAYLSLHPQVLDEFVSESVSAETVEKW...
7NA4,3,CCOC(C1CC1)c1nc2cc(nc(-c3cncc(Cl)c3)c2n1C(C)[C...,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...


In [None]:
import os
# Create the output directory; exist_ok=True keeps this cell re-runnable
# (os.mkdir raises FileExistsError once the directory exists).
os.makedirs("dataset", exist_ok=True)

In [8]:
import urllib

def get_pdb(pdbid):
    """Make sure ``dataset/{pdbid}/pdb{pdbid.lower()}.ent`` exists for the given entry.

    The file is first requested in PDB format through ``PDB.PDBList``. If it
    is still missing afterwards, the mmCIF file is downloaded from
    files.rcsb.org into ``dataset/{pdbid}_IP`` and converted with the
    external ``obabel`` executable.

    Raises
    ------
    subprocess.CalledProcessError
        If ``obabel`` exits with a non-zero status.
    FileNotFoundError
        If the converted file is still missing after the fallback.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    target_dir = f"dataset/{pdbid}"
    ent_path = f"{target_dir}/pdb{pdbid.lower()}.ent"

    pdbl = PDB.PDBList()
    pdbl.retrieve_pdb_file(pdbid, pdir=target_dir, file_format="pdb")
    if os.path.exists(ent_path):
        return

    print("Download and convert .cif file")
    os.makedirs(f"dataset/{pdbid}_IP", exist_ok=True)
    os.makedirs(target_dir, exist_ok=True)

    # use urlretrieve to get the .cif file
    cif_path = f"dataset/{pdbid}_IP/{pdbid}.cif"
    url = f"https://files.rcsb.org/download/{pdbid}.cif"
    urllib.request.urlretrieve(url, cif_path)

    # convert .cif to .pdb; check=True surfaces a failed conversion here
    # instead of as a confusing parser error later.
    subprocess.run(["obabel", cif_path, "-O", ent_path], check=True)
    if not os.path.exists(ent_path):
        raise FileNotFoundError(f"obabel did not produce {ent_path}")


In [9]:
def process_pdb(pdbid, maximum_chain_dist=min_dist_threshold):
    """Extract the ligand of interest and its neighbouring protein chains for one entry.

    Workflow (all paths relative to the working directory):

    1. Fetch and parse ``dataset/{pdbid}/pdb{pdbid.lower()}.ent``.
    2. Write every non-standard residue with more than one atom (and not in
       ``ignored_list``) to ``dataset/{pdbid}_IP/lig_{i}.pdb`` and convert it
       to SDF with ``obabel``.
    3. Pick the candidate whose Morgan fingerprint is most similar (Dice) to
       the reference SMILES in ``df``, transfer bond orders from the
       reference and write ``dataset/{pdbid}/ligand.sdf``.
    4. Keep chains whose minimum CA-ligand distance is at most
       ``maximum_chain_dist`` and write them to ``dataset/{pdbid}/protein.pdb``.
    5. Move the chosen ligand PDB to ``dataset/{pdbid}/ligand.pdb`` and
       delete the ``_IP`` working directory.

    Parameters
    ----------
    pdbid : str
        Index label of the entry in ``df``.
    maximum_chain_dist : float
        Distance cut-off used in step 4.

    Returns
    -------
    None
        On success, when no ligand candidate could be read, or when no chain
        is close enough to the ligand.
    (chosen_ligand, ref_mol)
        When the best candidate and the reference differ in atom count. In
        that case nothing is written for the entry and the ``_IP`` directory
        is left in place.
    """
    get_pdb(pdbid)
    struc=parser.get_structure(pdbid, f"dataset/{pdbid}/pdb{pdbid.lower()}.ent")

    work_dir = f"dataset/{pdbid}_IP"
    os.makedirs(work_dir, exist_ok=True)

    # get chain sequences (standard amino acids only)
    seqs={}
    for chain in struc[0].child_dict.values():
        seqs[chain.id]="".join([PDB.Polypeptide.three_to_one(res.get_resname()) for res in chain.get_residues() if PDB.Polypeptide.is_aa(res, standard=True)])

    # get ligand candidates from pdb
    interesting_ligand_id = 0
    for residue in struc[0].get_residues():
        if PDB.Polypeptide.is_aa(residue, standard=True):
            continue
        print(residue.get_resname())
        if residue.get_resname() in ignored_list or len(residue.child_list) <= 1:
            continue
        lig_pdb = f"{work_dir}/lig_{interesting_ligand_id}.pdb"
        lig_sdf = f"{work_dir}/lig_{interesting_ligand_id}.sdf"
        # Copy taken before the first attempt so the fallback works on an untouched residue.
        copied_residue = residue.copy()
        # save residue as ligand
        try:
            io = PDB.PDBIO()
            io.set_structure(residue)
            io.save(lig_pdb, AltlocSelect())
        except Exception:
            # Fall back to writing the pristine copy without altloc filtering.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            io = PDB.PDBIO()
            io.set_structure(copied_residue)
            io.save(lig_pdb)
        subprocess.call(["obabel", lig_pdb, "-O", lig_sdf])
        interesting_ligand_id += 1

    # Load the candidates. `ligand_file_ids[k]` is the lig_{i} file index of `ligands[k]`,
    # so unreadable candidates can be skipped without losing the file mapping.
    ligands = []
    ligand_file_ids = []
    for i in range(interesting_ligand_id):
        sdf_path = f"{work_dir}/lig_{i}.sdf"
        mol = Chem.MolFromMolFile(sdf_path, sanitize=False) if os.path.exists(sdf_path) else None
        if mol is None:
            print(f"Warning! Could not read ligand candidate {i} for {pdbid}, skipping it!")
            continue
        mol_noH = Chem.RemoveHs(mol, sanitize=False)
        # NOTE(review): 268435455-2 is presumably "all sanitization steps except the
        # property/valence check" -- confirm against RDKit's SanitizeFlags.
        Chem.rdmolops.SanitizeMol(mol_noH, 268435455-2)
        ligands.append(mol_noH)
        ligand_file_ids.append(i)

    if len(ligands) == 0:
        print(f"Warning! No ligands found for {pdbid}!")
        return None

    ref_mol=Chem.MolFromSmiles(df.loc[pdbid]["smiles"])
    ref_mol_fp=AllChem.GetMorganFingerprintAsBitVect(ref_mol,2,nBits=1024)

    # get all similarities and keep the best-matching candidate
    similarity = [Chem.DataStructs.DiceSimilarity(ref_mol_fp, AllChem.GetMorganFingerprintAsBitVect(ligand, 2,nBits=1024)) for ligand in ligands]
    chosen_ligand_idx = int(np.argmax(similarity))
    chosen_ligand = Chem.RemoveHs(ligands[chosen_ligand_idx], sanitize=False)

    if chosen_ligand.GetNumAtoms() != ref_mol.GetNumAtoms():
        print(f"Warning! Best matched ligand has different number of heavy atoms for {pdbid}!")
        return chosen_ligand, ref_mol

    fixed_ligand = AllChem.AssignBondOrdersFromTemplate(ref_mol, chosen_ligand)
    Chem.MolToMolFile(fixed_ligand, f"dataset/{pdbid}/ligand.sdf")

    # Identity of every non-empty chain against the reference sequence. Empty
    # chains (no standard residues) are left out so they cannot count as a match.
    ref_chain_seq = df.loc[pdbid]["seq"]
    seq_sims = {}
    for chain_id in seqs:
        if len(seqs[chain_id]) == 0:
            continue
        seq_sims[chain_id] = calc_seq_align_identity(ref_chain_seq, seqs[chain_id])

    # Chains close enough to the ligand.
    satisfied_chains = []
    satisfied_chain_sims = []
    for seqid in seqs:
        if len(seqs[seqid]) == 0:
            continue
        min_dist = get_min_dist_between_lig_and_chain(fixed_ligand, struc[0][seqid])
        print("chain", seqid, "min_dist", min_dist)
        if min_dist > maximum_chain_dist:
            continue
        satisfied_chains.append(seqid)
        satisfied_chain_sims.append(seq_sims[seqid])

    if len(satisfied_chains) == 0:
        print(f"Warning! No chains near ligand of interest for {pdbid}!")
        return None

    # At least one non-empty chain exists here, so seq_sims is not empty.
    if max(seq_sims.values()) != 1:
        max_seq_sim_id = max(seq_sims, key=seq_sims.get)
        print(f"Warning! Dissimilar chain for {pdbid} ({seq_sims[max_seq_sim_id]})!")

        write_alignment(ref_chain_seq, seqs[max_seq_sim_id], f"dataset/{pdbid}/seq_alignment.txt")

    if max(satisfied_chain_sims) != 1:
        print(f"Warning! Matched sequence not in contact with ligand for {pdbid}!")

    save_multiple_chains([struc[0][chain_id] for chain_id in satisfied_chains], \
                          f"dataset/{pdbid}/protein.pdb", AAOnly())

    # Keep the PDB of the chosen candidate and drop the working directory.
    chosen_file_id = ligand_file_ids[chosen_ligand_idx]
    shutil.move(f"{work_dir}/lig_{chosen_file_id}.pdb", f"dataset/{pdbid}/ligand.pdb")
    shutil.rmtree(work_dir)



In [40]:
# Run the extraction for every PDB id in the table. Return values are discarded,
# and an exception raised for one entry stops the whole loop.
for pdbid in df.index:
    process_pdb(pdbid)

Structure exists: 'dataset/6TRX/pdb6trx.ent' 
Structure exists: 'dataset/7LQZ/pdb7lqz.ent' 
Structure exists: 'dataset/7T6S/pdb7t6s.ent' 
Structure exists: 'dataset/7SHV/pdb7shv.ent' 
Structure exists: 'dataset/6LR4/pdb6lr4.ent' 
Structure exists: 'dataset/7Y5T/pdb7y5t.ent' 
Structure exists: 'dataset/7DMC/pdb7dmc.ent' 
Structure exists: 'dataset/7B91/pdb7b91.ent' 
Structure exists: 'dataset/7QQ6/pdb7qq6.ent' 
Structure exists: 'dataset/7X5H/pdb7x5h.ent' 
Structure exists: 'dataset/7XTB/pdb7xtb.ent' 
Structure exists: 'dataset/7QNE/pdb7qne.ent' 
Structure exists: 'dataset/7E9H/pdb7e9h.ent' 
Structure exists: 'dataset/7MU5/pdb7mu5.ent' 
Structure exists: 'dataset/7EJ8/pdb7ej8.ent' 
Structure exists: 'dataset/7JVR/pdb7jvr.ent' 
Structure exists: 'dataset/7CX2/pdb7cx2.ent' 
Structure exists: 'dataset/7CMV/pdb7cmv.ent' 
Structure exists: 'dataset/7EJ0/pdb7ej0.ent' 
Structure exists: 'dataset/7EJK/pdb7ejk.ent' 
Structure exists: 'dataset/7CMU/pdb7cmu.ent' 
Structure exists: 'dataset/7DFL/pd

# Final dataset

In [2]:
import pandas as pd
# NOTE(review): "bindingDB_processed.csv" is the file written by the last cell of this
# section, and df_final is re-assigned from "prepared_final.csv" in the next cell -- this
# load looks like a leftover from a later session; confirm whether it is still needed.
df_final=pd.read_csv("bindingDB_processed.csv",index_col=0)

In [16]:
# Load the final table (first CSV column as index); the following cells add the
# "accurate" and "pKa" columns to it.
df_final=pd.read_csv("prepared_final.csv",index_col=0)

In [36]:
# Flag rows whose value does not contain '<' (entries with '<' are presumably
# upper bounds rather than exact measurements -- confirm with the data source).
# astype(str) keeps this working when the column was parsed as numeric or holds NaN,
# and regex=False matches the character literally.
df_final["accurate"]=~df_final["value"].astype(str).str.contains("<", regex=False)

In [42]:
# Negative log10 of the value scaled by 1e-9 (value presumably in nM, giving molar -- TODO confirm);
# a leading '<' is stripped first. str(x) lets numeric entries through unchanged.
# The column name "pKa" is kept as-is because the saved CSV exposes it.
df_final["pKa"]=-np.log10(df_final["value"].apply(lambda x: float(str(x).replace("<","")))*1e-9)

In [45]:
# Persist the table, including the derived "accurate" and "pKa" columns.
df_final.to_csv("bindingDB_processed.csv")