In [1]:
import os
from pathlib import  Path
import glob
import pandas as pd
import pickle
import numpy as np
from pymol import cmd
from tqdm import tqdm

In [None]:
PROTAC_complex_path = 'protacs' # the complex file path, you should change.
PROTAC_data_path = 'protacs.csv' # the csv file path, including the label information, you should change.

In [2]:
name_list = os.listdir(PROTAC_complex_path)
name_list = [x.split('.')[0] for x in name_list]

In [3]:
name_list[:5]

['318_BCR-ABL_CRBN',
 '2023_CDK4_CRBN',
 '1550_EGFR-L858R-T790M_CRBN',
 '194_BRD4_CRBN',
 '2293_PDL1_CRBN']

In [4]:
Path('data').mkdir(exist_ok=True)
Path('pocket_target').mkdir(exist_ok=True)
Path('pocket_ligase').mkdir(exist_ok=True)
Path('ligand_target').mkdir(exist_ok=True)
Path('ligand_ligase').mkdir(exist_ok=True)
Path('ligand_target').mkdir(exist_ok=True)
Path('ligand_ligase').mkdir(exist_ok=True)

In [5]:
with open('data/name.pkl','wb') as f:
    pickle.dump(name_list, f)

In [6]:
for i in name_list:
    cmd.load(glob.glob('protacs/'+i+"/*rotein.pdb")[0])
    cmd.remove('h.')
    cmd.select("target","byres chain C around 5 and chain A")
    cmd.save("pocket_target/"+i+".mol2","target")
    cmd.select("ligand_target","chain C")
    cmd.save("ligand_target/"+i+".sdf","ligand_target")
    cmd.delete("all")

    cmd.load(glob.glob('protacs/'+i+"/*igase.pdb")[0])
    cmd.remove('h.')
    cmd.select("ligase","byres chain D around 5 and Chain B")
    cmd.save("pocket_ligase/"+i+".mol2","ligase")
    cmd.select("ligand_ligase","chain D")
    cmd.save("ligand_ligase/"+i+".sdf","ligand_ligase")
    cmd.delete("all")

 PyMOL not running, entering library mode (experimental)


In [7]:
ligase_atom = {}
ligase_bond = {}
for name in name_list:
    with open('pocket_ligase/'+name+".mol2") as f:
        lines = f.readlines()
    atoms = lines[lines.index('@<TRIPOS>ATOM\n')+1:lines.index('@<TRIPOS>BOND\n')]
    bonds = lines[lines.index('@<TRIPOS>BOND\n')+1:lines.index('@<TRIPOS>SUBSTRUCTURE\n')]
    atom = [i.split('\t')[1].strip()[0] for i in atoms]
    CODE = ['C','N','O','S']
    atom_code = [CODE.index(x) if x in atom else 4 for x in atom]
    bond_1 = [int(i.split(' ')[1])-1 for i in bonds]
    bond_2 = [int(i.split(' ')[2])-1 for i in bonds]
    bond = np.eye(len(atom))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1
    ligase_atom[name] = atom_code
    ligase_bond[name] = bond
with open('data/ligase_atom.pkl','wb') as f:
    pickle.dump(ligase_atom, f)
with open('data/ligase_bond.pkl','wb') as f:
    pickle.dump(ligase_bond, f)

In [8]:
target_atom = {}
target_bond = {}
for name in name_list:
    with open('pocket_target/'+name+'.mol2') as f:
        lines = f.readlines()
    atoms = lines[lines.index('@<TRIPOS>ATOM\n')+1:lines.index('@<TRIPOS>BOND\n')]
    bonds = lines[lines.index('@<TRIPOS>BOND\n')+1:lines.index('@<TRIPOS>SUBSTRUCTURE\n')]
    atom = [i.split('\t')[1].strip()[0] for i in atoms]
    CODE = ['C','N','O','S']
    atom_code = [CODE.index(x) if x in atom else 4 for x in atom]
    bond_1 = [int(i.split(' ')[1])-1 for i in bonds]
    bond_2 = [int(i.split(' ')[2])-1 for i in bonds]
    bond = np.eye(len(atom))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1
    target_atom[name] = atom_code
    target_bond[name] = bond
with open('data/target_atom.pkl','wb') as f:
    pickle.dump(target_atom, f)
with open('data/target_bond.pkl','wb') as f:
    pickle.dump(target_bond, f)

In [9]:
ligase_ligand_atom = {}
ligase_ligand_bond = {}
for name in name_list:
    with open('ligand_ligase/'+name+'.sdf') as f:
        lines = f.readlines()
    atoms = []
    bond_1 = []
    bond_2 = []
    for line in lines:
        if len(line.strip().split()) == 16:
            atoms.append(line.strip().split()[3])
        if len(line.strip().split()) == 4:
            bond_1.append(int(line.strip().split()[0])-1)
            bond_2.append(int(line.strip().split()[1])-1)

    CODE = ['C','N','O','S','F','Cl','Br','I','P']
    atom_code = [CODE.index(x) if x in atoms else 9 for x in atoms]
    bond = np.eye(len(atom_code))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1

    ligase_ligand_atom[name] = atom_code
    ligase_ligand_bond[name] = bond
with open('data/ligand_ligase_atom.pkl','wb') as f:
    pickle.dump(ligase_ligand_atom, f)
with open('data/ligand_ligase_bond.pkl','wb') as f:
    pickle.dump(ligase_ligand_bond, f)

In [10]:
target_ligand_atom = {}
target_ligand_bond = {}
for name in name_list:
    with open('ligand_target/'+name+'.sdf') as f:
        lines = f.readlines()
    atoms = []
    bond_1 = []
    bond_2 = []
    for line in lines:
        if len(line.strip().split()) == 16:
            atoms.append(line.strip().split()[3])
        if len(line.strip().split()) == 4:
            bond_1.append(int(line.strip().split()[0])-1)
            bond_2.append(int(line.strip().split()[1])-1)

    CODE = ['C','N','O','S','F','Cl','Br','I','P']
    atom_code = [CODE.index(x) if x in atoms else 9 for x in atoms]
    bond = np.eye(len(atom_code))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1

    target_ligand_atom[name] = atom_code
    target_ligand_bond[name] = bond
with open('data/ligand_target_atom.pkl','wb') as f:
    pickle.dump(target_ligand_atom, f)
with open('data/ligand_target_bond.pkl','wb') as f:
    pickle.dump(target_ligand_bond, f)

In [11]:
SMILES_CHAR =['?','C', '(', '=', 'O', ')', 'N', '[', '@', 'H', ']', '1', 'c', 'n', '/', '2', '#', 'S', 's', '+', '-', '\\', '3', '4', 'l', 'F', 'o', 'I', 'B', 'r', 'P', '5', '6', 'i', '7', '8', '9', '%', '0', 'p']

def trans_smiles(x):
    temp = list(x)
    temp = [SMILES_CHAR.index(i) if i in SMILES_CHAR else 0 for i in temp]
    return temp

smiles = {}
for i in name_list:
    smi_num = i.split("_")[0]
    if Path("protacs/"+i+"/linker_"+smi_num+".smi").exists():
        with open("protacs/"+i+"/linker_"+smi_num+".smi") as f:
            smi = f.read()
        smiles[i] = trans_smiles(smi.strip())
    else:
        smiles[i]= [0]
with open("data/smiles.pkl","wb") as f:
    pickle.dump(smiles,f)

In [12]:
label_csv = pd.read_csv(PROTAC_data_path)
label_csv[:5]

Unnamed: 0,Compound ID,Uniprot,Target,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),Percent degradation (%),...,Degradation Assay,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96
0,1,Q9NPI1,BRD7,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,,...,% BRD7 degradation in HeLa cells after 4/16 h ...,,,,,,,,,
1,1,Q9H8M2,BRD9,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,,...,% BRD9 degradation in HeLa cells after 4/16 h ...,,,,,,,,,
2,2,Q9NPI1,BRD7,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,,...,% BRD7 degradation in HeLa cells after 4/16 h ...,,,,,,,,,
3,2,Q9H8M2,BRD9,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,,...,% BRD9 degradation in HeLa cells after 4/16 h ...,,,,,,,,,
4,3,Q9H8M2,BRD9,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,,...,% BRD9 degradation in HeLa cells after 4/16 h ...,,,,,,,,,


In [13]:
label_csv = pd.read_csv("protacs.csv")
id = list(label_csv["Compound ID"])
tar = list(label_csv["Target"])
e3  = list(label_csv["E3 Ligase"])
lab = list(label_csv["Degradation Identification"])
labels = {}
for i in range(len(id)):
    a = str(id[i])+"_"+tar[i].split('_')[0].replace(' ','-').replace('/','-')+"_"+e3[i].split('_')[0]
    labels[a] = lab[i]
for i in name_list:
    if i not in labels:
        print(i) 
labels_code = {}
for i in labels:
    if labels[i]=='Good' or labels[i]=='Excellent':
        labels_code[i] = 1
    elif  labels[i]=='Moderate' or labels[i]=="Not Good" or labels[i]=='Poor':
        labels_code[i] = 0
for i in name_list:
    if i not in labels_code:
        print(i)
with open("data/label.pkl","wb") as f:
    pickle.dump(labels_code, f)