### Import Module:

In [2]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
import pandas as pd
import os

In [1]:
import rdkit
rdkit.__version__

'2021.09.4'

### Converting sequences into SMILEs and Mol objects:

In [3]:
#convert peptide to smi:
def convert_smi(peptide):
    peptide_smile = Chem.MolToSmiles(Chem.MolFromFASTA(peptide))
    return peptide_smile

#convert smi to mol:
def smi2mol(smi):
    mol = Chem.MolFromSmiles(smi)
    m3d = Chem.AddHs(mol)
    return m3d

### Using ETKDG and MMFF method optimized 3D conformers:

In [18]:
def ConformerGenerate(mol,peptide,logtext="Optenergy.log",log=True,numconfs=200):
    '''
    mol is the mol object from function smi2mol(smi).
    peptide is the currently processde peptide sequence,type is String.
    logtext is the log file which stored the coverage energy information about corresponding
    conformer.
    numconfs is a int that sets the number of conformers wanted to generate.
    '''
    cids = AllChem.EmbedMultipleConfs(mol, numConfs=numconfs,clearConfs=False)   
    res = AllChem.MMFFOptimizeMoleculeConfs(mol,numThreads=0,maxIters=800) #Energy Optimization
    if log==True:
        with open(logtext,"a+") as f: #Record the Log
            f.write("*-"*8+peptide+"*-"*8+"\n")
            f.write("Conformer optimized successfully,if coverage_state equal 0. \n"+"coverage_state,conformer_energy\n")
            for c,e in res:
                f.write(str(c)+","+str(e)+"\n")
            f.write("\n")
            f.close()
    return mol,cids

### Screening the repeatting Conformers Using RMSD:

In [19]:
def findsameconfs(mol,confs_num):
    """
    The RMSD value of each conformer was calculated.
    The first conformer utilized as the template to get RMSD of others.
    The RMSD values less than 1 A were removed.
    """
    count = confs_num  
    sameids = []
    confsset = [id for id in range(confs_num)]
    for confi in range(count):
        for confj in range(confi+1,count):
            rmsd = AllChem.GetConformerRMS(mol,confi,confj)
            if rmsd < 1:
                sameids.append(confi)
            sameids=list(set(sameids))
    for i in sameids:
        confsset.remove(i)
    return confsset
def retrieved_diverse_conf(mol):
    """
    Based on the RMSD values, the index of different conformers were extracted.
    """
    rmslist = []
    AllChem.AlignMolConformers(mol, RMSlist=rmslist)
    extractlist = [] 
    for i in range(len(rmslist)):  # Logging the index of conformer with RMSD > 1 A
        if rmslist[i] > 1:
            extractlist.append(i)
    extractlist.append(0)
    return extractlist


### Export the SDF file:

In [20]:
def generateMol(molname,mol,conflist,confnum,cwd,swd):
    """
    Saved the obtained conformers of each compound to a local SDF file.
    molname is the filename of this compound SDF file.
    mol is the mol object obtained from previous step.
    conflist is the index list of various conformers.
    cwd is the current working directory.
    swd is the path wanted to export sdf file.
    """
    w = Chem.SDWriter(molname+".sdf")
    count = 0
    for cid in conflist:
        count += 1
        w.write(mol, confId=cid)
        if count == confnum:  #only record top 100 conformers
            break
    w.close()
    addname(cwd,molname+".sdf")   #add peptide sequence for each conformer within signle sdf
    os.rename(cwd+molname+"2.sdf",swd+molname+".sdf")#change folder of sdf
    return None

def addname(folder,filename):
    """
    Added peptide sequence String in each conformers header.
    This makes it easy to identify which peptide this conformation belongs to 
    in pymol ir Pharmit.    
    """
    with open (folder+filename,"r") as fr:
        content = fr.readlines()
        fr.close()
    conformerpart = []
    for linenum in range(len(content)):
        if content[linenum][0:22] == "     RDKit          3D":
            conformerpart.append(linenum)
        if content[linenum][0:4] == "$$$$":
            conformerpart.append(linenum)
    conformerpart.sort()
    with open (folder+filename.replace(".sdf","")+"2.sdf","w+") as fw:
        for confnum in range(int(len(conformerpart)/2)):
            later = conformerpart.pop()
            former = conformerpart.pop()
            fw.write(filename+"\n")
            for line in range(former,later+1):
                fw.write(content[line])
        fw.close()
    return None

### Load Smiles, generate Mol and add H atom for Molecules:

In [21]:
#Path to a folder that holds csv files of Data sequences: 
folder = ".\\"
file=folder+"example.csv"
print("Note! your input data is "+file)

Note! your input data is .\example.csv


In [22]:
data = pd.read_csv(file)
print(data.shape)
data.head()

(2, 7)


Unnamed: 0,pepname,HIA,BBB,hepatotoxicity,nephrotoxicity,score,score.1
0,PPCPW,0.6166,-0.6,-0.5125,-0.8811,0.02364,2.363987
1,VVCTY,0.9193,-0.725,-0.6038,-0.5694,0.015041,1.504145


### Loop:

In [23]:
cwd=".\\" 
swd=".\\output\\"
confnum=100
print("Note! the current work path is "+cwd)
print("Note! the sdf were stored in path "+swd)
print("Note! the number of exporting conformers are "+str(confnum))
peptideseq = data["pepname"]
for pep in peptideseq:
    try:
        smi = convert_smi(pep)
        mol = smi2mol(smi)
        mol,cids = ConformerGenerate(mol,pep,logtext="Optenergy.log",log=True,numconfs=200) #200 is the generated conformers
        elist = findsameconfs(mol,confs_num=len(cids))
        generateMol(pep,mol,elist,confnum,cwd,swd) 
    except Exception as e:
        print(e)
print("successfully!")

Note! the current work path is .\
Note! the sdf were stored in path .\output\
Note! the number of exporting conformers are 100
successfully!


### Export SMILEs data as csv file:

In [None]:
peptideseq = data["pepname"]  #extract the sequence data
smilefilename = "PeptideSmi.csv"  #Filename of output file

In [15]:
with open(smilefilename,"w") as f:
    f.write("peptide,smiles\n")
    for pep in peptideseq:
        try:
            smi = convert_smi(pep)
            f.write(pep+","+smi+"\n")
        except Exception as e:
            print(e)
            continue
    f.close()

### Some non-peptide compounds:

In [25]:

# count = 0
# for c in comlist:
#     try:
#         count += 1
#         mol = smi2mol(c)
#         mol = ConformerGenerate(mol,str(count),"./clog.txt",numconfs=100)
#         elist = retrieved_diverse_conf(mol)
#         generateMol(str(count),mol,elist,cwd="E:\\jupyter_save_address\\Pharmacophore\\",swd="D:\\学习\\实验班\\紫菜蛋白\\药效团\\conformersofactive\\")
#     except Exception as e:
#         print(e)
#         pass
cwd=".\\" 
swd=".\\output\\"
confnum=100
print("Note! the current work path is "+cwd)
print("Note! the sdf were stored in path "+swd)
print("Note! the number of exporting conformers are "+str(confnum))

nameslist = ["kojic_acid","leonurine"]
comlist = ["C1=C(OC=C(C1=O)O)CO","COC1=CC(=CC(=C1O)OC)C(=O)OCCCCN=C(N)N"]
for i in range(len(nameslist)):
    try:
        mol = smi2mol(comlist[i])
        mol,cids = ConformerGenerate(mol,nameslist[i],logtext="Optenergy-2.log",log=True,numconfs=200) #200 is the generated conformers
        elist = findsameconfs(mol,confs_num=len(cids))
        generateMol(nameslist[i],mol,elist,confnum,cwd,swd) 
    except Exception as e:
        print(e)
print("successfully!")

Note! the current work path is .\
Note! the sdf were stored in path .\output\
Note! the number of exporting conformers are 100
successfully!


### Generate the conformers and check the energy information:

In [15]:
#import package:
from rdkit import Chem
from rdkit.Chem import AllChem


def gene_conf(smiles='CC1=CC=C(C=C1)NC2=C3C(=C(C=C2)NC4=CC=C(C=C4)C)C(=O)C5=CC=CC=C5C3=O',num_conf=10):
    """
    Import smile and generate the conformers.
    """
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)
    cids = AllChem.EmbedMultipleConfs(mol,numConfs=num_conf)
    print("the count of generated conformers is: ", mol.GetNumConformers())
    return mol


def calconfenergy(mol,smiles,num_conf=10):
    """
    Calculate the energy of the conformer.
    """
    ff = AllChem.MMFFOptimizeMolecule(mol)
    for conf_id in range(num_conf):
        empty_mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
        empty_mol.AddConformer(mol.GetConformer(conf_id))
        ff = AllChem.MMFFGetMoleculeForceField(empty_mol,AllChem.MMFFGetMoleculeProperties(empty_mol))
        energy = ff.CalcEnergy()
        print("the No."+str(conf_id+1)+" energy is: ",round(energy,3))
    return None

if __name__  == "__main__":
    mol = gene_conf()
    calconfenergy(mol,'CC1=CC=C(C=C1)NC2=C3C(=C(C=C2)NC4=CC=C(C=C4)C)C(=O)C5=CC=CC=C5C3=O')

the count of generated conformers is:  10
the No.1 energy is:  102.334
the No.2 energy is:  135.442
the No.3 energy is:  132.882
the No.4 energy is:  133.967
the No.5 energy is:  133.917
the No.6 energy is:  136.286
the No.7 energy is:  140.83
the No.8 energy is:  130.797
the No.9 energy is:  135.96
the No.10 energy is:  132.905


### Merge multiple decoy SDF into One：

In [8]:
import os
files = os.listdir("D:\\学习\\实验班\\紫菜蛋白\\药效团\\诱导剂分子构象\\decoy\\decoy\\")

In [9]:
"""
Since I have so many inducers and the numbering is very long.
here they are renamed using letters plus numbers.
"""

def readfiletostring(filename,content,newname):
    with open("D:\\学习\\实验班\\紫菜蛋白\\药效团\\诱导剂分子构象\\decoy\\decoy\\"+filename,"r") as rf:
        compoundname = filename.replace(".sdf","")
        content += rf.read().replace(compoundname,newname)
        rf.close()
    return content

def writestringtofile(filename,content):
    with open("D:\\学习\\实验班\\紫菜蛋白\\药效团\\诱导剂分子构象\\decoy\\decoy\\"+filename+".sdf","w") as wf:
        wf.write(content)
        content = ""
        wf.close()
    return content


filenumber = len(files)
content = ""
index = 0
itemlist = [50,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000,1050,1100,1150,1200,1250,1300,1350,1400,1403]
words = ["aa","ab","ac","ad","ae","af","ag","ah","ai","aj","ak","al","am","an","ao","ap","aq","ar","as","at","au","av","aw","ax","ay","az","ba","bb","bc","bd","be","bf","bg","bh","bi","bj","bk","bl","bm","bn","bo","bp","bq","br","bs","bt","bu","bv","bw","bx","by","bz"]
wordsindex = 0
namerecord = ""
for i in range(1,filenumber+1):
    filename = files.pop()
    name = str(i)+words[wordsindex]
    logcurrent = filename+","+name+"\n"
    namerecord+=logcurrent
    content = readfiletostring(filename,content,name)
    if i == itemlist[index]:
        content = writestringtofile("sdfs_"+str(index),content)
        index+=1
        wordsindex+=1

In [11]:
"""
A text record the origin name the corresponding novel name.
"""
with open ("namerecord.txt","w") as f:
    f.write("lastname,nwename\n")
    f.write(namerecord)