In [1]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Draw
import numpy as np
import pandas as pd
from rdkit.Chem import AllChem
from rdkit.Chem.Descriptors import NumValenceElectrons
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import GetUSRScore, GetUSRCAT

In [None]:
# SMILES standardization: read the raw table from Davis.csv into `df`.
path = 'Davis.csv'
df = pd.read_csv(filepath_or_buffer=path)
def Standardization(smi):
    """Round-trip a SMILES string through RDKit to standardize it.

    The input is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles``.

    Parameters
    ----------
    smi : str
        SMILES string to standardize.

    Returns
    -------
    str
        The SMILES that ``Chem.MolToSmiles`` produces for the parsed molecule.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``smi``, i.e. the
        string could not be parsed.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Report the offending input instead of letting MolToSmiles(None)
        # fail with an argument error that does not mention the SMILES.
        raise ValueError("Could not parse SMILES: {!r}".format(smi))
    return Chem.MolToSmiles(mol)
# Replace the SMILES column with its standardized form, then save the table
# (without the index column) to Davis_sta.csv.
save_path = "Davis_sta.csv"
df['SMILES'] = df["SMILES"].apply(Standardization)
df.to_csv(path_or_buf=save_path, index=False)

In [218]:
#smile->sdf
# Turn every SMILES in Davis_sta.csv into a 3D structure (explicit hydrogens,
# embedded coordinates, MMFF optimization) and write them all to Davis.sdf.
X = pd.read_csv('Davis_sta.csv')['SMILES']
smilesList = X.tolist()
mols = []
for smiles in smilesList:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    mol = AllChem.AddHs(mol)
    # EmbedMolecule returns -1 when no conformer could be generated; stop with
    # the offending SMILES instead of failing later inside the optimizer.
    # A fixed randomSeed makes the generated coordinates reproducible.
    if AllChem.EmbedMolecule(mol, useRandomCoords=True, randomSeed=42) < 0:
        raise ValueError("3D embedding failed for SMILES: {!r}".format(smiles))
    # Same iteration limit as the USRCAT cell, whose comment notes that the
    # default of 200 iterations sometimes does not converge.
    AllChem.MMFFOptimizeMolecule(mol, maxIters=2000)
    mols.append(mol)

writer = Chem.SDWriter('Davis.sdf')
try:
    for mol in mols:
        writer.write(mol)
finally:
    # Always flush and release the output file, even if a write fails.
    writer.close()


In [2]:
#Butina
def ClusterFps(fps,cutoff=0.2):
    """Cluster fingerprints with ``Butina.ClusterData`` on Tanimoto distances.

    Parameters
    ----------
    fps : sequence
        Fingerprints; must support ``len()``, indexing and slicing, and be
        accepted by ``DataStructs.BulkTanimotoSimilarity``.
    cutoff : float, optional
        Distance threshold handed to ``Butina.ClusterData`` (default 0.2).

    Returns
    -------
    Whatever ``Butina.ClusterData`` returns for the computed distances.
    """
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    n_fingerprints = len(fps)

    # Flattened lower triangle of the distance matrix: fingerprint `idx` is
    # compared against every fingerprint before it, and each similarity is
    # converted to a distance as 1 - similarity.
    distances = [
        1 - similarity
        for idx in range(1, n_fingerprints)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
    ]

    return Butina.ClusterData(distances, n_fingerprints, cutoff, isDistData=True)

In [3]:
# Load the structures from Davis.sdf. SDMolSupplier yields None for records it
# cannot parse, so those are dropped here (the scaffold cell filters the same
# way) instead of crashing the fingerprint call.
ms = [mol for mol in Chem.SDMolSupplier('Davis.sdf') if mol is not None]
# Morgan bit-vector fingerprints with radius 2 and 1024 bits.
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in ms]
# Cluster with a distance cutoff of 0.5; the clustering is computed over
# positions in `fps` (and therefore in `ms`).
clusters = ClusterFps(fps, cutoff=0.5)


In [None]:
#Compound scaffold
# Read Davis.sdf, keep only the records that yielded a molecule, and derive
# the Murcko scaffold of each one.
drugbank_input = Chem.SDMolSupplier('Davis.sdf')
drugbank = list(filter(None, drugbank_input))
drugbank_atomic_scaffolds = list(map(MurckoScaffold.GetScaffoldForMol, drugbank))
# Compute 2D coordinates for every scaffold.
for scaffold in drugbank_atomic_scaffolds:
    scaffold.Compute2DCoords()

def genericize_scaffold(s):
    """Return the generic version of scaffold ``s``, or ``None`` on failure.

    Delegates to ``MurckoScaffold.MakeScaffoldGeneric``. A ``ValueError``
    raised by that call is caught and turned into ``None``; any other
    exception propagates.
    """
    try:
        generic = MurckoScaffold.MakeScaffoldGeneric(s)
    except ValueError:
        generic = None
    return generic
import collections

# Generic scaffold for every atomic scaffold. An entry is None where
# genericize_scaffold caught a ValueError.
drugbank_grafh_scaffolds = [genericize_scaffold(s) for s in drugbank_atomic_scaffolds]

# SMILES of the generic scaffolds. None entries are filtered out *before*
# MolToSmiles is called, because MolToSmiles cannot handle None.
scaffold_smiles = [
    Chem.MolToSmiles(scaffold)
    for scaffold in drugbank_grafh_scaffolds
    if scaffold is not None
]
for a in scaffold_smiles:
    print(a)

# Number of occurrences of each distinct generic-scaffold SMILES.
counter = collections.Counter(scaffold_smiles)

print(counter)





In [None]:
#usrcat
# Read one SMILES per line from Davis.txt. The context manager closes the file
# once it has been read; blank lines are skipped so they are never handed to
# RDKit.
with open('Davis.txt') as fh:
    smis = [line.strip() for line in fh if line.strip()]

# Build one 3D conformer (with explicit hydrogens) per SMILES.
mols3d = []
for smi in smis:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError("Could not parse SMILES: {!r}".format(smi))
    m2 = Chem.AddHs(mol)
    # EmbedMolecule returns -1 when no conformer could be generated; stop with
    # the offending SMILES instead of failing later inside the optimizer.
    # A fixed randomSeed makes the coordinates, and hence the scores written
    # below, reproducible.
    if AllChem.EmbedMolecule(m2, useRandomCoords=True, randomSeed=42) < 0:
        raise ValueError("3D embedding failed for SMILES: {!r}".format(smi))
    # maxIters defaults to 200 optimization iterations, which sometimes does
    # not converge; setting it to 2000 is recommended.
    AllChem.MMFFOptimizeMolecule(m2, maxIters=2000)
    mols3d.append(m2)

# USRCAT descriptor of every molecule.
usrcats = [GetUSRCAT(mol) for mol in mols3d]

# Write "i j score" for every ordered pair of molecules, self-pairs included.
with open('Davis_usr.txt','wt') as f:
    for i, usrcat_i in enumerate(usrcats):
        for j, usrcat_j in enumerate(usrcats):
            score = GetUSRScore(usrcat_i, usrcat_j)
            print(i, j, score, file=f)


