# RDKit拓扑分子指纹

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [2]:
# Parse each SMILES string into an RDKit molecule object.
ms = [Chem.MolFromSmiles(smiles) for smiles in ('CCOC', 'CCO', 'COC')]
# Topological (RDKit) fingerprint of every molecule; the default length is 2048 bits.
fps = list(map(Chem.RDKFingerprint, ms))
fps
# Handy inspections: len(fps[0]) gives the bit count,
# fps[0].ToBitString() gives the raw bit string.

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x183c22018c0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x183c2201930>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x183c22019a0>]

In [3]:
# FingerprintSimilarity uses DataStructs.TanimotoSimilarity as its default
# metric, so both calls below print the same Tanimoto coefficient.
print(
    DataStructs.FingerprintSimilarity(fps[0], fps[1]),
    DataStructs.TanimotoSimilarity(fps[0], fps[1]),
    sep='\n',
)

0.6
0.6


# Morgan指纹（圆形指纹）

以稀疏计数向量（UIntSparseIntVect）方式生成摩根指纹：`GetMorganFingerprint(mol, radius)`，其中 radius 为考虑半径。返回的向量记录每个特征出现的次数，而不仅仅是该特征是否存在。

In [22]:
# Morgan (circular) fingerprint of every molecule in `ms`.
mfp = [
    AllChem.GetMorganFingerprint(mol, 2)  # second argument is the radius
    for mol in ms
]
# Length of the fingerprint vector of the first molecule.
mfp[0].GetLength()

4294967295

In [23]:
# Show the non-zero entries of the third molecule's ('COC') Morgan fingerprint
# as a dict mapping each feature identifier to its stored value.
mfp[2].GetNonzeroElements()

{864674487: 1, 2154640335: 1, 2246728737: 2, 3975275337: 2}

以ExplicitBitVects方式生成摩根指纹：`GetMorganFingerprintAsBitVect(mol, radius, nBits)`，其中 radius 为考虑半径，nBits 为指纹长度。

In [25]:
# Morgan fingerprints folded into fixed-length bit vectors (radius 2, 10 bits).
mfp = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=10) for mol in ms]
# Print, one per line: the vector length, the bits of the first molecule, and
# the Tanimoto similarity between the first two molecules.
print(
    mfp[0].GetNumBits(),
    mfp[0].ToBitString(),
    DataStructs.TanimotoSimilarity(mfp[0], mfp[1]),
    sep='\n',
)

10
0010100100
0.6


# DrugBank中药物相似性计算

In [2]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

In [3]:
# Load the DrugBank structure table.
data = pd.read_csv('./data/structure.csv')
# The CSV has two columns: the DrugBank ID and the molecule's SMILES string.
# info() writes its summary to stdout itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()
# Display the first 6 rows.
data.head(6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11792 entries, 0 to 11791
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DRUGBANK_ID  11792 non-null  object
 1   SMILES       11792 non-null  object
dtypes: object(2)
memory usage: 184.4+ KB
None


Unnamed: 0,DRUGBANK_ID,SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
5,DB00050,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...


In [4]:
# Lookup table: DrugBank ID -> Morgan bit-vector fingerprint (radius 6, 1024 bits).
# Workflow: first convert the DrugBank drugs that are needed into fingerprints,
# then compute similarities from those fingerprints in a later cell.
Drug_fingerprints_smiles = {}
drug_name = data['DRUGBANK_ID']
drug_smiles = data['SMILES'].tolist()

# Only the first 100 rows are fingerprinted (the original author reported errors
# when processing more rows and therefore restricted the loop to the data that is
# actually used). min() keeps the loop from running past the end of a shorter file.
# NOTE(review): the reported errors were presumably caused by SMILES that RDKit
# cannot parse -- confirm before raising the limit.
n_drugs = min(100, len(drug_smiles))
for drug_id, smiles in zip(drug_name.iloc[:n_drugs], drug_smiles[:n_drugs]):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # MolFromSmiles returns None instead of raising when parsing fails;
        # passing None on to the fingerprint function would crash the loop.
        print('Skipping', drug_id, '- RDKit could not parse its SMILES')
        continue
    Drug_fingerprints_smiles[drug_id] = AllChem.GetMorganFingerprintAsBitVect(molecule, 6, nBits=1024)

In [5]:
# Table of drug pairs to compare; the columns 'drug1' and 'drug2' hold DrugBank IDs.
filename = "./data/drug_drug.csv"
drugPair = pd.read_csv(filename)

drug1 = drugPair['drug1']
drug2 = drugPair['drug2']
# Similarity of each pair, looked up from the precomputed fingerprint table.
# A pair that references an ID missing from the table raises KeyError.
drug_sim = [
    DataStructs.FingerprintSimilarity(
        Drug_fingerprints_smiles[first_id],
        Drug_fingerprints_smiles[second_id],
    )
    for first_id, second_id in zip(drug1, drug2)
]
drug_sim

[0.29256594724220625,
 0.5210355987055016,
 0.5210355987055016,
 0.1404494382022472,
 0.22727272727272727]

In [6]:
from decimal import Decimal

In [7]:
# Round every similarity to four decimal places via Decimal.quantize.
finaldrug_sim = [
    Decimal(similarity).quantize(Decimal('0.0001'))
    for similarity in drug_sim
]
# Attach the rounded coefficients to the pair table as a new 'coeff' column.
drugPair['coeff'] = finaldrug_sim
drugPair

Unnamed: 0,drug1,drug2,coeff
0,DB00006,DB00007,0.2926
1,DB00007,DB00014,0.521
2,DB00014,DB00007,0.521
3,DB00027,DB00035,0.1404
4,DB00035,DB00050,0.2273


In [13]:
# Write the drug-pair table (including the 'coeff' column) to CSV without the index.
Out_file = './data/drug_drug_coeff.csv'
# `data` is deliberately NOT rebound to drugPair here: it holds the structure
# table loaded earlier, and overwriting it makes the fingerprint cell fail with
# a KeyError on 'DRUGBANK_ID' if that cell is re-run afterwards.
df = pd.DataFrame(drugPair)
df.to_csv(Out_file, index=False)

P.S.
'./data/structure.csv' 文件的来源：先从 [DrugBank](https://go.drugbank.com/releases/5-1-11/downloads/all-structures) 下载对应的 structures.sdf，再使用 './complement/Rcode/SDFfile.R' 进行预处理得到。

# 关键算法

[FORMULA](./Formula.ipynb)

# 作用

确定不同药物（属于不同类别或结构不同的药物）的共同作用机制

# 参考资料

[Morgan Algorithm](./book/Morgan_Algorithm.pdf)(1965)

[【3.1】分子指纹提取-RDKit - Sam' Note (qinqianshan.com)](https://qinqianshan.com/biology/chemical/fp-rd/)
 
[【3.2.11】扩展连通性指纹（Extended Connectivity Fingerprints，ECFPs）原理介绍 - Sam' Note (qinqianshan.com)](https://qinqianshan.com/biology/chemical/ecfps/)

[【3.3.1】相似性-RDKit - Sam' Note (qinqianshan.com)](https://qinqianshan.com/biology/chemical/sim-rd/)

[Getting Started with the RDKit in Python — The RDKit 2023.09.5 documentation](https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors)

[rdkit.Chem.AtomPairs.Pairs module — The RDKit 2023.09.5 documentation](https://www.rdkit.org/docs/source/rdkit.Chem.AtomPairs.Pairs.html)

[RDKit|分子指纹提取、相似性比较及应用_rdkit分子相似性-CSDN博客](https://blog.csdn.net/dreadlesss/article/details/106129597)