In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.DataStructs import FingerprintSimilarity
import warnings
warnings.filterwarnings('ignore')
from tqdm.autonotebook import tqdm
from rdkit.Chem import MACCSkeys

In [4]:
data = pd.read_csv('BST.csv',encoding='gb18030')
food_data = pd.read_csv('food_Compound.csv',encoding='gb18030')

In [5]:
data['SMILES'][:8045]

0       Br.Br.CC1C2CCC3C4CC=C5CC(N(C)C)CCC5(C)C4CCC32CN1C
1                                  Br.COC(=O)C1=CCCN(C)C1
2                               C#CCCCCCCCCCCCC(O)CC(O)CO
3                         C#CCCCCCCCCCCCC(O)CC(O)COC(C)=O
4                                                     C#N
                              ...                        
8040    [O-][Cl+2]([O-])OCC1(OC2OC(CO)C(O)C(O)C2O)OC(C...
8041    [O-][Si]([O-])([O-])OC1C(O)C(CO)OC1(CO)OC1OC(C...
8042             [O-][Si]([O-])([O-])OCC(O)C(O)C(O)C(O)CO
8043                c1ccc(N=N/C(=N\Nc2ccccc2)c2ccccc2)cc1
8044                  c1ccc(N=NC(=NNc2ccccc2)c2ccccc2)cc1
Name: SMILES, Length: 8045, dtype: object

In [6]:
food_data

Unnamed: 0,id,public_id,name,SMILES
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...
2,14,FDB000014,Pelargonidin 3-(6''-succinyl-glucoside),[H][C@]1(COC(=O)CCC(O)=O)O[C@@]([H])(OC2=CC3=C...
3,24,FDB000024,Petunidin 3-O-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...
4,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)OC(OC2=C([O+]=C3C=C(O)C=C(O)...
...,...,...,...,...
17736,139981,FDB112148,gamma-Glutamyllysine,NCCCC[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O
17737,139982,FDB112149,gamma-Glutamylproline,N[C@@H](CCC(=O)N1CCC[C@H]1C(O)=O)C(O)=O
17738,139983,FDB112150,gamma-Glutamylserine,N[C@@H](CCC(=O)N[C@@H](CO)C(O)=O)C(O)=O
17739,139984,FDB112151,gamma-Glutamylthreonine,C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O


In [7]:
food_data['SMILES'][10000:14000]

10000       COC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1
10001    COC1=C(OC2OC(C)C(O)C(O)C2O)C=C(O)C2=C1OC(C1=CC...
10002    CC(C)=CCC1=C(O)C2=C(OC(CC2=O)C2=CC=C(O)C=C2)C2...
10003       CC(C)=CCC1=C(O)C2=C(OC(CC2=O)C2=CC=CC=C2)C=C1O
10004    CC(=O)OC1C(O)C(O)C(CO)OC1C1=C(O)C=C(O)C2=C1OC(...
                               ...                        
13995          O=C1OC(=O)C2=C(C=CC3=C2C1=CC=C3)C1=CC=CC=C1
13996     OC1=CC=C(C=C1)C1=C2C(=O)OC(=O)C3=CC=CC(C=C1)=C23
13997                          CC1CC(=O)C2=C(C=CC=C2O)C1=O
13998    COC1=CC(=C\C=C\C=C2C=C(OC)C(=O)C(OC)=C2)C=C(OC...
13999    COC(=O)C1(O)CC(=O)C2=C1C(=CC1=C2OC2=C(C(O)=C3C...
Name: SMILES, Length: 4000, dtype: object

In [12]:
# 定义数据集1的SMILES
dataset1_smiles = data['SMILES'][:8045]

# 定义数据集2的SMILES
dataset2_smiles = food_data['SMILES'][16000:]

# 创建结果DataFrame
AD_results = pd.DataFrame(columns=['SMILES', 'InDomain'])


# 计算数据集2中每个分子与数据集1的相似度，并判断是否在应用域内
for smiles in tqdm(dataset2_smiles):
    
    try:
        
        mol2 = Chem.MolFromSmiles(smiles)
        # 初始化应用域标志
        in_domain = False

        # 计算与数据集1中每个分子的相似度
        for ref_smiles in dataset1_smiles:
            # 转换为RDKit分子对象
            mol1 = Chem.MolFromSmiles(ref_smiles)

            # 计算Tanimoto相似度
            similarity = FingerprintSimilarity(Chem.RDKFingerprint(mol1), Chem.RDKFingerprint(mol2),)
            #print(f"Tanimoto similarity between {smiles} and {ref_smiles}: {similarity}")
            # 判断是否在应用域内
            if similarity > 0.95: 
                in_domain = True

        # 添加结果到DataFrame
        AD_results = AD_results._append({'SMILES': smiles, 'InDomain': in_domain}, ignore_index=True)
    except:
        AD_results = AD_results._append({'SMILES': smiles, 'InDomain': in_domain}, ignore_index=True)

 35%|██████████████████████████▍                                                | 615/1741 [2:38:27<5:11:57, 16.62s/it][14:49:49] Explicit valence for atom # 0 P, 11, is greater than permitted
 36%|██████████████████████████▋                                                | 620/1741 [2:39:39<4:19:18, 13.88s/it][14:51:01] Explicit valence for atom # 7 N, 4, is greater than permitted
 50%|█████████████████████████████████████▊                                     | 879/1741 [4:02:40<4:19:01, 18.03s/it][16:14:02] Explicit valence for atom # 28 N, 4, is greater than permitted
100%|████████████████████████████████████████████████████████████████████████████| 1741/1741 [7:07:58<00:00, 14.75s/it]


In [14]:
AD_results

Unnamed: 0,SMILES,InDomain
0,C[C@@H]1N(C)CCC2=CC(O)=C(O)C=C12,False
1,OCCS(O)(=O)=O,False
2,CC(O)C1=[N+](CC2=CN=C(C)N=C2N)C(C)=C(CCOP(O)(=...,False
3,CC(CN)C(O)=O,False
4,CN1CCC2=C(C=C3OCOC3=C2)C(=O)CC2=C(C1)C1=C(OCO1...,True
...,...,...
1736,NCCCC[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O,False
1737,N[C@@H](CCC(=O)N1CCC[C@H]1C(O)=O)C(O)=O,False
1738,N[C@@H](CCC(=O)N[C@@H](CO)C(O)=O)C(O)=O,False
1739,C[C@@H](O)[C@H](NC(=O)CC[C@H](N)C(O)=O)C(O)=O,False


In [17]:
# 计算True和False值的数量
value_counts = AD_results['InDomain'].value_counts()

# 打印结果
print(value_counts)

False    1671
True       70
Name: InDomain, dtype: int64


In [16]:
AD_results.to_csv('AD_results_final_16000_.csv')