In [1]:
import spyrmsd
from spyrmsd.rmsd import rmsdwrapper
from spyrmsd import io, rmsd
from spyrmsd.molecule import Molecule
import numpy as np
import csv
import sys
import re
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import defaultdict
import pandas as pd

In [2]:
df_map = pd.read_csv('./merge_1443.csv')
''.join(df_map.loc[df_map['standardized_smiles'] == 'Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc5oc4=O)cc23)cc1', 'title_vina'])

'HIT103423082'

In [None]:
vina_suppl = Chem.SDMolSupplier('1443_vina.sdf', sanitize=False, removeHs=True)
glide_suppl = Chem.SDMolSupplier('1443_glide.sdf', removeHs=True)


In [4]:
vina_mols = {}
suppl_vina = Chem.SDMolSupplier('1443_vina.sdf', sanitize=False, removeHs=True)
for mol in suppl_vina:
    if mol is not None:
        name = mol.GetProp('_Name')
        vina_mols[name] = mol
print(f"Vina SDF文件包含 {len(vina_mols)} 个分子")

# 3. 读取Glide SDF文件
glide_mols = {}
suppl_glide = Chem.SDMolSupplier('1443_glide.sdf', removeHs=True)
for mol in suppl_glide:
    if mol is not None:
        name = mol.GetProp('_Name')
        glide_mols[name] = mol
print(f"Glide SDF文件包含 {len(glide_mols)} 个分子")

# 4. 创建匹配字典
matched_molecules = {}
missing_molecules = []

# 遍历CSV中的每一行
for index, row in df_map.iterrows():
    std_smiles = row['standardized_smiles']
    title_vina = row['title_vina']+'.pdbqt'
    title_glide = row['title_glide']
    
    # 检查两个分子是否存在
    if title_vina in vina_mols and title_glide in glide_mols:
        matched_molecules[std_smiles] = {
            'title_vina': title_vina,
            'title_glide': title_glide,
            'mol_vina': vina_mols[title_vina],
            'mol_glide': glide_mols[title_glide]
        }
    else:
        missing_info = []
        if title_vina not in vina_mols:
            missing_info.append(f"Vina分子: {title_vina}")
        if title_glide not in glide_mols:
            missing_info.append(f"Glide分子: {title_glide}")
        missing_molecules.append({
            'smiles': std_smiles,
            'missing': ", ".join(missing_info)
        })

# 5. 打印结果统计
matched_molecules


[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 1 2
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 1 0
[23:30:54] Unexpected error hit on line 72699
[23:30:54] ERROR: moving to the beginning of the next molecule
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 11 10
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 1 0
[23:30:54] Unexpected error hit on line 110209
[23:30:54] ERROR: moving to the beginning of the next molecule
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 1 0
[23:30:54] Both bonds on one end of an atropisomer are on the same side - atoms are: 11 3
[23:30:54] Both bonds on one end of an atropisomer are on the same side - atoms are: 20 22
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 1 2
[23:30:54] The 2 defining bonds for an atropisomer are co-planar - atoms are: 23 22
[23:30:54] The 2 defin

Vina SDF文件包含 20272 个分子
Glide SDF文件包含 20280 个分子


{'Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc5oc4=O)cc23)cc1': {'title_vina': 'HIT103423082.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_03_400000_split-10_4.sdf:2573',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440c9893f0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc26c0>},
 'O=C(NCc1ccccc1)C(=Cc1cn(-c2ccccc2)nc1-c1ccc(F)cc1)NC(=O)c1ccc(Cl)cc1Cl': {'title_vina': 'HIT211256561.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_01_400000_split-10_0.sdf:14455',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440cb725e0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc27a0>},
 'CCc1cc(Br)ccc1N1C(=O)C2N=NN(CC(=O)N3N=C(c4ccccc4)CC3c3ccccc3)C2C1=O': {'title_vina': 'HIT100711582.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_04_414266_split-10_7.sdf:13746',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440c856ab0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc2880>},
 'Cc1cccc(C)c1N1C(=O)C2N=NN(CC(=O)N3N=C(c4ccc(Br)cc4)CC3c3ccccc3)C2C1=O': {'title_vina': 'HIT103266035.pdbqt',
  

In [5]:
for cid in matched_molecules:
    ref = Molecule.from_rdkit(matched_molecules[cid]['mol_vina'])
    coords_ref = ref.coordinates
    print(coords_ref.shape[0])
    anum_ref = ref.atomicnums
    adj_ref = ref.adjacency_matrix
    mol = Molecule.from_rdkit(matched_molecules[cid]['mol_glide'])
    coords = mol.coordinates
    print(coords.shape[0])
    anum = mol.atomicnums
    adj = mol.adjacency_matrix
    if coords_ref.shape[0] == coords.shape[0]:
        RMSD = rmsd.symmrmsd(coords_ref, coords, anum_ref, anum, adj_ref, adj)
        matched_molecules[cid]['RMSD'] = RMSD
    else:
        print(cid)
matched_molecules

38
38
41
41
39
39
39
39
38
38
32
32
41
41
35
35
38
38
47
47
32
32
40
40
33
33
40
40
37
37
39
39
41
41
38
38
49
49
36
36
42
42
40
40
37
37
42
42
48
48
35
35
39
39
41
41
32
32
44
44
36
36
36
36
36
36
37
37
44
44
32
32
38
38
39
39
35
35
41
41
32
32
35
35
50
50
43
43
35
35
34
34
41
41
59
59
40
40
38
38
39
39
36
36
34
34
30
30
36
36
30
30
39
39
40
40
39
39
34
34
38
38
39
39
38
38
38
38
36
36
34
34
43
43
37
37
31
31
38
38
34
34
31
31
41
41
31
31
42
42
33
33
45
45
30
30
43
43
35
35
36
36
32
32
42
42
47
47
33
33
33
33
40
40
42
42
39
39
41
41
40
40
41
41
33
33
37
37
39
39
46
46
33
33
32
32
42
42
35
35
36
36
40
40
50
50
36
36
33
33
48
48
43
43
50
50
39
39
46
46
39
39
35
35
45
45
40
40
34
34
29
29
35
35
42
42
38
38
32
32
42
42
34
34
33
33
42
42
38
38
37
37
35
35
38
38
41
41
36
36
39
39
43
43
37
37
38
38
31
31
45
45
44
44
32
32
41
41
33
33
34
34
29
29
41
41
41
41
32
32
36
36
37
37
41
41
34
34
33
33
35
35
41
41
39
39
32
32
43
43
40
40
42
42
30
30
34
34
44
44
36
36
35
35
39
39
36
36
39
39
34
34
34
3

{'Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc5oc4=O)cc23)cc1': {'title_vina': 'HIT103423082.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_03_400000_split-10_4.sdf:2573',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440c9893f0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc26c0>,
  'RMSD': 8.501543383487247},
 'O=C(NCc1ccccc1)C(=Cc1cn(-c2ccccc2)nc1-c1ccc(F)cc1)NC(=O)c1ccc(Cl)cc1Cl': {'title_vina': 'HIT211256561.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_01_400000_split-10_0.sdf:14455',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440cb725e0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc27a0>,
  'RMSD': 5.456327920685192},
 'CCc1cc(Br)ccc1N1C(=O)C2N=NN(CC(=O)N3N=C(c4ccccc4)CC3c3ccccc3)C2C1=O': {'title_vina': 'HIT100711582.pdbqt',
  'title_glide': 'ligprep_chemdiv-stock1_04_414266_split-10_7.sdf:13746',
  'mol_vina': <rdkit.Chem.rdchem.Mol at 0x2440c856ab0>,
  'mol_glide': <rdkit.Chem.rdchem.Mol at 0x2440cbc2880>,
  'RMSD': 3.1515643017722366},
 'Cc1cccc(C)c1N1C(=O)C2N=

In [7]:
output_data = []
for cid, data in matched_molecules.items():
    output_data.append({
            'standardized_smiles': cid,
            'vina_title': data['title_vina'],
            'glide_title': data['title_glide'],
            'rmsd': data['RMSD']
        })
df = pd.DataFrame(output_data, columns = ['standardized_smiles', 'vina_title', 'glide_title', 'rmsd'])
df.sort_values(by = 'rmsd')
csv_file = './1443_rmsd.csv'
df.sort_values(by = 'rmsd').copy().to_csv(csv_file, index=False)

In [8]:
rmsd_mol = df[df['rmsd']<2.51].copy()
rmsd_mol.to_csv("./1443_rmsd_select.csv", index = False)

In [9]:
rmsd_mol['vina_title'].to_csv("./1443_rmsd_vinalist.txt", index = False, header = False)
rmsd_mol['glide_title'].to_csv("./1443_rmsd_glidelist.txt", index = False, header = False)