In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# Read the Vina-score and Glide-score CSV exports; both file names are
# resolved relative to the notebook's working directory.
vina_df, glide_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "sort_1443_vinascore_top10.csv",
        "sort_1443_glidescore_top10.csv",
    )
)

In [3]:
def standarize_smiles(smiles):
    """Return a canonical, non-isomeric SMILES string for ``smiles``.

    The input is parsed with RDKit and written back with
    ``isomericSmiles=False`` and ``canonical=True``, so stereo annotations
    present in the input are not carried into the result.

    Parameters
    ----------
    smiles : str
        SMILES string to standardize.

    Returns
    -------
    str
        The canonical, non-isomeric SMILES.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (e.g. a NaN cell from a CSV column).
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    # Reject missing / non-text cells up front with a message that names the value,
    # instead of letting RDKit fail with an opaque argument-signature error.
    if not isinstance(smiles, str):
        raise TypeError(
            f"SMILES must be a string, got {type(smiles).__name__}: {smiles!r}"
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than raising;
    # surface the offending input so the bad row can be located.
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    return Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)

In [5]:
# Attach a canonical, non-isomeric SMILES column to each table; later cells
# use it as the key for intersecting and merging the two result sets.
vina_df['standardized_smiles'] = vina_df['SMILES'].map(standarize_smiles)
glide_df['standardized_smiles'] = glide_df['SMILES'].map(standarize_smiles)

In [None]:
# Rename the Glide score column to "score"; a later cell selects "score"
# from both the Vina and the Glide frame. "SMILES" and "title" already have
# the desired names, so they need no entry in the mapping.
glide_df = glide_df.rename(columns={"r_i_glide_gscore": "score"})

# Standardized SMILES strings that occur in both result tables.
common_smiles = set(vina_df["standardized_smiles"]) & set(glide_df["standardized_smiles"])

Unnamed: 0,SMILES,title,score,standardized_smiles
0,CCOC(=O)CNC(=O)[C@H]1O[C@H](O[C@@H]2[C@H](O[C@...,ligprep_chemdiv-stock1_03_400000_split-10_6.sd...,-9.31765,CCOC(=O)CNC(=O)C1OC(OC2C(OC3CCC4(C)C(CCC5(C)C4...
1,CC1(C)[C@H](O[C@@H]2O[C@@H](C(=O)On3nnc4ccccc4...,ligprep_chemdiv-stock1_03_400000_split-10_6.sd...,-9.13766,CC1(C(=O)On2nnc3ccccc32)CCC2(C)CCC3(C)C(=CC(=O...
2,Nc1c2nc[nH]c2c(N)c2[nH]c(=O)c(=O)[nH]c12,ligprep_chemdiv-stock1_01_400000_split-10_6.sd...,-9.11498,Nc1c2nc[nH]c2c(N)c2[nH]c(=O)c(=O)[nH]c12
3,C[C@H]1O[C@@H](OC[C@H]2O[C@H](Oc3c(-c4ccc(O)c(...,ligprep_chemdiv-stock1_03_400000_split-10_6.sd...,-8.97477,CC1OC(OCC2OC(Oc3c(-c4ccc(O)c(O)c4)oc4cc(O)cc(O...
4,COC(=O)[C@@H](CCSC)NC(=O)[C@H]1O[C@@H](O[C@H]2...,ligprep_chemdiv-stock1_03_400000_split-10_6.sd...,-8.93881,COC(=O)C(CCSC)NC(=O)C1OC(OC2C(OC3CCC4(C)C(CCC5...
...,...,...,...,...
180667,Cc1ccc2oc(=O)c(S(=O)(=O)c3ccccc3)cc2c1,ligprep_chemdiv-stock1_01_400000_split-10_8.sd...,-5.85002,Cc1ccc2oc(=O)c(S(=O)(=O)c3ccccc3)cc2c1
180668,CCc1ccc2oc(C(=O)N(Cc3ccc(OC)cc3)[C@H]3CCS(=O)(...,ligprep_chemdiv-stock1_01_400000_split-10_9.sd...,-5.85002,CCc1ccc2oc(C(=O)N(Cc3ccc(OC)cc3)C3CCS(=O)(=O)C...
180669,COc1cc([C@H]2c3c(oc4ccc(F)cc4c3=O)C(=O)N2c2ncc...,ligprep_chemdiv-stock1_01_400000_split-10_3.sd...,-5.85001,COc1cc(C2c3c(oc4ccc(F)cc4c3=O)C(=O)N2c2nccs2)c...
180670,Cc1ccc2nc(CSc3ccccc3NC(=O)Nc3ccc(C)c(F)c3)cc(=...,ligprep_chemdiv-stock1_02_400000_split-10_6.sd...,-5.85001,Cc1ccc2nc(CSc3ccccc3NC(=O)Nc3ccc(C)c(F)c3)cc(=...


In [8]:
# Restrict each table to the rows whose standardized SMILES appears in both.
common_vina = vina_df.loc[lambda frame: frame['standardized_smiles'].isin(common_smiles)]
common_glide = glide_df.loc[lambda frame: frame['standardized_smiles'].isin(common_smiles)]

In [10]:
# Join the Vina and Glide rows on the standardized SMILES and label the
# per-tool columns with "_vina" / "_glide"; the original input SMILES of
# each tool is kept under an "Original_SMILES_*" name.
# NOTE(review): the join key has stereo information removed, so several rows
# on either side may share one key and be paired many-to-many — confirm that
# repeated titles in the result are intended.
merged_df = (
    common_vina[["title", "SMILES", "standardized_smiles", "score"]]
    .merge(
        common_glide[["title", "SMILES", "standardized_smiles", "score"]],
        on="standardized_smiles",
        suffixes=("_vina", "_glide"),
    )
    .rename(
        columns={
            "SMILES_vina": "Original_SMILES_vina",
            "SMILES_glide": "Original_SMILES_glide",
        }
    )
)
merged_df

Unnamed: 0,title_vina,Original_SMILES_vina,standardized_smiles,score_vina,title_glide,Original_SMILES_glide,score_glide
0,HIT103423082,Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc...,Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc...,-10.1,ligprep_chemdiv-stock1_03_400000_split-10_4.sd...,Cc1ccc(-c2onc3ccc(C(=O)Nc4c(NCc5ccccc5)c5ccccc...,-6.10593
1,HIT211256561,O=C(NCc1ccccc1)/C(=C/c1cn(-c2ccccc2)nc1-c1ccc(...,O=C(NCc1ccccc1)C(=Cc1cn(-c2ccccc2)nc1-c1ccc(F)...,-10.0,ligprep_chemdiv-stock1_01_400000_split-10_0.sd...,O=C(NCc1ccccc1)/C(=C/c1cn(-c2ccccc2)nc1-c1ccc(...,-5.99141
2,HIT100711582,CCc1cc(Br)ccc1N1C(=O)C2N=NN(CC(=O)N3N=C(c4cccc...,CCc1cc(Br)ccc1N1C(=O)C2N=NN(CC(=O)N3N=C(c4cccc...,-9.9,ligprep_chemdiv-stock1_04_414266_split-10_7.sd...,CCc1cc(Br)ccc1N1C(=O)[C@H]2N=NN(CC(=O)N3N=C(c4...,-6.34753
3,HIT103266035,Cc1cccc(C)c1N1C(=O)C2N=NN(CC(=O)N3N=C(c4ccc(Br...,Cc1cccc(C)c1N1C(=O)C2N=NN(CC(=O)N3N=C(c4ccc(Br...,-9.7,ligprep_chemdiv-stock1_04_414266_split-10_7.sd...,Cc1cccc(C)c1N1C(=O)[C@@H]2[C@@H](N=NN2CC(=O)N2...,-6.26251
4,HIT105336037,O=C1C2N=NN(CC(=O)N3N=C(c4ccc(Cl)cc4)CC3c3ccccc...,O=C1C2N=NN(CC(=O)N3N=C(c4ccc(Cl)cc4)CC3c3ccccc...,-9.7,ligprep_chemdiv-stock1_04_414266_split-10_8.sd...,O=C1[C@@H]2[C@@H](N=NN2CC(=O)N2N=C(c3ccc(Cl)cc...,-6.69460
...,...,...,...,...,...,...,...
21804,HIT213607696,Cc1ccc(C(=O)N2CCCN(c3nc4ncccc4o3)CC2)cc1N1CCCC1=O,Cc1ccc(C(=O)N2CCCN(c3nc4ncccc4o3)CC2)cc1N1CCCC1=O,-7.7,ligprep_chemdiv-stock1_02_400000_split-10_1.sd...,Cc1ccc(C(=O)N2CCCN(c3nc4ncccc4o3)CC2)cc1N1CCCC1=O,-6.32981
21805,HIT213607722,Cc1ccc2oc(N3CCCN(C(=O)C(C)Oc4ccccc4)CC3)nc2n1,Cc1ccc2oc(N3CCCN(C(=O)C(C)Oc4ccccc4)CC3)nc2n1,-7.7,ligprep_chemdiv-stock1_02_400000_split-10_1.sd...,Cc1ccc2oc(N3CCCN(C(=O)[C@H](C)Oc4ccccc4)CC3)nc2n1,-6.14800
21806,HIT213608824,O=C(c1cccc2[nH]ccc12)N1CCC(Oc2cc(N3CCOCC3)ncn2)C1,O=C(c1cccc2[nH]ccc12)N1CCC(Oc2cc(N3CCOCC3)ncn2)C1,-7.7,ligprep_chemdiv-stock1_02_400000_split-10_2.sd...,O=C(c1cccc2[nH]ccc12)N1CC[C@@H](Oc2cc(N3CCOCC3...,-6.21050
21807,HIT213608940,O=C(Nc1ccc(OC2CCN(C(=O)c3cccc4[nH]ccc34)CC2)nc...,O=C(Nc1ccc(OC2CCN(C(=O)c3cccc4[nH]ccc34)CC2)nc...,-7.7,ligprep_chemdiv-stock1_02_400000_split-10_2.sd...,O=C(Nc1ccc(OC2CCN(C(=O)c3cccc4[nH]ccc34)CC2)nc...,-6.65461


In [46]:
# Persist the merged Vina/Glide table next to the notebook, without the row index.
merged_df.to_csv(path_or_buf='./merge_1443.csv', index=False)

In [47]:
# Write the Vina and Glide titles of the merged rows to one-column text files.
# NOTE(review): the column name is emitted as the first line of each file
# (header is left at its default) — confirm downstream consumers expect that.
merged_df.to_csv('./1443_vinalist.txt', columns=['title_vina'], index=False)
merged_df.to_csv('./1443_glidelist.txt', columns=['title_glide'], index=False)