In [1]:
import pandas as pd
from scipy.spatial import distance
import rdkit as rd
from rdkit.Chem import Descriptors

In [2]:
def smiles_to_desc(s):
    """Compute a four-element descriptor vector for one molecule.

    Parameters
    ----------
    s : str
        SMILES string, parsed with ``rd.Chem.MolFromSmiles``.

    Returns
    -------
    list
        ``[MolWt, NumRotatableBonds, MolLogP, qed]`` in exactly that order,
        each value taken from the matching ``rdkit.Chem.Descriptors`` function.

    Raises
    ------
    ValueError
        If RDKit cannot build a molecule from ``s``.
    """
    m = rd.Chem.MolFromSmiles(s)
    if m is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; fail here with the offending input instead of letting the
        # descriptor calls below choke on None.
        raise ValueError(f"could not parse SMILES string: {s!r}")
    return [
        Descriptors.MolWt(m),
        Descriptors.NumRotatableBonds(m),
        Descriptors.MolLogP(m),  # less reliable
        Descriptors.qed(m),
    ]

In [15]:
def desc_to_smiles(d, p):
    """Return the ``'ligands'`` entry whose descriptor row lies closest to ``p``.

    Parameters
    ----------
    d : pandas.DataFrame
        Lookup table. Must contain a ``'ligands'`` column; every column after
        the first is treated as a numeric descriptor.
    p : sequence of float
        Query point with one value per descriptor column, in column order.

    Returns
    -------
    The value stored in ``d['ligands']`` for the row with the smallest
    Euclidean distance to ``p`` (the first such row on ties).

    Raises
    ------
    ValueError
        If ``d`` has no rows, or if ``p`` does not have one value per
        descriptor column.

    Notes
    -----
    Distances are computed on the raw column values with no scaling, so
    columns with a large numeric range dominate the result.
    """
    if d.shape[0] == 0:
        raise ValueError("desc_to_smiles() requires at least one row in d")

    # Work purely by position: the index of `d` may not be 0..n-1 (for
    # example after filtering), so labels must not be mixed with positions.
    features = d.iloc[:, 1:].to_numpy(dtype=float)
    target = pd.Series(p, dtype=float).to_numpy().reshape(1, -1)

    # One vectorised call instead of a Python-level loop over rows.
    dists = pd.Series(distance.cdist(features, target, metric='euclidean')[:, 0])

    # Series.argmin() yields a position (NaN distances are skipped), so it
    # has to be paired with .iloc rather than label-based [] lookup.
    return d['ligands'].iloc[dists.argmin()]

In [16]:
# Table of SMILES strings, stored in a single 'ligands' column.
data = pd.DataFrame(
    {
        "ligands": [
            'CC(N(C)C)=O',
            'CCCC#N',
            'CCCCOC(C)=O',
            'CC1=CC=C(C)C=C1',
            'CC(C)C1=CC(C(C)C)=C(C(C(C)C)=C1)C2=C(P(C3CCCCC3)C4CCCCC4)C(OC)=CC=C2OC',
            'CC(C)(C)P(C1=CC=CC=C1)C(C)(C)C',
            'CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C3=CC=CC=C3',
            'P(C1CCCCC1)(C2CCCCC2)C3CCCCC3',
            'P(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3',
            'CC(C1=C(C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C(C)C)=CC(C(C)C)=C1)C',
            'P(C1=CC=CO1)(C2=CC=CO2)C3=CC=CO3',
            'CP(C1=CC=CC=C1)C2=CC=CC=C2',
            'CC(OC1=C(P(C2CCCCC2)C3CCCCC3)C(OC(C)C)=CC=C1)C',
            'FC(F)(F)C1=CC(P(C2=C(C3=C(C(C)C)C=C(C(C)C)C=C3C(C)C)C(OC)=CC=C2OC)C4=CC(C(F)(F)F)=CC(C(F)(F)F)=C4)=CC(C(F)(F)F)=C1',
            'C[C@]1(O2)O[C@](C[C@]2(C)P3C4=CC=CC=C4)(C)O[C@]3(C)C1',
            'CP(C)C1=CC=CC=C1',
        ]
    }
)

# One descriptor row per SMILES string, in the same order as `data`.
desc = pd.DataFrame([smiles_to_desc(smiles) for smiles in data['ligands']])

# Place the SMILES column beside its descriptors, then label every descriptor
# column as "<descriptor>_<name of the SMILES column>".
full_data = pd.concat([data, desc], axis=1)
full_data.columns = [data.columns[0]] + [
    f'{prefix}_{data.columns[0]}'
    for prefix in ('mol_weight', 'rotatable', 'logp', 'qed')
]

In [17]:
# Show the descriptor vector for the first SMILES string in the `data` table.
smiles_to_desc('CC(N(C)C)=O')

[87.12199999999999, 0, 0.09450000000000003, 0.40990688283686294]

In [18]:
# Round-trip check: `point` repeats the values displayed above for
# smiles_to_desc('CC(N(C)C)=O'), in the order
# [mol weight, rotatable bonds, logP, QED].
point = [87.12199999999999, 0, 0.09450000000000003, 0.40990688283686294]
# Look up the table row nearest to that point.
best_smile = desc_to_smiles(full_data, point)
best_smile

'CC(N(C)C)=O'

In [19]:
# Display the combined table: the SMILES column followed by its four descriptor columns.
full_data

Unnamed: 0,ligands,mol_weight_ligands,rotatable_ligands,logp_ligands,qed_ligands
0,CC(N(C)C)=O,87.122,0,0.0945,0.409907
1,CCCC#N,69.107,1,1.31008,0.45507
2,CCCCOC(C)=O,116.16,3,1.3496,0.410684
3,CC1=CC=C(C)C=C1,106.168,0,2.30344,0.475758
4,CC(C)C1=CC(C(C)C)=C(C(C(C)C)=C1)C2=C(P(C3CCCCC...,536.781,9,10.5136,0.296776
5,CC(C)(C)P(C1=CC=CC=C1)C(C)(C)C,222.312,1,4.3908,0.618066
6,CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C...,404.538,5,5.7168,0.502186
7,P(C1CCCCC1)(C2CCCCC2)C3CCCCC3,280.436,3,6.4664,0.519846
8,P(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3,262.292,3,3.4448,0.634212
9,CC(C1=C(C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C(C)C...,476.729,7,10.4964,0.348576
