#### 原始的fragment2vec
- 选择parallel，best model
- 查看碎片之间的相似性
- 查看双键、芳香性等抽象度更高的结构信息

In [2]:
# Helper libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.metrics import pairwise_distances

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.ipython_useSVG = True

In [3]:
def cal_distance(x, y, metric='euclidean'):
    if type(x) == pd.core.series.Series:
        x = x.values.reshape(1, -1)
    if type(y) == pd.core.series.Series:
        y = y.values.reshape(1, -1)
    return pairwise_distances(x, y, metric=metric)

In [4]:
def print_closest_words(x_embedding, x_query, n=5, add_vec=None):
    x = x_embedding.loc[x_query].values.reshape(1, -1).copy()
    # print('x is: {}'.format(x))
    if add_vec is not None:
        x += add_vec
        # print('x + add_vec is: {}'.format(x))
    dists = cal_distance(x=x_embedding.values, y=x)     # compute distances to all words
    lst = sorted(enumerate(dists), key=lambda x: x[1]) # sort by distance
    # print(lst[:100])
    all_smiles = []
    all_dis = [] 
    if add_vec is not None:
        for idx, difference in lst[0:n]:
            _smiles = x_embedding.iloc[idx,:].name
            all_smiles.append(_smiles)
            all_dis.append(difference[0])
            # print(_smiles, difference)
    else:
        for idx, difference in lst[1:n+1]:   # take the top n
            _smiles = x_embedding.iloc[idx,:].name
            all_smiles.append(_smiles)
            all_dis.append(difference[0])
            # print(_smiles, difference)
    return {'smiles': all_smiles, 'dis': all_dis}

In [5]:
def get_minus_result(x_embedding, x, y):
    x = x_embedding.loc[x].values.reshape(1, -1)
    y = x_embedding.loc[y].values.reshape(1, -1)
    return x-y

In [6]:
def draw_mol_by_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    size = (200, 200)
    return Draw.MolToImage(mol, size=size)

In [7]:
def draw_multiple_mol(smiles_list, mols_per_row=4, file_path=None, legends=None):
    mols = []
    for i in smiles_list:
        mols.append(Chem.MolFromSmiles(i))
    mols_per_row = min(len(smiles_list), mols_per_row)
    if legends is None:
        img=Draw.MolsToGridImage(mols, molsPerRow=mols_per_row, subImgSize=(220, 120), useSVG=True)
    else:
        img=Draw.MolsToGridImage(mols, molsPerRow=mols_per_row, subImgSize=(220, 120), useSVG=True, legends=legends)
    if file_path:
        with open(file_path, 'w') as f_handle:
            f_handle.write(img.data)
    return img

In [8]:
frag2vec = pd.read_csv('./model_parallel/frag2vec_ws_4_minn_1_maxn_2.csv', index_col=0)
frag2vec.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
fragment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC,0.102366,0.236778,-0.212418,0.097849,0.353949,0.066136,0.25152,0.141767,-0.135168,0.230127,...,0.707791,-0.000694,-0.055997,0.39924,-0.149455,0.417486,0.369599,-0.189464,-0.133221,0.05911
CN,0.13585,0.226165,-0.434924,0.303119,-0.014223,0.132242,0.368385,0.068198,-0.059371,0.28751,...,0.617289,-0.11381,0.043899,0.385367,0.093859,0.156435,0.381115,-0.142981,0.027194,-0.063204


In [3]:
frag2vec.shape

(505, 100)

In [54]:
# demo_frags = np.random.choice(frag2vec.index.to_list(), 5, replace=False)
demo_frags = ['C1=CCNN=C1', 'C1=COCO1', 'C1=CCC1', 'C1=CN=N[SH]=C1', 'OBr']
demo_frags

['C1=CCNN=C1', 'C1=COCO1', 'C1=CCC1', 'C1=CN=N[SH]=C1', 'OBr']

In [55]:
frag2nn = {}
for frag in demo_frags:
    frag2nn[frag] = print_closest_words(x_embedding=frag2vec, x_query=frag, n=7)

In [56]:
nn1 = frag2nn['C1=COCO1']
nn1

{'smiles': ['C1=COCCO1',
  'C1=COCCCO1',
  'C1=CO1',
  'C1=COCOC1',
  'C1=COCCOC1',
  'C1=CCOCOC1',
  'C1=NCCCO1'],
 'dis': [1.3378840407327308,
  1.9484125075791836,
  2.2607932819501433,
  2.4655408884687557,
  2.5521667090063422,
  2.586532753857119,
  2.670682410744231]}

In [57]:
for k, frag in enumerate(demo_frags):
    nn = frag2nn[frag]
    draw_multiple_mol(smiles_list=[frag] + nn['smiles'],  
                   legends=[frag] + [nn['smiles'][i] + '(' + str('{:.2f}'.format(nn['dis'][i])) + ')' for i in range(7)],
                   file_path='./images/original_frag_nn_{}.svg'.format(k))

#### test double bond

In [42]:
double_bond1 = get_minus_result(x_embedding=frag2vec, x='C=N', y='CN')
# double_bond

In [43]:
double_bond2 = get_minus_result(x_embedding=frag2vec, x='C=C', y='CC')

In [44]:
double_bond3 = get_minus_result(x_embedding=frag2vec, x='C1CC1', y='C1=CC1')

In [45]:
double_bond_mean = (double_bond1 + double_bond2 + double_bond3) / 3
double_bond_mean

array([[-0.15338857, -0.69446378, -0.11758295,  0.10127928, -0.28230951,
         0.35719175, -0.25445741,  0.15018814,  0.04232586, -0.07113249,
         0.17409453,  0.21518471,  0.18697108,  0.38729877, -0.55214642,
        -0.27350847, -0.08156174, -0.14223799, -0.76886205,  0.3164752 ,
         0.09915667, -0.52885189,  0.11038432,  0.09531353,  0.12560121,
        -0.22811896,  0.42231693, -0.38499789,  0.02158625,  0.82479176,
        -0.32096885,  0.7461179 ,  0.19736769,  0.00275624, -0.68191759,
         0.07402038, -0.26060044, -0.1914505 , -0.09077727,  0.34305247,
         0.03109213, -0.22195799, -0.17001751, -0.40804091,  0.71335634,
        -0.00529603, -0.36297107,  0.49758943,  0.02510355,  0.79819195,
         0.01410074,  0.15133423,  0.13739951, -0.56637301,  0.23606619,
        -0.64870068,  0.38990146, -0.12237843,  0.13921537,  0.18282258,
        -0.08468334, -0.37729357, -0.13456128, -0.34152181,  0.22029255,
        -0.03813214, -0.51485686, -0.12119216, -0.0

In [48]:
single_bond_frag = ['CO', 'CS', 'OS', 'C1CC1']
single_bond2nn = {}
for frag in single_bond_frag:
    single_bond2nn[frag] = print_closest_words(x_embedding=frag2vec, x_query=frag, n=5, add_vec=double_bond_mean)

In [49]:
for frag in single_bond_frag:
    print('>> {} <<'.format(frag))
    print(single_bond2nn[frag])

>> CO <<
{'smiles': ['CO', 'C1=CC=COC=C1', 'C1=COCC=N1', 'C1CCOCOC1', 'C1=CCCOC=C1'], 'dis': [3.7123331725173134, 4.346668802304423, 4.383921311850287, 4.437078998600673, 4.441982816756549]}
>> CS <<
{'smiles': ['CS', 'C=S', 'O=S', 'NS', 'N'], 'dis': [3.712333172517321, 4.7514898821537175, 6.283862917529502, 6.401635232012894, 6.487376394400499]}
>> OS <<
{'smiles': ['OS', 'O=S', 'NS', 'C=S', 'CS'], 'dis': [3.7123331725173196, 6.859237366825346, 7.50933157715918, 7.798521819459647, 8.630502871268495]}
>> C1CC1 <<
{'smiles': ['C1CC1', 'C1=CCC=NC=C1', 'C1=CC=CC=C1', 'C1CCCC1', 'C1CCCCCC1'], 'dis': [3.712333172517313, 4.337495471436628, 4.468599835862893, 4.471456104532888, 4.557169417506268]}


In [19]:
print_closest_words(x_embedding=frag2vec, x_query='CN')

{'smiles': ['CC', 'C1=COC=C1', 'C1COCCO1', 'C1CCOC1', 'C1=CCC=NC=C1'],
 'dis': [2.0746646357384684,
  2.5587250731641857,
  2.558931641309026,
  2.5946924444424466,
  2.5966570193977443]}

#### test triple bond

In [65]:
tri_bond = get_minus_result(x_embedding=frag2vec, x='C#C', y='CC')

In [83]:
cn_add_tri_bond_nn = print_closest_words(x_embedding=frag2vec, x_query='CN', add_vec=tri_bond)
cn_add_tri_bond_nn

{'smiles': ['C#C', 'C1=CON=C1', 'C#N', 'CN', 'C1COC1'],
 'dis': [2.0746646357384675,
  3.851598374765036,
  4.171327245422764,
  4.2089828415025385,
  4.2588016118418235]}

In [84]:
print_closest_words(x_embedding=frag2vec, x_query='CN')

{'smiles': ['CC', 'C1=COC=C1', 'C1COCCO1', 'C1CCOC1', 'C1=CCC=NC=C1'],
 'dis': [2.0746646357384684,
  2.5587250731641857,
  2.558931641309026,
  2.5946924444424466,
  2.5966570193977443]}

#### test aromaticity（芳香性）

In [85]:
arom = get_minus_result(x_embedding=frag2vec, x='C1=CC=CC=C1', y='C1CCCCC1')

In [87]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCCC1')

{'smiles': ['C1CCCCC1', 'C1CCCCCC1', 'C1CCOCC1', 'C1=CCCC=CC1', 'C1CCCOCC1'],
 'dis': [1.5055548090861803,
  1.7683505233222776,
  2.0486014449126917,
  2.1874284871082814,
  2.189285527644637]}

In [86]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCCC1', add_vec=arom)

{'smiles': ['C1=CC=CC=C1',
  'C1=CC=COC=C1',
  'C1=CC=CNC=C1',
  'C1=CCC=NC=C1',
  'C1=CC=NC=C1'],
 'dis': [1.5055548090861814,
  2.4786185001017365,
  2.502898718438676,
  2.5621038970574874,
  2.6199867159118995]}

In [93]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCNC1')

{'smiles': ['C1CCNCC1', 'C1COCCN1', 'C1CNCCN1', 'C1CCNCNC1', 'C1CCOCNC1'],
 'dis': [1.6443838646540268,
  2.170819843259323,
  2.2132769893673925,
  2.2540547833444777,
  2.2761720717561515]}

In [92]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCNC1', add_vec=arom)

{'smiles': ['C1=CC=CC=C1',
  'C1CCNC1',
  'C1=CC=CNC=C1',
  'C1=CNC=CNC1',
  'C1=CC=COC=C1'],
 'dis': [2.7806917290748765,
  3.0839756828459097,
  3.110060697950161,
  3.3052409571906343,
  3.3485461076081036]}

#### test N atom

In [88]:
n = get_minus_result(x_embedding=frag2vec, x='C1CNC1', y='C1CC1')

In [90]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCCC1')

{'smiles': ['C1CCCCC1', 'C1CCCCCC1', 'C1CCOCC1', 'C1=CCCC=CC1', 'C1CCCOCC1'],
 'dis': [1.5055548090861803,
  1.7683505233222776,
  2.0486014449126917,
  2.1874284871082814,
  2.189285527644637]}

In [91]:
print_closest_words(x_embedding=frag2vec, x_query='C1CCCC1', add_vec=n)

{'smiles': ['C1CNC1', 'C1CNCCOC1', 'C1CCNCC1', 'C1CCOCC1', 'C1CC2CC1CN2'],
 'dis': [2.2564268258018467,
  3.8010950538268298,
  3.861273343279964,
  3.9531810164869623,
  3.9651659706769586]}