# Initialization

In [1]:
import math
import sys
sys.path.append('../')
from utils import array_in_nd_array, remove_space, r2pcorr
import numpy as np
import pandas as pd
import darkchem

Using TensorFlow backend.


Load model:

In [2]:
model = darkchem.utils.load_model('../../Final Trained DarkChem Network Weights/N7b_[M+H]')

Load training data:

In [3]:
x = np.load('../../Training Data - Molecules/combined_[M+H]_smiles.npy')
y = np.load('../../Training Data - Molecules/combined_[M+H]_labels.npy') # must have the same number of columns as the data the network was trained on

In [4]:
#tranform nd.array to list
x = x.tolist()

Generate latent space representations ($x$) from training data:

In [5]:
#x_latent = model.encoder.predict(x)

Generate property predictions ($y$):

In [6]:
#y_pred = model.predictor.predict(x_latent)

Predict SMILES outputs:

In [7]:
#x_pred = model.decoder.predict(x_latent)

In [8]:
# imput reactants and products
smiles = {'Reactants': ['CC(=O)C', 'O=C1CCCCC1', 'O=C1CCCC1C', 'O=C1CCCC1C', 'CC(=O)CC(=O)C', 'CC(=O)CC(=O)C', 'COCC(=O)C',
                        'CC(=O)c1ccccc1','CC(=O)C', 'CCC(=O)C', 'CCCC(=O)C', 'CCC(=O)CC', 'CC(=O)CC(=O)C','CCCC(=O)C(=O)C',
                        'CCCC(=O)CC', 'O=C1CCCCC1', 'O=C1CCCC1C', 'COCC(=O)C', 'CC(=O)c1ccccc1','CC(=O)CCC(=O)C'], 
        'Products': ['CCC(=O)C', 'CC1CCCCC1=O', 'CC1CCC(C1=O)C', 'O=C1CCCC1(C)C', 'CC(C(=O)C)C(=O)C', 'CC(=O)C(C(=O)C)(C)C',
                     'COC(C(=O)CC)C','CCC(=O)c1ccccc1','CC(O)C', 'CCC(O)C', 'CCCC(O)C', 'CCC(CC)O', 'CC(CC(O)C)O',
                    'CCCC(C(O)C)O','CCCC(CC)O', 'OC1CCCCC1', 'OC1CCCC1C', 'COCC(O)C', 'CC(c1ccccc1)O','CC(CCC(O)C)O']}

# removed the original fifth and sixth reactions due to the products don't belong to the combined smiles strings file

In [9]:
#transform to DataFrame type
smiles = pd.DataFrame(smiles)
smiles.head()

Unnamed: 0,Reactants,Products
0,CC(=O)C,CCC(=O)C
1,O=C1CCCCC1,CC1CCCCC1=O
2,O=C1CCCC1C,CC1CCC(C1=O)C
3,O=C1CCCC1C,O=C1CCCC1(C)C
4,CC(=O)CC(=O)C,CC(C(=O)C)C(=O)C


In [10]:
#transform smiles string to vectors
rvec = [darkchem.utils.struct2vec(reactant).astype(int) for reactant in smiles['Reactants']]
pvec = [darkchem.utils.struct2vec(product).astype(int) for product in smiles['Products']]
#tranform list to nd.array
rvec = np.array(rvec)
pvec = np.array(pvec)
#transform numpy array to list  
#rvec = rvec.tolist() 
#pvec = pvec.tolist()
#smiles['rxnvector'] = [smilesdf.pvec[i] - smilesdf.rvec[i] for i in range(len(smilesdf))]

In [11]:
#judge whether the reactants are contained in the training set or not
[array_in_nd_array(vec, x) for vec in rvec]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [12]:
#judge whether the products are included in the training set or not
[array_in_nd_array(vec, x) for vec in pvec]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [13]:
#find the specific indices of reactants and products in the training set
#rindex = [x.index(rvec[i]) for i in range(len(rvec))]
#pindex = [x.index(pvec[j]) for j in range(len(pvec))]

In [14]:
#extract the vectors of reactants and products from training set
#rlaten = [x[rindex[i]] for i in range(len(rindex))]
#platen = [x[pindex[j]] for j in range(len(pindex))]
#tranform to np.array
#rlaten = np.array(rlaten)
#platen = np.array(platen)

In [15]:
#generate the latent space representation for reactants and products
r_latent = model.encoder.predict(rvec)
p_latent = model.encoder.predict(pvec)

In [16]:
# transform the path,pvec and rvec(latent vectors) lists to dataframe
#pathdf = pd.DataFrame(path)
rvecdf = pd.DataFrame(r_latent)
pvecdf = pd.DataFrame(p_latent)

In [17]:
# calculate the correlation between the latent reactant vectors to product vectors
r2p = r2pcorr(rvecdf,pvecdf)
r2p

Unnamed: 0,Correlation
0,0.978819
1,0.957353
2,0.947682
3,0.981697
4,0.945772
5,0.914027
6,0.944949
7,0.918495
8,0.989565
9,0.985863


In [18]:
#calculate the average value of 
r2p.mean()

Correlation    0.963098
dtype: float64

The correlation values are very high, one reason is that the chemical reactions encoded in the notebook normally are single reactions(one reactant to one product), if there are two reactants within one reaction, the value will be lower(mean value will be 0.7), so the 

In [19]:
#save the vectors for reactants and products
#rct8 = np.save('rct8.npy',rvec)
#prd8 = np.save('prd8.npy',pvec)

In [20]:
# Here I just used the vectors transformed from the function 
# 'struct2vec' to generate the corresponding latent space representations
rls = model.encoder.predict(np.array(rvec))
pls = model.encoder.predict(np.array(pvec))

In [21]:
r_latent[0] == rls[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True]

From here, you can see r_latent list is the same as rls list. 

In [22]:
#predict the smiles strings using latent space vectors
#x_pred = model.decoder.predict(r_latent)

In [23]:
#transform latent space representations to vectors
#trs = darkchem.utils.beamsearch(x_pred, k=1)

In [24]:
# transform the type to [-1,100] for the following calculation of structure
#trs = trs.reshape(-1,100)

In [25]:
#transform the vectors to structure
#v2s = [darkchem.utils.vec2struct(trs[i]) for i in range(len(trs))]

In [26]:
#v2s
# something wrong with the indent, but here we can know the whole process is right.
#That's to say the data extract from combined smiles strings.npy file equals to
# the data we get from the function 'struct2vecS'