# Initialization

In [1]:
import math

import numpy as np
import pandas as pd
import darkchem

Using TensorFlow backend.


Load model:

In [2]:
# Load the trained DarkChem network (N7b, [M+H]) from its saved weights path.
# The path is relative to the notebook's working directory.
model = darkchem.utils.load_model('../../Final Trained DarkChem Network Weights/N7b_[M+H]')

Load training data:

In [3]:
# Training-set SMILES vectors (combined [M+H] data set).
x = np.load('../../Training Data - Molecules/combined_[M+H]_smiles.npy')
# Matching labels; must have the same number of columns as the data the network was trained on.
y = np.load('../../Training Data - Molecules/combined_[M+H]_labels.npy')

In [4]:
# Convert the training array to a nested Python list so that `list.index`
# can be used later to locate individual molecules.
# The isinstance guard keeps this cell idempotent: a bare `x = x.tolist()`
# raises AttributeError when the cell is re-run, because `x` is already a list.
if isinstance(x, np.ndarray):
    x = x.tolist()

Generate latent space representations ($x$) from training data:

In [5]:
#x_latent = model.encoder.predict(x)

Generate property predictions ($y$):

In [6]:
#y_pred = model.predictor.predict(x_latent)

Predict SMILES outputs:

In [7]:
#x_pred = model.decoder.predict(x_latent)

In [8]:
# Helper for checking whether an encoded molecule appears among the combined SMILES vectors
def array_in_nd_array(test, array):
    """
    Report whether `test` equals any entry obtained by iterating over `array`.

    Each entry along the first axis of `array` is compared with `test`
    using `np.array_equal`.
    Returns True as soon as one entry matches.
    Returns False when no entry matches (including when `array` is empty).
    """
    for candidate in array:
        if np.array_equal(candidate, test):
            return True
    return False

## Enolate Alkylation (CC Bond Formation)

Alkylation (in this case, methylation) was explored through the following reactions.

1. Acetone $(CH_{3}C(O)CH_{3}) + OH^{-} + CH_{3}Br \rightarrow CH_{3}C(O)CH_{2}CH_{3} + H_{2}O + Br^{-}$
2. Cyclohexanone $\rightarrow$ 2-methylcyclohexanone
3. 2-Methylcyclopentanone $\rightarrow$ 2,5-dimethylcyclopentanone
4. 2-Methylcyclopentanone $\rightarrow$ 2,2-dimethylcyclopentanone
5. 2,4-Pentanedione $\rightarrow$ 3-methyl-2,4-pentanedione
6. 2,4-Pentanedione $\rightarrow$ 3,3-dimethyl-2,4-pentanedione
7. 1-Methoxy-2-propanone $\rightarrow$ 2-methoxy-3-pentanone
8. Acetophenone $\rightarrow$ 1-phenyl-1-propanone

Note that (3) is the kinetic product, which is more readily accessed than (4) from the same starting material. The reactions are listed in the same order as the entries of the `smiles` dictionary below. Reactions (5)–(8) were numbered 7–10 in the original set; the original fifth and sixth reactions were removed because their products are not present in the combined SMILES file.

In [9]:
# One (reactant SMILES, product SMILES) tuple per methylation reaction.
reaction_pairs = [
    ('CC(=O)C', 'CCC(=O)C'),                       # acetone -> 2-butanone
    ('O=C1CCCCC1', 'CC1CCCCC1=O'),                 # cyclohexanone -> 2-methylcyclohexanone
    ('O=C1CCCC1C', 'CC1CCC(C1=O)C'),               # 2-methylcyclopentanone -> 2,5-dimethylcyclopentanone
    ('O=C1CCCC1C', 'O=C1CCCC1(C)C'),               # 2-methylcyclopentanone -> 2,2-dimethylcyclopentanone
    ('CC(=O)CC(=O)C', 'CC(C(=O)C)C(=O)C'),         # 2,4-pentanedione -> 3-methyl-2,4-pentanedione
    ('CC(=O)CC(=O)C', 'CC(=O)C(C(=O)C)(C)C'),      # 2,4-pentanedione -> 3,3-dimethyl-2,4-pentanedione
    ('COCC(=O)C', 'COC(C(=O)CC)C'),                # 1-methoxy-2-propanone -> 2-methoxy-3-pentanone
    ('CC(=O)c1ccccc1', 'CCC(=O)c1ccccc1'),         # acetophenone -> 1-phenyl-1-propanone
]

# Split the pairs into the two parallel lists used by the rest of the notebook.
smiles = {'reactants': [reactant for reactant, _ in reaction_pairs],
          'products': [product for _, product in reaction_pairs]}

# The original fifth and sixth reactions were removed because their products
# are not present in the combined SMILES strings file.

In [10]:
# Turn the reactant/product collection into a two-column DataFrame
# (one reaction per row) and preview the first rows.
smiles = pd.DataFrame(data=smiles)
smiles.head()

Unnamed: 0,reactants,products
0,CC(=O)C,CCC(=O)C
1,O=C1CCCCC1,CC1CCCCC1=O
2,O=C1CCCC1C,CC1CCC(C1=O)C
3,O=C1CCCC1C,O=C1CCCC1(C)C
4,CC(=O)CC(=O)C,CC(C(=O)C)C(=O)C


In [11]:
# Encode each SMILES string as an integer vector with darkchem's struct2vec.
# The per-molecule vectors are stacked into one array and converted back to
# nested Python lists, the form needed for the list-based lookups in `x`.
rvec = np.array(
    [darkchem.utils.struct2vec(smi).astype(int) for smi in smiles['reactants']]
).tolist()
pvec = np.array(
    [darkchem.utils.struct2vec(smi).astype(int) for smi in smiles['products']]
).tolist()
# Disabled (references `smilesdf`, which is not defined in the visible notebook):
#smiles['rxnvector'] = [smilesdf.pvec[i] - smilesdf.rvec[i] for i in range(len(smilesdf))]

In [12]:
# Check, reactant by reactant, whether its vector occurs in the training set
reactants_in_training = [array_in_nd_array(vec, x) for vec in rvec]
reactants_in_training

[True, True, True, True, True, True, True, True]

In [13]:
# Check, product by product, whether its vector occurs in the training set
products_in_training = [array_in_nd_array(vec, x) for vec in pvec]
products_in_training

[True, True, True, True, True, True, True, True]

In [14]:
# Position of each reactant/product vector in the training list `x`.
# list.index returns the first match and raises ValueError if the vector is absent.
rindex = [x.index(vec) for vec in rvec]
pindex = [x.index(vec) for vec in pvec]

In [15]:
# Pull the reactant and product vectors out of the training set by position
rlaten = [x[idx] for idx in rindex]
platen = [x[idx] for idx in pindex]

In [16]:
# Convert both lists of vectors to NumPy arrays before passing them to the encoder
rlaten, platen = np.array(rlaten), np.array(platen)

In [17]:
# Encode the reactant and product vectors into their latent-space representations
encode = model.encoder.predict
r_latent = encode(rlaten)
p_latent = encode(platen)

In [18]:
# Reaction "path" vectors: product latent vector minus reactant latent vector,
# one per row of the `smiles` table.
path = [p_latent[row] - r_latent[row] for row in range(len(smiles))]

In [19]:
# Pearson correlation coefficients between every pair of path vectors,
# wrapped in a DataFrame for display (row/column i corresponds to reaction i).
cor = pd.DataFrame(np.corrcoef(path))
cor

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.00247,0.163614,0.326212,0.285242,0.185808,0.476659,0.854454
1,0.00247,1.0,0.713666,0.052942,-0.01477,-0.163662,0.207443,0.060268
2,0.163614,0.713666,1.0,0.461192,0.243065,0.022661,0.189457,0.051094
3,0.326212,0.052942,0.461192,1.0,0.345487,0.311165,0.47091,0.211952
4,0.285242,-0.01477,0.243065,0.345487,1.0,0.618727,-0.055559,0.183516
5,0.185808,-0.163662,0.022661,0.311165,0.618727,1.0,0.162696,0.09125
6,0.476659,0.207443,0.189457,0.47091,-0.055559,0.162696,1.0,0.39198
7,0.854454,0.060268,0.051094,0.211952,0.183516,0.09125,0.39198,1.0


In [20]:
#save the vectors for reactants and products
#rct8 = np.save('rct8.npy',rvec)
#prd8 = np.save('prd8.npy',pvec)

In [21]:
# Cross-check: encode the vectors produced directly by `struct2vec`
# (rather than the rows looked up in the training set) to obtain the
# corresponding latent-space representations.
rvec_batch = np.array(rvec)
pvec_batch = np.array(pvec)
rls = model.encoder.predict(rvec_batch)
pls = model.encoder.predict(pvec_batch)

In [22]:
# Latent vector of the first reactant, from encoding the rows looked up in the training set
r_latent[0]

array([-0.13035029,  0.00977203,  0.7587669 , -0.695763  ,  0.7624186 ,
       -0.70816445, -1.6310813 , -1.3992049 ,  0.21437114,  0.16402137,
        0.869545  , -0.03586465,  1.2500587 ,  1.1478314 , -0.06087017,
        1.579063  , -1.2945819 , -0.69367516, -0.74395645,  0.67282236,
        1.0523617 ,  1.0868762 , -1.3348655 , -0.03884706,  0.67275655,
       -0.7287611 ,  0.95812255, -0.7532906 ,  0.81484246,  0.67305315,
       -0.78553116, -0.78153235,  0.04047529, -0.7960257 , -0.7728058 ,
        0.03402176,  0.01002301, -0.02760999,  0.07065707, -0.03489565,
        0.83611166,  0.07842414,  0.7925011 ,  0.7465579 ,  0.02986768,
        0.79608214, -0.63136995,  0.05825005,  0.88595617,  1.6280986 ,
       -0.71103805, -1.1342529 ,  0.74180496,  0.9668985 ,  0.40982607,
        1.0991409 , -0.04201152,  0.9303349 , -0.02639547,  0.59913284,
       -0.70660055, -0.717965  ,  0.05152839,  0.69333184,  0.2667974 ,
        0.04964831, -0.02104664,  0.661407  , -0.7272766 , -0.64

In [23]:
# Latent vector of the first reactant, from encoding its struct2vec vector directly
# (shown for comparison with r_latent[0])
rls[0]

array([-0.13035029,  0.00977203,  0.7587669 , -0.695763  ,  0.7624186 ,
       -0.70816445, -1.6310813 , -1.3992049 ,  0.21437114,  0.16402137,
        0.869545  , -0.03586465,  1.2500587 ,  1.1478314 , -0.06087017,
        1.579063  , -1.2945819 , -0.69367516, -0.74395645,  0.67282236,
        1.0523617 ,  1.0868762 , -1.3348655 , -0.03884706,  0.67275655,
       -0.7287611 ,  0.95812255, -0.7532906 ,  0.81484246,  0.67305315,
       -0.78553116, -0.78153235,  0.04047529, -0.7960257 , -0.7728058 ,
        0.03402176,  0.01002301, -0.02760999,  0.07065707, -0.03489565,
        0.83611166,  0.07842414,  0.7925011 ,  0.7465579 ,  0.02986768,
        0.79608214, -0.63136995,  0.05825005,  0.88595617,  1.6280986 ,
       -0.71103805, -1.1342529 ,  0.74180496,  0.9668985 ,  0.40982607,
        1.0991409 , -0.04201152,  0.9303349 , -0.02639547,  0.59913284,
       -0.70660055, -0.717965  ,  0.05152839,  0.69333184,  0.2667974 ,
        0.04964831, -0.02104664,  0.661407  , -0.7272766 , -0.64

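The two outputs above are identical, which indicates that `r_latent` (encoded from the rows looked up in the training set) is the same as `rls` (encoded directly from the `struct2vec` vectors).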

In [24]:
# Run the decoder on the reactants' latent vectors. The raw decoder output is
# turned back into SMILES strings in the following cells (beamsearch -> vec2struct).
x_pred = model.decoder.predict(r_latent)

In [25]:
# Reduce the decoder output to vectors using darkchem's beam search with k=1.
# NOTE(review): presumably k=1 keeps only the single most likely sequence per
# molecule — confirm against darkchem.utils.beamsearch.
trs = darkchem.utils.beamsearch(x_pred, k=1)

In [26]:
# Reshape to a 2-D array with 100 columns so each row can be passed to vec2struct
# in the next cell.
# NOTE(review): 100 is presumably the fixed SMILES vector length used by the
# model — confirm.
trs = trs.reshape(-1,100)

In [27]:
# Decode every row of `trs` back into a structure (SMILES) string
v2s = [darkchem.utils.vec2struct(vec) for vec in trs]

In [28]:
v2s
# The indentation of the printed list looks off, but the result shows that the
# whole process works: the decoded strings match the reactant SMILES.
# In other words, the data extracted from the combined SMILES .npy file is the
# same as the data obtained from the function `struct2vec`.

['CC(=O)C',
 'O=C1CCCCC1',
 'O=C1CCCC1C',
 'O=C1CCCC1C',
 'CC(=O)CC(=O)C',
 'CC(=O)CC(=O)C',
 'COCC(=O)C',
 'CC(=O)c1ccccc1']