In [11]:
import numpy as np
import pandas as pd
import sys, os
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

In [12]:
def sentences2vec(sentences, model, unseen=None):
    """Generate one vector per sentence by summing the vectors of its words.

    Parameters
    ----------
    sentences : list, array
        Iterable of sentences; each sentence is an iterable of words.
    model : word2vec.Word2Vec
        Gensim word2vec model. Its ``wv`` attribute supplies the vocabulary
        (``key_to_index``), the word vectors (``get_vector``) and the vector
        dimensionality (``vector_size``).
    unseen : None, str
        Keyword for unseen words. If given (and non-empty), every word that
        is not in the vocabulary contributes the vector of this keyword
        instead. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        One row per sentence. A sentence that contributes no word vectors
        (an empty sentence, or only unseen words while ``unseen`` is None)
        yields an all-zero row, so the result always stays rectangular.
    """
    wv = model.wv
    keys = set(wv.key_to_index)

    # The replacement vector is looked up once, before any sentence is processed.
    unseen_vec = wv.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        if unseen:
            word_vecs = [wv.get_vector(y) if y in keys else unseen_vec
                         for y in sentence]
        else:
            word_vecs = [wv.get_vector(y) for y in sentence if y in keys]

        if word_vecs:
            vec.append(sum(word_vecs))
        else:
            # sum([]) is the int 0, which would make the stacked result ragged.
            # float32 is used so the zero row never upcasts float32/float64
            # word vectors when the rows are stacked below.
            vec.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(vec)

In [21]:
# Directory the SMILES tables are read from, and directory the descriptors
# are written to.
load_path = '../Data02-different_descriptors/SMILES/'
save_path = '../Data02-different_descriptors/graph_descriptor/'

# File names of the pickled train/test tables inside `load_path`.
train_data = 'SMILES_train.pkl'
test_data = 'SMILES_test.pkl'

# Word2Vec model loaded from disk; later cells pass it to sentences2vec.
model = word2vec.Word2Vec.load(
    '../Data02-different_descriptors/graph_descriptor/POLYINFO_PI1M.pkl'
)

In [15]:
# Read the pickled train/test tables and keep a handle on their SMILES columns.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
train_df = pd.read_pickle(load_path + train_data)
train_SMILES = train_df['SMILES']

test_df = pd.read_pickle(load_path + test_data)
test_SMILES = test_df['SMILES']

In [17]:
# Display the training SMILES column to sanity-check what was loaded.
train_SMILES

0                     [*]CCCCCCCCCCCCOC(=O)CC=CCC(=O)O[*]
1                              [*]CCCCCCCCCCCC(=O)N([*])C
2                             [*]OC(C)CCOC(=O)COCC([*])=O
3                      [*]Oc1ccc(Oc2ccc(OC([*])=O)cc2)cc1
4       [*]Oc1cc(C(C)(C)c2ccccc2)c(OC(=O)c2ccc(C([*])=...
                              ...                        
1630                [*]CCCCCCCCCNC(=O)c1ccc(C(=O)N[*])cc1
1631                 [*]CCCCCCCCCCOC(=O)C(O)C(O)C(=O)O[*]
1632    [*]C(=O)OCCCCCCCCOC(=O)c1ccc(NC(=O)c2ccc(N3C(=...
1633                       [*]CC(CO[*])(COC(C)=O)COC(C)=O
1634                    [*]CCCCC(=O)NCCCCCCNC(=O)CCCCS[*]
Name: SMILES, Length: 1635, dtype: object

In [19]:
# Parse every SMILES string into an RDKit molecule and store it in an
# 'rdmol' column next to the original string.
train_df['rdmol'] = train_SMILES.map(Chem.MolFromSmiles)
test_df['rdmol'] = test_SMILES.map(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None instead of raising when a string cannot be
# parsed. Stop here and name the offending strings, rather than letting a
# None molecule fail with an opaque error in the sentence-building cell.
for _split, _frame in (('train', train_df), ('test', test_df)):
    _unparsed = _frame.loc[_frame['rdmol'].isna(), 'SMILES']
    if len(_unparsed) > 0:
        raise ValueError(
            'RDKit could not parse %d SMILES string(s) in the %s set: %s'
            % (len(_unparsed), _split, _unparsed.tolist())
        )

In [22]:
def _row_to_sentence(row):
    """Build a MolSentence from the molecule stored in the row's 'rdmol' field."""
    # The second argument (1) is presumably the fingerprint radius -- TODO confirm.
    return MolSentence(mol2alt_sentence(row['rdmol'], 1))


# Row-wise conversion of each molecule into its mol2vec "sentence".
train_df['sentence'] = train_df.apply(_row_to_sentence, axis=1)
test_df['sentence'] = test_df.apply(_row_to_sentence, axis=1)

In [24]:
# Embed every sentence (out-of-vocabulary words map to the 'UNK' vector),
# then wrap each resulting row in a DfVec so it can live in a DataFrame cell.
train_vectors = sentences2vec(train_df['sentence'], model, unseen='UNK')
train_df['mol2vec'] = [DfVec(row_vec) for row_vec in train_vectors]

test_vectors = sentences2vec(test_df['sentence'], model, unseen='UNK')
test_df['mol2vec'] = [DfVec(row_vec) for row_vec in test_vectors]

In [27]:
def _to_matrix(dfvecs):
    """Collect the ``.vec`` attribute of every item into a single array."""
    return np.array([item.vec for item in dfvecs])


# Unwrap the DfVec objects back into plain feature matrices.
X_train = _to_matrix(train_df['mol2vec'])
X_test = _to_matrix(test_df['mol2vec'])

In [30]:
# Column labels 'mol2vec_1' ... 'mol2vec_300', one per embedding dimension.
Name = ['mol2vec_' + str(idx) for idx in range(1, 301)]

In [31]:
# Wrap both feature matrices in DataFrames that share the same column labels.
train_desc, test_desc = (
    pd.DataFrame(matrix, columns=Name) for matrix in (X_train, X_test)
)

In [34]:
# Persist both descriptor tables as pickles under `save_path`.
for _desc_frame, _file_name in (
    (train_desc, "Mol2vec_train.pkl"),
    (test_desc, "Mol2vec_test.pkl"),
):
    _desc_frame.to_pickle(save_path + _file_name)

In [36]:
# Display the test descriptor table (one 'mol2vec_<i>' column per dimension).
test_desc

Unnamed: 0,mol2vec_1,mol2vec_2,mol2vec_3,mol2vec_4,mol2vec_5,mol2vec_6,mol2vec_7,mol2vec_8,mol2vec_9,mol2vec_10,...,mol2vec_291,mol2vec_292,mol2vec_293,mol2vec_294,mol2vec_295,mol2vec_296,mol2vec_297,mol2vec_298,mol2vec_299,mol2vec_300
0,-2.313458,0.398943,-0.833941,-4.721895,-0.544928,-5.402278,-2.990527,-3.494968,-2.150224,-2.910558,...,3.888973,0.385175,-5.013699,3.341217,1.707630,-6.348175,7.829412,-7.483551,-0.960439,1.101459
1,1.323712,0.500946,1.621850,-1.959775,0.198714,-4.277528,1.920267,-2.911444,-1.011799,-1.027008,...,4.210801,-1.128490,-2.571071,6.198559,-4.054068,-6.173509,6.057120,-5.934949,-3.456444,3.238417
2,5.809849,0.168695,0.266147,-4.557753,2.240451,-1.784688,9.754171,1.361863,1.446795,-3.905705,...,17.516323,-10.152035,-4.464473,2.578799,3.720590,-2.745079,5.778493,-7.648422,-10.771959,9.279565
3,-0.591354,3.389807,-3.122604,-2.221508,3.886441,-1.587961,-0.142316,-0.221104,-6.161095,-6.444210,...,14.412880,-1.611671,1.125716,7.611958,-1.410582,-6.580986,7.293660,-8.102212,-6.503845,3.338789
4,-1.208719,1.773136,-0.831502,-1.759578,1.399999,-1.414137,0.197643,0.917739,-1.044393,-2.006225,...,5.211084,0.388168,0.289415,2.430781,-1.368332,-3.436624,3.419059,-2.722724,-2.658467,-0.537893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.414064,1.533768,-0.042748,-4.216507,0.491357,-0.922021,-0.032107,-2.251621,-0.371294,-1.769227,...,3.578623,-1.341317,-4.295772,0.840626,-1.707275,-4.755214,3.150140,-6.007267,-0.688439,3.204928
96,0.875707,1.013946,-2.630456,-6.212923,-0.183610,0.863627,2.944800,0.654643,-4.455277,-4.693870,...,8.605012,-9.200393,-4.762942,2.458585,5.615210,-0.115062,6.176520,-6.394039,-7.656897,9.959673
97,4.209914,1.473680,0.596548,-0.582649,0.661348,0.800088,3.905423,-0.059207,1.832354,0.174651,...,3.401891,-4.382341,-2.355928,-0.792472,3.263234,0.988590,0.028464,-0.160925,-2.153331,5.136760
98,-0.282478,1.050936,-2.135473,-2.300927,0.134731,-1.551853,0.508774,1.434669,-3.555662,-4.818180,...,8.648672,-0.962479,1.798321,4.315444,0.712779,-4.510499,6.332903,-5.579467,-4.727393,-2.284262
