In [None]:
# default_exp spe2vec

# SPE2Vec

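> Train Word2Vec embeddings on SPE-tokenized SMILES and convert SMILES strings into token or molecule vectors.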

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
import gensim


class Corpus(object):
    '''
    Re-iterable stream of SPE-tokenized SMILES, read lazily from disk.

    *infile*: A file that stores SMILES line-by-line, or, when `isdir=True`,
    a directory whose entries are all such files.

    *tokenizer*: SPE tokenizer. It must provide `tokenize(smi, dropout=...)`
    returning the tokens as one space-separated string.

    *isdir*: set to True when `infile` is a directory, default = False

    *dropout*: SPE dropout, default = 0

    Each iteration yields one list of tokens per input line. Lines are handed
    to the tokenizer exactly as read (including the trailing newline).
    '''
    def __init__(self, infile, tokenizer, isdir=False, dropout=0):
        self.infile = infile
        self.tokenizer = tokenizer
        self.dropout = dropout
        self.isdir = isdir

    def _paths(self):
        '''Return the list of file paths this corpus reads from.'''
        if self.isdir:
            return [os.path.join(self.infile, fname) for fname in os.listdir(self.infile)]
        return [self.infile]

    def __iter__(self):
        for path in self._paths():
            # `with` closes each file deterministically; the corpus is iterated
            # several times during training, so bare open() calls would pile up
            # unclosed handles.
            with open(path) as handle:
                for smi in handle:
                    yield self.tokenizer.tokenize(smi, dropout=self.dropout).split(' ')
    
def learn_spe2vec(corpus, outfile=None, 
                  vector_size=100, window=10, min_count=10, n_jobs = 1, method = 'skip-gram', 
                  **kwargs):
    '''
    Train a spe2vec model.
    
    *corpus*: an instance of `Class Corpus()`
    
    *outfile*: str, name of the spe2vec model file. When empty or None the model is not saved.
    
    *vector_size*: dimensions of embedding.
    
    *window*: number of tokens considered as context
    
    *min_count*: number of occurrences a token should have to be considered in training
    
    *n_jobs*: number of cpu cores used for training.
    
    *method*: modeling method, choose from ['cbow', 'skip-gram'] (case-insensitive)
    
    Any extra keyword arguments are forwarded unchanged to `gensim.models.Word2Vec`.
    More training parameters can be found at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    
    Returns the trained `gensim.models.Word2Vec` model.
    
    Raises `ValueError` if *method* is not one of the supported options.
    '''
    
    # gensim's `sg` flag: 1 selects skip-gram, 0 selects CBOW.
    sg_by_method = {'skip-gram': 1, 'cbow': 0}
    method_key = method.lower()
    if method_key not in sg_by_method:
        raise ValueError("Invalid option,  choose from ['cbow', 'skip-gram']")
    
    params = dict(window=window, min_count=min_count, workers=n_jobs, sg=sg_by_method[method_key])
    
    # gensim 4 renamed the embedding-dimension keyword from `size` to
    # `vector_size`; pick whichever the installed version understands.
    gensim_major = int(gensim.__version__.split('.')[0])
    params['vector_size' if gensim_major >= 4 else 'size'] = vector_size
    
    model = gensim.models.Word2Vec(corpus, **params, **kwargs)
    
    if outfile:
        model.save(outfile)
        
    return model

def load_spe2vec(model_path):
    '''
    Load a spe2vec model from disk.
    
    *model_path*: str, path of the model file to read.
    
    Returns whatever `gensim.models.Word2Vec.load` produces for that path.
    '''
    word2vec_cls = gensim.models.Word2Vec
    return word2vec_cls.load(model_path)



In [None]:
#hide
class SPE2Vec(object):
    '''
    Convenience wrapper that pairs a trained spe2vec model with its SPE tokenizer.

    *model_path*: str, path of a saved Word2Vec model file.

    *tokenizer*: SPE tokenizer providing `tokenize(smi, dropout=...)`, which
    returns the tokens as one space-separated string.
    '''
    def __init__(self, model_path, tokenizer):
        # Imported locally because the notebook has no top-level numpy import.
        import numpy as np

        self.model = gensim.models.Word2Vec.load(model_path)
        self.tokenizer = tokenizer

        # gensim 4 exposes the vocabulary as `key_to_index`; gensim 3 as `vocab`.
        wv = self.model.wv
        vocab = getattr(wv, 'key_to_index', None)
        if vocab is None:
            vocab = wv.vocab
        self.token_keys = set(vocab)

        # Vector used for unknown tokens: simply the average of the vectors of
        # all known tokens.
        self.unknown = np.mean([wv[word] for word in vocab], axis=0)

    def tokenize(self, smi, dropout=0):
        '''
        Tokenize SMILES into substructure tokens.

        Returns the tokenizer's output for *smi* (a space-separated string).
        '''
        return self.tokenizer.tokenize(smi, dropout=dropout)

    def smiles2vec(self, smi, dropout=0):
        '''
        Generate a vector for a SMILES. The vector is the mean of the vectors
        of the individual tokens.

        Unknown tokens are skipped. If none of the tokens is known, a zero
        vector with the embedding dimension is returned instead of NaN.
        '''
        # `np` bound in __init__ is local to that method, so import it here too.
        import numpy as np

        tokens = self.tokenizer.tokenize(smi, dropout=dropout).split(' ')
        known = [self.model.wv[tok] for tok in tokens if tok in self.token_keys]
        if not known:
            # np.mean over an empty list would yield a NaN scalar and a warning.
            return np.zeros_like(self.unknown)
        return np.mean(known, axis=0)

    def spe2vec(self, smi, dropout=0, skip_unknown=False):
        '''
        Generate a list of vectors (np.array). Each vector is the spe vector of one token.

        With `skip_unknown=False` (default) an unknown token is represented by
        the mean of all token vectors from the model; with `skip_unknown=True`
        unknown tokens are left out of the list.
        '''
        tokens = self.tokenizer.tokenize(smi, dropout=dropout).split(' ')

        if skip_unknown:
            vec = [self.model.wv[tok] for tok in tokens if tok in self.token_keys]
        else:
            vec = [self.model.wv[tok] if tok in self.token_keys else self.unknown for tok in tokens]

        return vec

In [None]:
#hide
file = '../experiments/data/smiles_toy.smi'
filedir = '../experiments/data/'

In [None]:
#hide
# Build the SPE tokenizer (`spe`) that the Corpus and SPE2Vec cells below use.
import codecs
# NOTE(review): star import — SPE_Tokenizer is the only name visibly used from
# it here; confirm nothing else is needed before narrowing the import.
from SmilesPE.tokenizer import *
# Open the SPE vocabulary file. NOTE(review): this handle is never closed;
# presumably SPE_Tokenizer reads it fully in its constructor — confirm before
# wrapping it in a `with` block.
spe_vob= codecs.open('../SPE_ChEMBL.txt')
spe = SPE_Tokenizer(spe_vob)

In [None]:
#hide
%%time
corpus = Corpus(file, tokenizer = spe) # a memory-friendly iterator
model = learn_spe2vec(corpus)

CPU times: user 2.81 s, sys: 19.6 ms, total: 2.83 s
Wall time: 1.05 s


In [None]:
#hide
model = load_spe2vec('../experiments/results/spe_model.bin')
print(model)

Word2Vec(vocab=3114, size=100, alpha=0.025)


In [None]:
s = SPE2Vec('../experiments/results/spe_model.bin', spe)

In [None]:
s.tokenize('c1ccccc1')

'c1ccccc1'

In [None]:
s.smiles2vec('c1ccccc1') == model.wv['c1ccccc1']

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [None]:
s.tokenize('c1ccccc1[dum]')

'c1ccccc1 [dum]'

In [None]:
s.spe2vec('c1ccccc1[dum]')

[array([ 0.00324177, -0.18124679,  0.1894573 ,  0.29736474, -0.14143717,
        -0.03290153, -0.31891045,  0.16373567, -0.12413523, -0.08658446,
        -0.23956653,  0.05335753,  0.18146366, -0.17212407, -0.17879114,
        -0.01039552, -0.00274071,  0.01653983,  0.08432296, -0.15634526,
         0.29629305, -0.16786121,  0.06479991,  0.34462902, -0.11052489,
        -0.13513446,  0.16418819, -0.21508686, -0.01842665, -0.15818536,
        -0.05421342,  0.2041645 ,  0.14783993, -0.00653112, -0.19034739,
        -0.11876111,  0.12208337, -0.0743893 ,  0.03400969,  0.04422404,
        -0.10224582,  0.34490895,  0.12326851, -0.08695894, -0.08150315,
         0.09907438,  0.28797793,  0.15912676,  0.15228626, -0.164707  ,
         0.33839643, -0.04265443, -0.11858924,  0.10059267, -0.24335982,
        -0.02948368,  0.53029126,  0.2448303 ,  0.11335112,  0.01153868,
        -0.01010862, -0.06406022, -0.01338368, -0.18424016,  0.03580371,
         0.18463984,  0.15326728, -0.15144381, -0.0

In [None]:
s.spe2vec('c1ccccc1[dum]',skip_unknown=True)

[array([ 0.00324177, -0.18124679,  0.1894573 ,  0.29736474, -0.14143717,
        -0.03290153, -0.31891045,  0.16373567, -0.12413523, -0.08658446,
        -0.23956653,  0.05335753,  0.18146366, -0.17212407, -0.17879114,
        -0.01039552, -0.00274071,  0.01653983,  0.08432296, -0.15634526,
         0.29629305, -0.16786121,  0.06479991,  0.34462902, -0.11052489,
        -0.13513446,  0.16418819, -0.21508686, -0.01842665, -0.15818536,
        -0.05421342,  0.2041645 ,  0.14783993, -0.00653112, -0.19034739,
        -0.11876111,  0.12208337, -0.0743893 ,  0.03400969,  0.04422404,
        -0.10224582,  0.34490895,  0.12326851, -0.08695894, -0.08150315,
         0.09907438,  0.28797793,  0.15912676,  0.15228626, -0.164707  ,
         0.33839643, -0.04265443, -0.11858924,  0.10059267, -0.24335982,
        -0.02948368,  0.53029126,  0.2448303 ,  0.11335112,  0.01153868,
        -0.01010862, -0.06406022, -0.01338368, -0.18424016,  0.03580371,
         0.18463984,  0.15326728, -0.15144381, -0.0