In [None]:
# default_exp spe2vec

# SPE2Vec

> Train word2vec-style embeddings (spe2vec) on SPE-tokenized SMILES.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
import gensim

class Corpus(object):
    '''
    Re-iterable stream of tokenized SMILES; every pass re-reads the input
    from disk, so the whole corpus is never held in memory.

    *infile*: path to a file that stores SMILES line-by-line, or, when
              `isdir` is True, path to a directory of such files.
    *tokenizer*: SPE tokenizer; must provide `tokenize(smi, dropout=...)`
                 returning the tokens as one space-separated string.
    *isdir*: set to True when `infile` is a directory, default = False
    *dropout*: SPE dropout, default = 0
    '''
    def __init__(self, infile, tokenizer, isdir=False, dropout=0):
        self.infile = infile
        self.tokenizer = tokenizer
        self.dropout = dropout
        self.isdir = isdir

    def _paths(self):
        '''Return the list of input files to read, in a deterministic order.'''
        if not self.isdir:
            return [self.infile]
        # os.listdir order is arbitrary, so sort for reproducible passes;
        # skip sub-directories, which cannot be opened as SMILES files.
        candidates = (os.path.join(self.infile, name)
                      for name in sorted(os.listdir(self.infile)))
        return [path for path in candidates if os.path.isfile(path)]

    def __iter__(self):
        for path in self._paths():
            # `with` guarantees the handle is closed, including when the
            # consumer abandons the generator part-way through a file.
            with open(path) as handle:
                for line in handle:
                    smi = line.strip()  # drop the line terminator / padding
                    if not smi:
                        continue  # blank lines carry no molecule
                    yield self.tokenizer.tokenize(smi, dropout=self.dropout).split(' ')
    
def learn_spe2vec(corpus, outfile=None, 
                  vector_size=100, window=10, min_count=10, n_jobs = 1, method = 'skip-gram', 
                  **kwargs):
    '''
    Train a spe2vec model.
    
    *corpus*: an instance of `Corpus` (any re-iterable of token lists that gensim's Word2Vec accepts)
    *outfile*: str, name of the spe2vec model file; nothing is saved when empty/None.
    *vector_size*: dimensions of embedding.
    *window*: number of tokens considered as context
    *min_count*: number of occurrences a token should have to be considered in training
    *n_jobs*: number of cpu cores used for training
    *method*: modeling method, choose from ['cbow', 'skip-gram'] (case-insensitive)
    
    Returns the trained `gensim.models.Word2Vec` model.
    Raises `ValueError` when *method* is not one of the supported options.
    
    More training parameter can be found https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    '''
    
    # Validate before touching gensim so a bad option fails fast.
    sg_by_method = {'skip-gram': 1, 'cbow': 0}
    method_key = method.lower()
    if method_key not in sg_by_method:
        raise ValueError("Invalid option,  choose from ['cbow', 'skip-gram']")
    sg = sg_by_method[method_key]
    
    # gensim 4 renamed Word2Vec's `size` argument to `vector_size`; choose the
    # keyword that the installed version understands.
    major = str(getattr(gensim, '__version__', '4')).split('.')[0]
    size_kw = 'size' if major.isdigit() and int(major) < 4 else 'vector_size'
    
    model = gensim.models.Word2Vec(corpus, window=window, min_count=min_count, workers=n_jobs, sg=sg,
                                   **{size_kw: vector_size}, **kwargs)
    
    if outfile:
        model.save(outfile)
        
    return model

class SPE2Vec(object):
    '''Empty class: no attributes or methods are defined on it yet.'''
    

In [None]:
#hide
# Toy inputs for the hidden smoke-test cells; both paths are relative to the
# notebook's working directory.
file = '../experiments/data/smiles_toy.smi'  # single SMILES file, passed to Corpus below
filedir = '../experiments/data/'  # presumably meant for Corpus(..., isdir=True); not used in the visible cells

In [None]:
#hide
import codecs
from SmilesPE.tokenizer import *
# Open what is presumably the SPE vocabulary file and build the tokenizer from it.
# NOTE(review): the handle is never closed. If SPE_Tokenizer reads it fully
# during construction, wrapping this in a `with` block would be safe — confirm.
spe_vob= codecs.open('../SPE_ChEMBL.txt')
spe = SPE_Tokenizer(spe_vob)

In [None]:
#hide
%%time
# Smoke test: train spe2vec on the toy SMILES file with default hyper-parameters.
# NOTE(review): IPython expects a cell magic such as `%%time` on the first line
# of the cell, but `#hide` precedes it here — confirm this cell still runs on a
# fresh kernel.
corpus = Corpus(file, tokenizer = spe) # re-reads the file on every pass instead of loading it into memory
model = learn_spe2vec(corpus)

CPU times: user 2.81 s, sys: 19.6 ms, total: 2.83 s
Wall time: 1.05 s
