In [None]:
# default_exp pretokenizer

# Pre-Tokenizer

> Tokenize SMILES (Simplified Molecular-Input Line-Entry System) into units. 

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
def atomwise_tokenizer(smi, exclusive_tokens = None):
    """
    Tokenize a SMILES molecule at atom-level:
        (1) 'Br' and 'Cl' are two-character tokens
        (2) Symbols with bracket are considered as tokens
        (3) '%' followed by two digits (a two-digit ring closure) is one token
        (4) Every other recognized atom, bond, branch or digit symbol is a
            one-character token

    Characters that match none of the recognized symbols (for example
    whitespace or unsupported letters) are skipped without an error.

    smi: The SMILES string to tokenize.
    exclusive_tokens: A list of specific symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
    Other symbols with bracket will be replaced by '[UNK]'. default is `None`;
    `None` or an empty list keeps every bracketed symbol unchanged.

    Returns the list of tokens in the order they occur in `smi`.
    """
    import re

    # Raw string so every backslash reaches the regex engine untouched; the
    # compiled expression is the same one the previous non-raw literal produced,
    # without relying on Python's invalid-escape fallback (which warns on
    # recent interpreters). `\\` matches one literal backslash (a SMILES bond).
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    tokens = re.findall(pattern, smi)

    if exclusive_tokens:
        # Only the bracket alternative of the pattern can yield a token that
        # starts with '[', so this test singles out bracketed symbols.
        tokens = [
            '[UNK]' if tok.startswith('[') and tok not in exclusive_tokens else tok
            for tok in tokens
        ]
    return tokens

In [None]:
#hide
def atomwise_tokenizer(smiles, exclusive_tokens = None):
    """
    Tokenize a SMILES molecule at atom-level:
        (1) 'Br' and 'Cl' are two-character tokens
        (2) Symbols with bracket are considered as tokens
        (3) All other symbols are tokenized on character level.

    A bracketed symbol is '[' + 1 to 10 characters that are neither '[' nor
    ']' + ']'. Text that does not have this shape (empty or longer brackets,
    unbalanced brackets) falls under rule (3).

    smiles: The SMILES string to tokenize.
    exclusive_tokens: A list of specific symbols with bracket you want to keep. e.g., ['[C@@H]', '[nH]'].
    Other symbols with bracket will be replaced by '[UNK]'. default is `None`;
    `None` or an empty list keeps every bracketed symbol unchanged.

    Returns the list of tokens in the order they occur in `smiles`.
    """
    import re

    # NOTE(review): this notebook defines `atomwise_tokenizer` in the exported
    # cell as well; whichever of the two cells ran last is the one later cells
    # call. Confirm this hidden character-level variant is still needed.

    # With a single capturing group, re.split returns the text between matches
    # at even positions and the captured bracket symbols at odd positions.
    pieces = re.split(r'(\[[^\[\]]{1,10}\])', smiles)

    tokens = []
    for position, piece in enumerate(pieces):
        if position % 2:
            # A captured bracket symbol: keep it, or mask it when a keep-list
            # is given and the symbol is not on it.
            if exclusive_tokens and piece not in exclusive_tokens:
                tokens.append('[UNK]')
            else:
                tokens.append(piece)
            continue

        for symbol in piece:
            # Glue 'r' onto a directly preceding 'B', and 'l' onto a directly
            # preceding 'C'. Checking `tokens` first means a leading 'r'/'l'
            # never looks at the end of the list.
            if tokens and (tokens[-1], symbol) in (('B', 'r'), ('C', 'l')):
                tokens[-1] += symbol
            else:
                tokens.append(symbol)
    return tokens

Tokenize a SMILES string at the atom level.

In [None]:
# Basic example: the printed list shows '[N+]' and 'Br' each kept as a single token.
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = atomwise_tokenizer(smi)
print(toks)

['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']


In [None]:
#hide
# Sanity check on a string that is not valid SMILES: stray letters, lower-case
# 'br'/'cl', bracketed symbols, a space and a two-digit '%11'.
atomwise_tokenizer('ABrBCD>>[dum]dumcobrclCl[Br] %11')

['Br', 'B', 'C', '>', '>', '[dum]', 'c', 'o', 'b', 'c', 'Cl', '[Br]', '%11']

Tokenize a SMILES string at the atom level, keeping only the bracketed symbols listed in `exclusive_tokens`. Bracketed symbols that are not in `exclusive_tokens` are replaced with `[UNK]`.

In [None]:
# Keep only '[C@@H]' and '[C@@]'; every other bracketed symbol in the SMILES
# string should come back as '[UNK]'.
sep_tokens = ['[C@@H]', '[C@@]']
smi = 'CC(C)C[C@@H]1N2C(=O)[C@](NC(=O)[C@H]3CN(C)[C@@H]4Cc5c(Br)[nH]c6cccc(C4=C3)c56)(O[C@@]2(O)[C@@H]7CCCN7C1=O)C(C)C'
toks = atomwise_tokenizer(smi, exclusive_tokens=sep_tokens)
print(toks)

['C', 'C', '(', 'C', ')', 'C', '[C@@H]', '1', 'N', '2', 'C', '(', '=', 'O', ')', '[UNK]', '(', 'N', 'C', '(', '=', 'O', ')', '[UNK]', '3', 'C', 'N', '(', 'C', ')', '[C@@H]', '4', 'C', 'c', '5', 'c', '(', 'Br', ')', '[UNK]', 'c', '6', 'c', 'c', 'c', 'c', '(', 'C', '4', '=', 'C', '3', ')', 'c', '5', '6', ')', '(', 'O', '[C@@]', '2', '(', 'O', ')', '[C@@H]', '7', 'C', 'C', 'C', 'N', '7', 'C', '1', '=', 'O', ')', 'C', '(', 'C', ')', 'C']


In [None]:
#hide
# Plain letter sequence (not SMILES), used to see how characters the tokenizer
# does not recognize are handled.
seq = 'ABCDTTDSE'
toks = atomwise_tokenizer(seq)
print(toks)

['B', 'C', 'S']


In [None]:
#export

def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
    """
    Tokenize a SMILES string into k-mers built from atom-level tokens.

    smiles: The SMILES string to tokenize.
    ngram: Number of consecutive atom-level tokens joined into one k-mer.
        With `ngram == 1` the atom-level tokens are returned as they are and
        `stride` is not applied.
    stride: Step, counted in atom-level tokens, between the starts of two
        consecutive k-mers.
    remove_last: If true, drop the final k-mer when its string length is
        smaller than `ngram`. Windows with fewer than `ngram` tokens are
        already left out, so this only matters if a token can be empty.
    exclusive_tokens: Passed through to `atomwise_tokenizer`.

    Returns a list of k-mer strings; the list is empty when `smiles` yields
    fewer than `ngram` atom-level tokens.
    """
    units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
    if ngram == 1:
        tokens = units
    else:
        # Slice each window once, then keep only the complete ones.
        windows = (units[start:start + ngram] for start in range(0, len(units), stride))
        tokens = [tokens_to_mer(window) for window in windows if len(window) == ngram]

    # `tokens` may be empty (short or empty input), so check before indexing.
    if remove_last and tokens and len(tokens[-1]) < ngram:
        tokens = tokens[:-1]
    return tokens

def tokens_to_mer(toks):
    """Concatenate the given tokens, in order, into one k-mer string."""
    mer = "".join(toks)
    return mer

Tokenize a SMILES string into 4-mers.

In [None]:
# Sliding window of 4 atom-level tokens, moved one token at a time (default stride).
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = kmer_tokenizer(smi, ngram=4)
print(toks)

['CC[N+](', 'C[N+](C', '[N+](C)', '(C)(', 'C)(C', ')(C)', '(C)C', 'C)Cc', ')Cc1', 'Cc1c', 'c1cc', '1ccc', 'cccc', 'cccc', 'ccc1', 'cc1Br']
