In [23]:
radius = 'ecfp0'
folder = 'ecfps_full'
filename = folder + '/' + radius
samples_count = '10M'
model_name = f'molberto_{radius}_{samples_count}'

In [18]:
with open(filename + '.txt', 'r') as fp:
    text = fp.read().split('\n')

Now we save this data to file as several *plaintext* files.

In [24]:
from tqdm.auto import tqdm
import os

def split_into_many_files(filename: str, text: list):
    """
    Cuts big file 'text' into small files with 10000 sentences.
    These small files will be fed into tokenizer (to train it).
    File 'text' should consist of sentences on each line.
    """
    if not os.path.exists('data'):
        os.mkdir('data')
    if not os.path.exists(folder):
        os.mkdir(folder)
    if not os.path.exists(f'data/{filename}'):
        os.mkdir(f'data/{filename}')
        
    text_data = []
    file_count = 0

    for sample in tqdm(text):
        sample = sample.replace('\n', '')
        text_data.append(sample)
        if len(text_data) == 10_000:
            # once we git the 10K mark, save to file
            with open(f'data/{filename}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(text_data))
            text_data = []
            file_count += 1
    with open(f'data/{filename}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

In [22]:
split_into_many_files(filename, text)

  0%|          | 0/10000001 [00:00<?, ?it/s]

In [27]:
from pathlib import Path

paths = [str(x) for x in Path(f'data/{filename}').glob('*.txt')]

len(paths)

1001

Now we move onto training the tokenizer. We use a byte-level Byte-pair encoding (BPE) tokenizer. This allows us to build the vocabulary from an alphabet of single bytes, meaning all words will be decomposable into tokens.

In [28]:
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()

In [29]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])






In [30]:
import os

os.mkdir(model_name)

tokenizer.save_model(model_name)

['molberto_efcp1_10M/vocab.json', 'molberto_efcp1_10M/merges.txt']

Now we have two files that outline our new tokenizer:

* the *vocab.json* - a mapping file between tokens to token IDs

* and *merges.txt* - which describes which characters/set of characters can be decomposed/composed smaller/larger tokens


In [13]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(model_name, max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [14]:
# test our tokenizer on a simple sentence
tokens = tokenizer('2246728737 864674487 3217380708 2041434490 3218693969 3217380708 847961216 2246699815 864942730 847961216 3217380708 3218693969 2041434490 3217380708 3218693969 3218693969 3217380708 2245900962 847433064 3218693969 3217380708 3217380708 2092489639 2968968094 2968968094 3189457552 2968968094 2976033787 2246728737 3218693969 3217380708 1016841875 ')

In [15]:
print(tokens)

{'input_ids': [0, 344, 348, 279, 333, 273, 279, 337, 318, 307, 337, 279, 273, 333, 279, 273, 273, 279, 430, 462, 273, 279, 279, 341, 294, 294, 368, 294, 327, 320, 273, 279, 394, 225, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
tokens.input_ids

[0,
 344,
 348,
 279,
 333,
 273,
 279,
 337,
 318,
 307,
 337,
 279,
 273,
 333,
 279,
 273,
 273,
 279,
 430,
 462,
 273,
 279,
 279,
 341,
 294,
 294,
 368,
 294,
 327,
 320,
 273,
 279,
 394,
 225,
 2]

We can see here that our **<s\>** token is now placed at the beginning of our sequences using token ID *0*. At the end of the sequence we see the **<s\\>** token represented by *2*.