In [None]:
# Experiment configuration: input location and model naming.
radius = 'ecfp0'
folder = 'ecfps_full'
samples_count = '2M'

# Input path without extension; '.txt' is appended when the file is opened.
filename = f'{folder}/{radius}'
model_name = '_'.join(['molberto', radius, samples_count])

In [None]:
# Read the corpus as a list of samples, one per line. Iterating over the file
# (instead of read().split('\n')) avoids a phantom empty sample when the file
# ends with a newline and does not hold the whole file as one string. The
# explicit encoding matches the UTF-8 used when the chunk files are written.
with open(filename + '.txt', 'r', encoding='utf-8') as fp:
    text = [line.rstrip('\n') for line in fp]

Now we save this data to disk as several smaller *plaintext* files.

In [None]:
from tqdm.auto import tqdm
import os

# Output layout: the chunk files are written to <folder_cutted>/<radius>/.
folder_cutted = 'ecfps_cutted'
folder_with_paths = f'{folder_cutted}/{radius}'

def split_into_many_files(filename: str, text: list, chunk_size: int = 10_000):
    """
    Cut the big list 'text' into small plaintext files of 'chunk_size' sentences.

    These small files will be fed into the tokenizer (to train it). They are
    written to the module-level 'folder_with_paths' directory as
    'text_<index>.txt', with sentences separated by newlines.

    Args:
        filename: Name of the source file. Not used by the function; kept so
            that existing callers keep working.
        text: Sentences, one per element.
        chunk_size: Maximum number of sentences per output file
            (default 10000).

    Raises:
        ValueError: If 'chunk_size' is not a positive number.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # Idempotent directory creation: no error when the folders already exist.
    os.makedirs(folder_cutted, exist_ok=True)
    os.makedirs(folder_with_paths, exist_ok=True)

    text_data = []
    file_count = 0

    for sample in tqdm(text):
        # Strip embedded newlines so each sample stays on a single line.
        text_data.append(sample.replace('\n', ''))
        if len(text_data) == chunk_size:
            # once we hit the chunk size, save to file
            with open(f'{folder_with_paths}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(text_data))
            text_data = []
            file_count += 1

    # Write the remainder only when there is one; otherwise an empty file would
    # be created whenever len(text) is an exact multiple of chunk_size.
    if text_data:
        with open(f'{folder_with_paths}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))

In [None]:
# Write the corpus out as chunk files for tokenizer training.
split_into_many_files(filename=filename, text=text)

In [None]:
from pathlib import Path

# Collect the chunk files. glob() does not guarantee any particular order, so
# sort the result to hand the tokenizer the same file list on every run.
paths = sorted(str(x) for x in Path(folder_with_paths).glob('*.txt'))

len(paths)

Now we move on to training the tokenizer. We use a byte-level byte-pair encoding (BPE) tokenizer. This builds the vocabulary from an alphabet of single bytes, so every word can be decomposed into tokens.

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Create a byte-level BPE tokenizer with default settings; it is trained in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the tokenizer on the chunk files.
# NOTE: the text at the end of the notebook relies on '<s>' having ID 0 and
# '</s>' having ID 2, which matches their positions in this list.
tokenizer.train(
    files=paths,
    vocab_size=30_522,
    min_frequency=2,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

In [None]:
import os

# Persist the trained tokenizer so that the files described below exist and
# RobertaTokenizer.from_pretrained(model_name) can load them on a fresh kernel.
# makedirs(exist_ok=True) keeps this cell safe to re-run.
os.makedirs(model_name, exist_ok=True)

tokenizer.save_model(model_name)

Now we have two files that outline our new tokenizer:

* *vocab.json* - a file mapping tokens to token IDs

* and *merges.txt* - which lists the merge rules, i.e. which characters/sets of characters are composed into larger tokens (or, conversely, how larger tokens decompose into smaller ones)


In [None]:
from transformers import RobertaTokenizer

# Load the tokenizer files from the model directory through transformers.
# 'model_max_length' is the documented name for the maximum sequence length;
# 'max_len' is only a deprecated alias for it.
tokenizer = RobertaTokenizer.from_pretrained(model_name, model_max_length=512)

In [None]:
# Sanity check: encode one space-separated sample with the loaded tokenizer.
sample_text = '2246728737 864674487 3217380708 3218693969 3218693969 3218693969 3218693969 3217380708 2245900962 847433064'
tokens = tokenizer(sample_text)

In [None]:
# Print the full encoding returned by the tokenizer call above.
print(tokens)

In [None]:
# Show only the token IDs of the encoded sample.
tokens['input_ids']

We can see here that our **<s\>** token is now placed at the beginning of our sequences, represented by token ID *0*. At the end of the sequence we see the **</s\>** token, represented by token ID *2*.