In [1]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.2.0
    Uninstalling fsspec-2024.2.0:
      Successfully uninstalled fsspe

# Sources:
1 - https://karpathy.ai/zero-to-hero.html  
2 - https://www.youtube.com/watch?v=kCc8FmEb1nY  
3 - https://github.com/karpathy/minbpe/tree/master  

# Imports

In [2]:
import regex as re
from datasets import load_dataset
from collections import Counter, OrderedDict
from tqdm import trange
import json
from tqdm import tqdm

# Parameters

In [3]:
# Dataset parameters
train_ratio = 0.9
sample_size = 1_000

# Parameters for Additional test
max_single_char_vocab_size = 150
max_vocab_size = 500
min_freq = 100
UNK_TOKEN = "<UNK>"

# select a tokenizer among BasicTokenizer, RegexTokenizer and MostCommonRegexTokenizer
tokenizer_name = "MostCommonRegexTokenizer"

SEED = 15

# Load files

In [4]:
# Define the path to the dataset
dataset_name = "20231101.fr"

# Load the dataset
raw_dataset = load_dataset("wikimedia/wikipedia", dataset_name)
print(f"Dataset size: {raw_dataset['train'].num_rows}")

Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/769M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/422M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/348M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/296M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/284M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/169M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/210M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/214M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/222M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2564646 [00:00<?, ? examples/s]

Dataset size: 2564646


# Data preparation

In [5]:
# Create training and evaluation datasets
if sample_size < 0:
    train_sample = round(raw_dataset['train'].num_rows * train_ratio)
    test_sample = round(raw_dataset['train'].num_rows * (1- train_ratio))
else:
    train_sample = round(sample_size * train_ratio)
    test_sample = round(sample_size * (1- train_ratio))

ds_train_test = raw_dataset['train'].train_test_split(train_size=train_sample, test_size=test_sample, seed=SEED)
train_text = ''.join([t['text'] for t in ds_train_test['train']])

In [6]:
# Get some statistics about the data
print("Number of unique words: ", len(set(train_text.split())))
print("Number of unique characters: ", len(set(train_text)))
counter = Counter(train_text)
chars_above_threshold = [c for c in counter if counter[c] >= min_freq]
print("Number of unique characters above threshold: ", len(chars_above_threshold))
chars_below_threshold = [c for c in counter if counter[c] < min_freq]
print("Number of unique characters below threshold: ", len(chars_below_threshold))

Number of unique words:  74972
Number of unique characters:  874
Number of unique characters above threshold:  110
Number of unique characters below threshold:  764


# Step 1
Create a simple Byte Pair Encoder (BPE)

In [7]:
class BasicTokenizer():
    def __init__(self):
        self.text_to_int = {}  # {text: int}
        self.int_to_text = {}  # {int: text}
        self.merged_dict = OrderedDict()  # {(token, token): token}
        
    def train(self, text, max_vocab_size, verbose=False):
        self.text_to_int, self.int_to_text = self._create_initial_vocabulary(text)  
        self.current_vocab_size = len(self.text_to_int)
        
        train_tokens = self.encode(text) 
        
        if verbose: 
            self.print_stats(train_tokens)
        
        if self.current_vocab_size < max_vocab_size:
            iter_count = max_vocab_size - self.current_vocab_size
            for _ in trange(iter_count):
                token_pairs = self._get_pairs(train_tokens)
                most_freq = self._get_most_common_token_pair(token_pairs)
                
                str_pair = ''.join([self.int_to_text[most_freq[0]], self.int_to_text[most_freq[1]]])
                self.text_to_int[str_pair] = self.current_vocab_size
                self.int_to_text[self.current_vocab_size] = str_pair
                self.merged_dict[most_freq] = self.text_to_int[str_pair]
                self.current_vocab_size += 1

                new_train_tokens = self._replace_token_pairs(train_tokens, most_freq, self.text_to_int[str_pair])
                train_tokens = new_train_tokens

                if verbose:
                    self.print_stats(train_tokens, str_pair, most_freq)

    def print_stats(self, train_tokens, str_pair="", token_pair=None):
        print("Updated vocab size:", self.current_vocab_size, 
              "Updated sequence length:", len(train_tokens), 
              "New vocab:", str_pair, 
              "Token pair:", token_pair)
        
    def _get_pairs(self, tokens):
        return list(zip(tokens, tokens[1:]))
    
    def _create_initial_vocabulary(self, text):
        sorted_text = sorted(set(text))
        text_to_int = {c: i for i, c in enumerate(sorted_text)}
        int_to_text = {i: c for i, c in enumerate(sorted_text)}
        return text_to_int, int_to_text

    def _get_most_common_token_pair(self, token_pairs):
        counter = Counter(token_pairs)
        most_freq = counter.most_common(1)[0][0]
        return most_freq
    
    def _replace_token_pairs(self, tokens, target_pair, new_token):
        """
        Replace all occurrences of `target_pair` in `tokens` by `new_token`.
        """
        new_tokens = []
        i = 0
        while i < len(tokens) - 1:
            if (tokens[i], tokens[i+1]) == target_pair:
                new_tokens.append(new_token)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        if i < len(tokens):
            new_tokens.append(tokens[i])
        return new_tokens

    def encode(self, text):
        tokens = [self.text_to_int[char] for char in text]
        for key in self.merged_dict:
            tokens = self._replace_token_pairs(tokens, key, self.merged_dict[key])
        return tokens
            
    
    def decode(self, tokens):
        return "".join([self.int_to_text[i] for i in tokens])


In [8]:
if tokenizer_name == "BasicTokenizer":
    tokenizer = BasicTokenizer()
    tokenizer.train(train_text, max_vocab_size=max_vocab_size, verbose=False)

    text = "Bonjour le monde, comment va ta journée?"
    code = tokenizer.encode(text)
    print(f"len original text: {len(text)}, len bpe text: {len(code)}") 
    print(code)
    print(tokenizer.decode(code))

# Step 2
Improve the BPE from step 1 by splitting first the text into words and then applying the BPE to each word. This will avoid having tokens that are composed of multiple words or heterogeneous (word + number for example)

In [9]:
class RegexTokenizer():
    def __init__(self):
        self.text_to_int = {}  # {text: int}
        self.int_to_text = {}  # {int: text}
        self.merged_dict = OrderedDict()  # {(token, token): token}
        
    def train(self, text, max_vocab_size, verbose=False, show_progress=False):
        self.text_to_int, self.int_to_text = self._create_initial_vocabulary(text)
        self.current_vocab_size = len(self.text_to_int)
        
        tokens_splits = self.encode(text)
        
        if verbose: 
            self.print_stats(tokens_splits)
            
        if self.current_vocab_size < max_vocab_size:
            iter_count = max_vocab_size - self.current_vocab_size
            for _ in trange(iter_count, disable=not show_progress):
                token_pairs = []
                for tokens_split in tokens_splits:
                    if len(tokens_split) > 1:
                        token_pairs.extend(self._get_pairs(tokens_split))
                        
                most_freq = self._get_most_common_token_pair(token_pairs)
                
                str_pair = ''.join([self.int_to_text[most_freq[0]], self.int_to_text[most_freq[1]]])
                self.text_to_int[str_pair] = self.current_vocab_size
                self.int_to_text[self.current_vocab_size] = str_pair
                self.merged_dict[most_freq] = self.text_to_int[str_pair]
                
                new_tokens_splits = [self._replace_token_pairs(t, most_freq, self.current_vocab_size) for t in tokens_splits]
                tokens_splits = new_tokens_splits
                self.current_vocab_size += 1
                
                if verbose: 
                    self.print_stats(tokens_splits, str_pair, most_freq)

    def print_stats(self, tokens_splits, str_pair="", token_pair=None):
        print("Updated vocab size:", self.current_vocab_size, 
              "Updated sequence length:", sum([len(split) for split in tokens_splits]),
              "New vocab:", str_pair, 
              "Token pair:", token_pair)
        
    def _get_pairs(self, tokens):
        return list(zip(tokens, tokens[1:]))
    
    def _create_initial_vocabulary(self, text):
        sorted_text = sorted(set(text))
        text_to_int = {c: i for i, c in enumerate(sorted_text)}
        int_to_text = {i: c for i, c in enumerate(sorted_text)}
        return text_to_int, int_to_text

    def _get_most_common_token_pair(self, token_pairs):
        counter = Counter(token_pairs)
        most_freq = counter.most_common(1)[0][0]
        return most_freq
    
    def _replace_token_pairs(self, tokens, target_pair, new_token):
        """
        Replace all occurrences of `target_pair` in `tokens` by `new_token`.
        """
        new_tokens = []
        i = 0
        while i < len(tokens) - 1:
            if (tokens[i], tokens[i+1]) == target_pair:
                new_tokens.append(new_token)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        if i < len(tokens):
            new_tokens.append(tokens[i])
        return new_tokens

    def _simple_encode(self, text):
        
        tokens = [self.text_to_int[char] for char in text]
        for key in self.merged_dict:
            tokens = self._replace_token_pairs(tokens, key, self.merged_dict[key])
        return tokens
    
    def encode(self, text):
        text_splits = self._split_text(text)
        tokens_splits = [self._simple_encode(text_split) for text_split in text_splits]
        return tokens_splits
        
    def _split_text(self, text):
        split_text = re.findall(GPT4_SPLIT_PATTERN, text)
        return split_text
    
    def decode(self, tokens_splits):
        texts = []
        for tokens in tokens_splits:
            texts.append("".join([self.int_to_text[i] for i in tokens]))
        return "".join(texts)
    
    def save_vocab(self, path):
        int_to_text = self.int_to_text
        with open(path, "w") as f:
            json.dump(int_to_text, f)


In [10]:
if tokenizer_name == "RegexTokenizer":
    GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
    tokenizer = RegexTokenizer()
    tokenizer.train(train_text, max_vocab_size=max_vocab_size, verbose=True, show_progress=True)
    tokenizer.save_vocab("RegexTokenizerVocab.txt")

    text = "Bonjour le monde, comment va ta journée?"
    code = tokenizer.encode(text)
    print(f"len original text: {len(text)}, len bpe text: {sum([len(c) for c in code])}")
    print(code)
    print(tokenizer.decode(code))

In [11]:
if tokenizer_name == "RegexTokenizer":
    tokenized_text = tokenizer.encode(train_text)
    flat_tokenized_text = [item for sublist in tokenized_text for item in tqdm(sublist)]  # flatten code
    counter = Counter(flat_tokenized_text)
    counter

In [12]:
if tokenizer_name == "RegexTokenizer":
    print([(v, tokenizer.int_to_text[v], count) for v, count in counter.most_common()])

# Additional test
Remove least common characters from the dictionary (below a threshold value) before merging characters  
This step will help to have a much smaller dictionary and will speed up the process  
Need to create a fallback in case of no common characters is found during the encoding

In [13]:
class MostCommonRegexTokenizer():
    def __init__(self):
        self.text_to_int = {UNK_TOKEN: 0}  # {text: int}
        self.int_to_text = {0: UNK_TOKEN}  # {int: text}
        self.merged_dict = OrderedDict()  # {(token, token): token}
        
    def train(self, text, max_vocab_size, min_freq, max_single_char_vocab_size, verbose=False, show_progress=False):
        self.text_size = len(text)
        self.char_counter = Counter(text)
        filtered_text = self.remove_least_common_characters(text, min_freq)
        text_to_int, int_to_text = self._create_initial_vocabulary(filtered_text, max_single_char_vocab_size)
        self.text_to_int.update(text_to_int)
        self.int_to_text.update(int_to_text)
        self.current_vocab_size = len(self.text_to_int)
        
        tokens_splits = self.encode(filtered_text)
        
        if verbose: 
            self._print_stats(tokens_splits)
        
        print('Creating merged dictionary...')
        if self.current_vocab_size < max_vocab_size:
            iter_count = max_vocab_size - self.current_vocab_size
            for _ in trange(iter_count, disable=not show_progress):
                token_pairs = []
                for tokens_split in tokens_splits:
                    if len(tokens_split) > 1:
                        token_pairs.extend(self._get_pairs(tokens_split))
                        
                most_freq = self._get_most_common_token_pair(token_pairs)
                
                str_pair = ''.join([self.int_to_text[most_freq[0]], self.int_to_text[most_freq[1]]])
                self.text_to_int[str_pair] = self.current_vocab_size
                self.int_to_text[self.current_vocab_size] = str_pair
                self.merged_dict[most_freq] = self.text_to_int[str_pair]
                
                new_tokens_splits = [self._replace_token_pairs(t, most_freq, self.current_vocab_size) for t in tokens_splits]
                tokens_splits = new_tokens_splits
                self.current_vocab_size += 1
                
                if verbose: 
                    self._print_stats(tokens_splits, str_pair, most_freq)

    def _print_stats(self, tokens_splits, str_pair="", token_pair=None):
        current_text_size = sum([len(split) for split in tokens_splits])
        print(f"Updated vocab size: {self.current_vocab_size} "
              f"Updated sequence length: {current_text_size} "
              f"New vocab: {repr(str_pair)} "
              f"Token pair: {token_pair} "
              f"Compression ratio: {self.text_size / current_text_size - 1:.2f}% "
             )
        
    def _get_pairs(self, tokens):
        """ Returns a list of tuples containing pairs of adjacent tokens from the input tokens list"""
        return list(zip(tokens, tokens[1:]))
    
    def _create_initial_vocabulary(self, text, max_single_char_vocab_size):        
        print("Creating initial vocabulary...")
        sorted_char = sorted(set(text))
        most_common_chars = [char for char, count in self.char_counter.most_common(max_single_char_vocab_size)]
        filtered_sorted_char = [c for c in sorted_char if c in most_common_chars]
        start_id = len(self.text_to_int)
        text_to_int = {c: start_id+i for i, c in enumerate(filtered_sorted_char)}
        int_to_text = {start_id+i: c for i, c in enumerate(filtered_sorted_char)}
        return text_to_int, int_to_text

    def _get_most_common_token_pair(self, token_pairs):
        """Gets the most frequent token pair from the given list of token pairs.
    
        This ignores token pairs that contain <UNK> tokens. It will raise a ValueError if no
        valid token pairs are found.
        
        Args:
            token_pairs: List of token id pairs
        
        Returns:
            The most frequent token id pair
        """
        counter = Counter(token_pairs)
        # Get the first most frequent pair (excluding <UNK>)
        for pair, freq in counter.most_common():
            if pair[0]!= 0 and pair[1]!= 0:
                return pair
        else:
            raise ValueError("No pairs found")
    

    def _replace_token_pairs(self, tokens, target_pair, new_token):
        """
        Replaces occurrences of a given token pair in the input 
        token list with a new token.
        
        Iterates through tokens checking for the target pair. 
        When found, appends the new token to the output and 
        advances the index by 2 to skip the pair. Otherwise just 
        appends the current token.
        
        Returns the new list of tokens.
        """
        new_tokens = []
        i = 0
        while i < len(tokens) - 1:
            if (tokens[i], tokens[i+1]) == target_pair:
                new_tokens.append(new_token)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        if i < len(tokens):
            new_tokens.append(tokens[i])
        return new_tokens


    def _simple_encode(self, text):
        """Encodes the given text into a list of token IDs.
    
        This uses a simple encoding scheme of looking up each character in the 
        text_to_int mapping and falling back to the unknown token ID if not found.
        
        It then iterates through the tokens, looking for any token pairs that match
        the merged_dict and replacing them with the merged token ID.
        
        Args:
            text: The text to encode
        
        Returns: 
            The list of encoded token IDs.
        """
        tokens = [self.text_to_int.get(char, 0) for char in text]  # fallback to <UNK> when not found
        for key in self.merged_dict:
            tokens = self._replace_token_pairs(tokens, key, self.merged_dict[key])
        return tokens
    
    def encode(self, text):
        print("Encoding...")
        text_splits = self._split_text(text)
        tokens_splits = [self._simple_encode(text_split) for text_split in tqdm(text_splits)]
        return tokens_splits
        
    def _split_text(self, text):
        split_text = re.findall(GPT4_SPLIT_PATTERN, text)
        return split_text
    
    def decode(self, tokens_splits):
        print("Decoding...")
        texts = []
        for tokens in tokens_splits:
            texts.append("".join([self.int_to_text[i] for i in tokens]))
        return "".join(texts)
    
    def save_vocab(self, path):
        print("Saving vocab...")
        int_to_text = self.int_to_text
        with open(path, "w") as f:
            json.dump(int_to_text, f)
                
    def remove_least_common_characters(self, text, min_freq):
        """
        Remove characters that appear less than `min_freq` in `text` using regex.
        """
        print("Removing least common characters...")
        least_common_chars = {c for c in self.char_counter if self.char_counter[c] < min_freq}
        if least_common_chars:  
            pattern = '[' + re.escape(''.join(least_common_chars)) + ']'
            filtered_text = re.sub(pattern, '', text)
            return filtered_text
        else:
            return text


In [14]:
%%time
if tokenizer_name == "MostCommonRegexTokenizer":
    GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
    tokenizer = MostCommonRegexTokenizer()
    tokenizer.train(
        text=train_text, 
        max_vocab_size=max_vocab_size, 
        min_freq=min_freq, 
        max_single_char_vocab_size=max_single_char_vocab_size,
        verbose=True, 
        show_progress=True)
    tokenizer.save_vocab("MostCommonRegexTokenizer.txt")

    text = "Bonjour le monde, comment va ta journée?"
    code = tokenizer.encode(text)
    print(f"len original text: {len(text)}, len bpe text: {sum([len(c) for c in code])}")
    print(code)
    print(tokenizer.decode(code))

Removing least common characters...
Creating initial vocabulary...
Encoding...


100%|██████████| 556316/556316 [00:01<00:00, 304702.66it/s]


Updated vocab size: 111 Updated sequence length: 2720406 New vocab: '' Token pair: None Compression ratio: 0.00% 
Creating merged dictionary...


  0%|          | 1/389 [00:02<18:05,  2.80s/it]

Updated vocab size: 112 Updated sequence length: 2662698 New vocab: ' d' Token pair: (2, 58) Compression ratio: 0.02% 


  1%|          | 2/389 [00:05<17:49,  2.76s/it]

Updated vocab size: 113 Updated sequence length: 2617658 New vocab: 'es' Token pair: (59, 73) Compression ratio: 0.04% 


  1%|          | 3/389 [00:08<17:50,  2.77s/it]

Updated vocab size: 114 Updated sequence length: 2580341 New vocab: ' l' Token pair: (2, 66) Compression ratio: 0.06% 


  1%|          | 4/389 [00:10<17:11,  2.68s/it]

Updated vocab size: 115 Updated sequence length: 2544761 New vocab: 'en' Token pair: (59, 68) Compression ratio: 0.07% 


  1%|▏         | 5/389 [00:13<17:23,  2.72s/it]

Updated vocab size: 116 Updated sequence length: 2510912 New vocab: 'on' Token pair: (69, 68) Compression ratio: 0.08% 


  2%|▏         | 6/389 [00:15<16:27,  2.58s/it]

Updated vocab size: 117 Updated sequence length: 2482634 New vocab: 'an' Token pair: (55, 68) Compression ratio: 0.10% 


  2%|▏         | 7/389 [00:18<16:40,  2.62s/it]

Updated vocab size: 118 Updated sequence length: 2455570 New vocab: ' de' Token pair: (111, 59) Compression ratio: 0.11% 


  2%|▏         | 8/389 [00:20<15:48,  2.49s/it]

Updated vocab size: 119 Updated sequence length: 2430038 New vocab: ' p' Token pair: (2, 70) Compression ratio: 0.12% 


  2%|▏         | 9/389 [00:23<16:04,  2.54s/it]

Updated vocab size: 120 Updated sequence length: 2408176 New vocab: 're' Token pair: (72, 59) Compression ratio: 0.13% 


  3%|▎         | 10/389 [00:25<15:54,  2.52s/it]

Updated vocab size: 121 Updated sequence length: 2387056 New vocab: 'ti' Token pair: (74, 63) Compression ratio: 0.14% 


  3%|▎         | 11/389 [00:28<15:14,  2.42s/it]

Updated vocab size: 122 Updated sequence length: 2366539 New vocab: ' s' Token pair: (2, 73) Compression ratio: 0.15% 


  3%|▎         | 12/389 [00:30<15:33,  2.48s/it]

Updated vocab size: 123 Updated sequence length: 2346456 New vocab: 'er' Token pair: (59, 72) Compression ratio: 0.16% 


  3%|▎         | 13/389 [00:33<15:29,  2.47s/it]

Updated vocab size: 124 Updated sequence length: 2326502 New vocab: ' c' Token pair: (2, 57) Compression ratio: 0.17% 


  4%|▎         | 14/389 [00:35<14:52,  2.38s/it]

Updated vocab size: 125 Updated sequence length: 2306909 New vocab: 'is' Token pair: (63, 73) Compression ratio: 0.18% 


  4%|▍         | 15/389 [00:38<15:15,  2.45s/it]

Updated vocab size: 126 Updated sequence length: 2289065 New vocab: 'in' Token pair: (63, 68) Compression ratio: 0.19% 


  4%|▍         | 16/389 [00:40<14:34,  2.35s/it]

Updated vocab size: 127 Updated sequence length: 2271253 New vocab: 'ar' Token pair: (55, 72) Compression ratio: 0.20% 


  4%|▍         | 17/389 [00:42<14:52,  2.40s/it]

Updated vocab size: 128 Updated sequence length: 2254026 New vocab: 'ou' Token pair: (69, 75) Compression ratio: 0.21% 


  5%|▍         | 18/389 [00:44<14:44,  2.38s/it]

Updated vocab size: 129 Updated sequence length: 2237494 New vocab: ' a' Token pair: (2, 55) Compression ratio: 0.22% 


  5%|▍         | 19/389 [00:47<14:36,  2.37s/it]

Updated vocab size: 130 Updated sequence length: 2222973 New vocab: 'et' Token pair: (59, 74) Compression ratio: 0.22% 


  5%|▌         | 20/389 [00:49<14:00,  2.28s/it]

Updated vocab size: 131 Updated sequence length: 2209004 New vocab: 'qu' Token pair: (71, 75) Compression ratio: 0.23% 


  5%|▌         | 21/389 [00:51<14:25,  2.35s/it]

Updated vocab size: 132 Updated sequence length: 2195336 New vocab: 'le' Token pair: (66, 59) Compression ratio: 0.24% 


  6%|▌         | 22/389 [00:54<14:19,  2.34s/it]

Updated vocab size: 133 Updated sequence length: 2182097 New vocab: 'ent' Token pair: (114, 74) Compression ratio: 0.25% 


  6%|▌         | 23/389 [00:56<13:44,  2.25s/it]

Updated vocab size: 134 Updated sequence length: 2169912 New vocab: 'it' Token pair: (63, 74) Compression ratio: 0.25% 


  6%|▌         | 24/389 [00:58<14:09,  2.33s/it]

Updated vocab size: 135 Updated sequence length: 2158101 New vocab: ' la' Token pair: (113, 55) Compression ratio: 0.26% 


  6%|▋         | 25/389 [01:01<14:00,  2.31s/it]

Updated vocab size: 136 Updated sequence length: 2146714 New vocab: 'or' Token pair: (69, 72) Compression ratio: 0.27% 


  7%|▋         | 26/389 [01:03<13:33,  2.24s/it]

Updated vocab size: 137 Updated sequence length: 2135406 New vocab: 'un' Token pair: (75, 68) Compression ratio: 0.28% 


  7%|▋         | 27/389 [01:05<14:02,  2.33s/it]

Updated vocab size: 138 Updated sequence length: 2124404 New vocab: '\n\n' Token pair: (1, 1) Compression ratio: 0.28% 


  7%|▋         | 28/389 [01:07<13:25,  2.23s/it]

Updated vocab size: 139 Updated sequence length: 2113821 New vocab: 'ur' Token pair: (75, 72) Compression ratio: 0.29% 


  7%|▋         | 29/389 [01:10<13:47,  2.30s/it]

Updated vocab size: 140 Updated sequence length: 2103589 New vocab: ' et' Token pair: (2, 129) Compression ratio: 0.29% 


  8%|▊         | 30/389 [01:12<13:13,  2.21s/it]

Updated vocab size: 141 Updated sequence length: 2093441 New vocab: 'te' Token pair: (74, 59) Compression ratio: 0.30% 


  8%|▊         | 31/389 [01:14<13:40,  2.29s/it]

Updated vocab size: 142 Updated sequence length: 2083398 New vocab: ' m' Token pair: (2, 67) Compression ratio: 0.31% 


  8%|▊         | 32/389 [01:16<13:36,  2.29s/it]

Updated vocab size: 143 Updated sequence length: 2073715 New vocab: 'il' Token pair: (63, 66) Compression ratio: 0.31% 


  8%|▊         | 33/389 [01:18<12:59,  2.19s/it]

Updated vocab size: 144 Updated sequence length: 2064381 New vocab: 'al' Token pair: (55, 66) Compression ratio: 0.32% 


  9%|▊         | 34/389 [01:21<13:22,  2.26s/it]

Updated vocab size: 145 Updated sequence length: 2055227 New vocab: ' le' Token pair: (113, 59) Compression ratio: 0.32% 


  9%|▉         | 35/389 [01:23<13:16,  2.25s/it]

Updated vocab size: 146 Updated sequence length: 2046207 New vocab: 'tion' Token pair: (120, 115) Compression ratio: 0.33% 


  9%|▉         | 36/389 [01:25<12:45,  2.17s/it]

Updated vocab size: 147 Updated sequence length: 2037697 New vocab: 'om' Token pair: (69, 67) Compression ratio: 0.34% 


 10%|▉         | 37/389 [01:27<13:08,  2.24s/it]

Updated vocab size: 148 Updated sequence length: 2029234 New vocab: ' en' Token pair: (2, 114) Compression ratio: 0.34% 


 10%|▉         | 38/389 [01:29<12:32,  2.14s/it]

Updated vocab size: 149 Updated sequence length: 2020872 New vocab: ' f' Token pair: (2, 60) Compression ratio: 0.35% 


 10%|█         | 39/389 [01:32<12:58,  2.23s/it]

Updated vocab size: 150 Updated sequence length: 2013059 New vocab: ' à' Token pair: (2, 89) Compression ratio: 0.35% 


 10%|█         | 40/389 [01:34<12:28,  2.15s/it]

Updated vocab size: 151 Updated sequence length: 2005277 New vocab: 'ra' Token pair: (72, 55) Compression ratio: 0.36% 


 11%|█         | 41/389 [01:36<13:01,  2.25s/it]

Updated vocab size: 152 Updated sequence length: 1997785 New vocab: 'el' Token pair: (59, 66) Compression ratio: 0.36% 


 11%|█         | 42/389 [01:38<12:24,  2.14s/it]

Updated vocab size: 153 Updated sequence length: 1990705 New vocab: 'ri' Token pair: (72, 63) Compression ratio: 0.37% 


 11%|█         | 43/389 [01:40<12:46,  2.21s/it]

Updated vocab size: 154 Updated sequence length: 1983658 New vocab: ' L' Token pair: (2, 38) Compression ratio: 0.37% 


 11%|█▏        | 44/389 [01:42<12:11,  2.12s/it]

Updated vocab size: 155 Updated sequence length: 1976717 New vocab: ' t' Token pair: (2, 74) Compression ratio: 0.38% 


 12%|█▏        | 45/389 [01:45<12:38,  2.20s/it]

Updated vocab size: 156 Updated sequence length: 1969950 New vocab: ' un' Token pair: (2, 136) Compression ratio: 0.38% 


 12%|█▏        | 46/389 [01:47<12:04,  2.11s/it]

Updated vocab size: 157 Updated sequence length: 1963251 New vocab: 'us' Token pair: (75, 73) Compression ratio: 0.39% 


 12%|█▏        | 47/389 [01:49<12:26,  2.18s/it]

Updated vocab size: 158 Updated sequence length: 1956671 New vocab: 'ré' Token pair: (72, 94) Compression ratio: 0.39% 


 12%|█▏        | 48/389 [01:51<12:19,  2.17s/it]

Updated vocab size: 159 Updated sequence length: 1950287 New vocab: ' des' Token pair: (111, 112) Compression ratio: 0.40% 


 13%|█▎        | 49/389 [01:53<11:47,  2.08s/it]

Updated vocab size: 160 Updated sequence length: 1944065 New vocab: 'que' Token pair: (130, 59) Compression ratio: 0.40% 


 13%|█▎        | 50/389 [01:55<12:12,  2.16s/it]

Updated vocab size: 161 Updated sequence length: 1937911 New vocab: ' du' Token pair: (111, 75) Compression ratio: 0.41% 


 13%|█▎        | 51/389 [01:57<11:40,  2.07s/it]

Updated vocab size: 162 Updated sequence length: 1931814 New vocab: 'est' Token pair: (112, 74) Compression ratio: 0.41% 


 13%|█▎        | 52/389 [02:00<12:02,  2.14s/it]

Updated vocab size: 163 Updated sequence length: 1925743 New vocab: ' C' Token pair: (2, 29) Compression ratio: 0.41% 


 14%|█▎        | 53/389 [02:02<11:57,  2.14s/it]

Updated vocab size: 164 Updated sequence length: 1919893 New vocab: 'our' Token pair: (127, 72) Compression ratio: 0.42% 


 14%|█▍        | 54/389 [02:03<11:25,  2.05s/it]

Updated vocab size: 165 Updated sequence length: 1914059 New vocab: '.\n\n' Token pair: (12, 137) Compression ratio: 0.42% 


 14%|█▍        | 55/389 [02:06<11:51,  2.13s/it]

Updated vocab size: 166 Updated sequence length: 1908232 New vocab: ' (' Token pair: (2, 8) Compression ratio: 0.43% 


 14%|█▍        | 56/389 [02:08<11:26,  2.06s/it]

Updated vocab size: 167 Updated sequence length: 1902452 New vocab: ' S' Token pair: (2, 45) Compression ratio: 0.43% 


 15%|█▍        | 57/389 [02:10<11:47,  2.13s/it]

Updated vocab size: 168 Updated sequence length: 1896708 New vocab: 'ro' Token pair: (72, 69) Compression ratio: 0.44% 


 15%|█▍        | 58/389 [02:12<11:14,  2.04s/it]

Updated vocab size: 169 Updated sequence length: 1890983 New vocab: 'ment' Token pair: (67, 132) Compression ratio: 0.44% 


 15%|█▌        | 59/389 [02:14<11:11,  2.04s/it]

Updated vocab size: 170 Updated sequence length: 1885286 New vocab: 'ant' Token pair: (116, 74) Compression ratio: 0.44% 


 15%|█▌        | 60/389 [02:16<11:33,  2.11s/it]

Updated vocab size: 171 Updated sequence length: 1879593 New vocab: 'ie' Token pair: (63, 59) Compression ratio: 0.45% 


 16%|█▌        | 61/389 [02:18<11:27,  2.09s/it]

Updated vocab size: 172 Updated sequence length: 1874009 New vocab: ' n' Token pair: (2, 68) Compression ratio: 0.45% 


 16%|█▌        | 62/389 [02:20<10:56,  2.01s/it]

Updated vocab size: 173 Updated sequence length: 1868511 New vocab: 'ch' Token pair: (57, 62) Compression ratio: 0.46% 


 16%|█▌        | 63/389 [02:22<11:17,  2.08s/it]

Updated vocab size: 174 Updated sequence length: 1863061 New vocab: 'ans' Token pair: (116, 73) Compression ratio: 0.46% 


 16%|█▋        | 64/389 [02:24<11:13,  2.07s/it]

Updated vocab size: 175 Updated sequence length: 1857614 New vocab: 'ce' Token pair: (57, 59) Compression ratio: 0.47% 


 17%|█▋        | 65/389 [02:26<10:44,  1.99s/it]

Updated vocab size: 176 Updated sequence length: 1852267 New vocab: ' par' Token pair: (118, 126) Compression ratio: 0.47% 


 17%|█▋        | 66/389 [02:28<11:07,  2.07s/it]

Updated vocab size: 177 Updated sequence length: 1846969 New vocab: ' M' Token pair: (2, 39) Compression ratio: 0.47% 


 17%|█▋        | 67/389 [02:30<10:37,  1.98s/it]

Updated vocab size: 178 Updated sequence length: 1841749 New vocab: ' :' Token pair: (2, 24) Compression ratio: 0.48% 


 17%|█▋        | 68/389 [02:32<10:58,  2.05s/it]

Updated vocab size: 179 Updated sequence length: 1836601 New vocab: '19' Token pair: (15, 23) Compression ratio: 0.48% 


 18%|█▊        | 69/389 [02:34<10:56,  2.05s/it]

Updated vocab size: 180 Updated sequence length: 1831506 New vocab: 'au' Token pair: (55, 75) Compression ratio: 0.49% 


 18%|█▊        | 70/389 [02:36<10:27,  1.97s/it]

Updated vocab size: 181 Updated sequence length: 1826502 New vocab: 'ol' Token pair: (69, 66) Compression ratio: 0.49% 


 18%|█▊        | 71/389 [02:38<10:28,  1.98s/it]

Updated vocab size: 182 Updated sequence length: 1821508 New vocab: 'at' Token pair: (55, 74) Compression ratio: 0.49% 


 19%|█▊        | 72/389 [02:40<10:45,  2.03s/it]

Updated vocab size: 183 Updated sequence length: 1816578 New vocab: ' les' Token pair: (113, 112) Compression ratio: 0.50% 


 19%|█▉        | 73/389 [02:42<10:40,  2.03s/it]

Updated vocab size: 184 Updated sequence length: 1811651 New vocab: ' P' Token pair: (2, 42) Compression ratio: 0.50% 


 19%|█▉        | 74/389 [02:44<10:37,  2.02s/it]

Updated vocab size: 185 Updated sequence length: 1806786 New vocab: ' v' Token pair: (2, 76) Compression ratio: 0.51% 


 19%|█▉        | 75/389 [02:46<10:09,  1.94s/it]

Updated vocab size: 186 Updated sequence length: 1801935 New vocab: ' é' Token pair: (2, 94) Compression ratio: 0.51% 


 20%|█▉        | 76/389 [02:48<10:31,  2.02s/it]

Updated vocab size: 187 Updated sequence length: 1797143 New vocab: 'ne' Token pair: (68, 59) Compression ratio: 0.52% 


 20%|█▉        | 77/389 [02:50<10:02,  1.93s/it]

Updated vocab size: 188 Updated sequence length: 1792367 New vocab: ' au' Token pair: (128, 75) Compression ratio: 0.52% 


 20%|██        | 78/389 [02:52<10:23,  2.00s/it]

Updated vocab size: 189 Updated sequence length: 1787599 New vocab: 'me' Token pair: (67, 59) Compression ratio: 0.52% 


 20%|██        | 79/389 [02:54<10:20,  2.00s/it]

Updated vocab size: 190 Updated sequence length: 1782831 New vocab: 'as' Token pair: (55, 73) Compression ratio: 0.53% 


 21%|██        | 80/389 [02:56<09:52,  1.92s/it]

Updated vocab size: 191 Updated sequence length: 1778287 New vocab: ' est' Token pair: (2, 161) Compression ratio: 0.53% 


 21%|██        | 81/389 [02:58<10:13,  1.99s/it]

Updated vocab size: 192 Updated sequence length: 1773913 New vocab: ' \n' Token pair: (2, 1) Compression ratio: 0.54% 


 21%|██        | 82/389 [03:00<09:45,  1.91s/it]

Updated vocab size: 193 Updated sequence length: 1769578 New vocab: 'ont' Token pair: (115, 74) Compression ratio: 0.54% 


 21%|██▏       | 83/389 [03:02<10:05,  1.98s/it]

Updated vocab size: 194 Updated sequence length: 1765282 New vocab: 'ation' Token pair: (55, 145) Compression ratio: 0.54% 


 22%|██▏       | 84/389 [03:04<10:02,  1.98s/it]

Updated vocab size: 195 Updated sequence length: 1761171 New vocab: 'si' Token pair: (73, 63) Compression ratio: 0.55% 


 22%|██▏       | 85/389 [03:06<09:35,  1.89s/it]

Updated vocab size: 196 Updated sequence length: 1757070 New vocab: ' b' Token pair: (2, 56) Compression ratio: 0.55% 


 22%|██▏       | 86/389 [03:08<09:55,  1.97s/it]

Updated vocab size: 197 Updated sequence length: 1753005 New vocab: ' B' Token pair: (2, 28) Compression ratio: 0.55% 


 22%|██▏       | 87/389 [03:10<09:55,  1.97s/it]

Updated vocab size: 198 Updated sequence length: 1748972 New vocab: ' A' Token pair: (2, 27) Compression ratio: 0.56% 


 23%|██▎       | 88/389 [03:11<09:27,  1.89s/it]

Updated vocab size: 199 Updated sequence length: 1744943 New vocab: 'ci' Token pair: (57, 63) Compression ratio: 0.56% 


 23%|██▎       | 89/389 [03:14<09:46,  1.96s/it]

Updated vocab size: 200 Updated sequence length: 1740995 New vocab: 'se' Token pair: (73, 59) Compression ratio: 0.56% 


 23%|██▎       | 90/389 [03:15<09:20,  1.87s/it]

Updated vocab size: 201 Updated sequence length: 1737097 New vocab: 'ée' Token pair: (94, 59) Compression ratio: 0.57% 


 23%|██▎       | 91/389 [03:17<09:38,  1.94s/it]

Updated vocab size: 202 Updated sequence length: 1733217 New vocab: '20' Token pair: (16, 14) Compression ratio: 0.57% 


 24%|██▎       | 92/389 [03:19<09:33,  1.93s/it]

Updated vocab size: 203 Updated sequence length: 1729353 New vocab: 've' Token pair: (76, 59) Compression ratio: 0.57% 


 24%|██▍       | 93/389 [03:21<09:08,  1.85s/it]

Updated vocab size: 204 Updated sequence length: 1725543 New vocab: ' qu' Token pair: (2, 130) Compression ratio: 0.58% 


 24%|██▍       | 94/389 [03:23<09:28,  1.93s/it]

Updated vocab size: 205 Updated sequence length: 1721741 New vocab: 'ais' Token pair: (55, 124) Compression ratio: 0.58% 


 24%|██▍       | 95/389 [03:25<09:04,  1.85s/it]

Updated vocab size: 206 Updated sequence length: 1718065 New vocab: ' g' Token pair: (2, 61) Compression ratio: 0.58% 


 25%|██▍       | 96/389 [03:27<09:23,  1.92s/it]

Updated vocab size: 207 Updated sequence length: 1714444 New vocab: 'res' Token pair: (72, 112) Compression ratio: 0.59% 


 25%|██▍       | 97/389 [03:28<08:57,  1.84s/it]

Updated vocab size: 208 Updated sequence length: 1710840 New vocab: 'de' Token pair: (58, 59) Compression ratio: 0.59% 


 25%|██▌       | 98/389 [03:30<09:17,  1.92s/it]

Updated vocab size: 209 Updated sequence length: 1707253 New vocab: 'ag' Token pair: (55, 61) Compression ratio: 0.59% 


 25%|██▌       | 99/389 [03:32<08:52,  1.84s/it]

Updated vocab size: 210 Updated sequence length: 1703746 New vocab: 'ire' Token pair: (63, 119) Compression ratio: 0.60% 


 26%|██▌       | 100/389 [03:35<09:39,  2.01s/it]

Updated vocab size: 211 Updated sequence length: 1700290 New vocab: ' com' Token pair: (123, 146) Compression ratio: 0.60% 


 26%|██▌       | 101/389 [03:37<10:01,  2.09s/it]

Updated vocab size: 212 Updated sequence length: 1696838 New vocab: 'li' Token pair: (66, 63) Compression ratio: 0.60% 


 26%|██▌       | 102/389 [03:39<10:29,  2.19s/it]

Updated vocab size: 213 Updated sequence length: 1693395 New vocab: ' dans' Token pair: (111, 173) Compression ratio: 0.61% 


 26%|██▋       | 103/389 [03:41<10:22,  2.18s/it]

Updated vocab size: 214 Updated sequence length: 1689982 New vocab: ' D' Token pair: (2, 30) Compression ratio: 0.61% 


 27%|██▋       | 104/389 [03:44<10:44,  2.26s/it]

Updated vocab size: 215 Updated sequence length: 1686576 New vocab: ' dé' Token pair: (111, 94) Compression ratio: 0.61% 


 27%|██▋       | 105/389 [03:46<10:11,  2.15s/it]

Updated vocab size: 216 Updated sequence length: 1683190 New vocab: 'am' Token pair: (55, 67) Compression ratio: 0.62% 


 27%|██▋       | 106/389 [03:47<09:27,  2.00s/it]

Updated vocab size: 217 Updated sequence length: 1680028 New vocab: 'té' Token pair: (74, 94) Compression ratio: 0.62% 


 28%|██▊       | 107/389 [03:49<09:31,  2.03s/it]

Updated vocab size: 218 Updated sequence length: 1676870 New vocab: ' re' Token pair: (2, 119) Compression ratio: 0.62% 


 28%|██▊       | 108/389 [03:51<09:19,  1.99s/it]

Updated vocab size: 219 Updated sequence length: 1673745 New vocab: ' T' Token pair: (2, 46) Compression ratio: 0.63% 


 28%|██▊       | 109/389 [03:53<08:47,  1.88s/it]

Updated vocab size: 220 Updated sequence length: 1670658 New vocab: ' une' Token pair: (155, 59) Compression ratio: 0.63% 


 28%|██▊       | 110/389 [03:55<09:03,  1.95s/it]

Updated vocab size: 221 Updated sequence length: 1667627 New vocab: 'ran' Token pair: (72, 116) Compression ratio: 0.63% 


 29%|██▊       | 111/389 [03:57<08:36,  1.86s/it]

Updated vocab size: 222 Updated sequence length: 1664643 New vocab: ' R' Token pair: (2, 44) Compression ratio: 0.64% 


 29%|██▉       | 112/389 [03:59<08:31,  1.85s/it]

Updated vocab size: 223 Updated sequence length: 1661727 New vocab: 'ain' Token pair: (55, 125) Compression ratio: 0.64% 


 29%|██▉       | 113/389 [04:01<08:45,  1.91s/it]

Updated vocab size: 224 Updated sequence length: 1658835 New vocab: ' ré' Token pair: (2, 157) Compression ratio: 0.64% 


 29%|██▉       | 114/389 [04:03<08:41,  1.90s/it]

Updated vocab size: 225 Updated sequence length: 1655986 New vocab: ' con' Token pair: (123, 115) Compression ratio: 0.64% 


 30%|██▉       | 115/389 [04:04<08:38,  1.89s/it]

Updated vocab size: 226 Updated sequence length: 1653172 New vocab: 'di' Token pair: (58, 63) Compression ratio: 0.65% 


 30%|██▉       | 116/389 [04:06<08:35,  1.89s/it]

Updated vocab size: 227 Updated sequence length: 1650372 New vocab: ' F' Token pair: (2, 32) Compression ratio: 0.65% 


 30%|███       | 117/389 [04:08<08:12,  1.81s/it]

Updated vocab size: 228 Updated sequence length: 1647676 New vocab: ' G' Token pair: (2, 33) Compression ratio: 0.65% 


 30%|███       | 118/389 [04:10<08:12,  1.82s/it]

Updated vocab size: 229 Updated sequence length: 1644983 New vocab: 'ers' Token pair: (122, 73) Compression ratio: 0.66% 


 31%|███       | 119/389 [04:12<08:27,  1.88s/it]

Updated vocab size: 230 Updated sequence length: 1642348 New vocab: 'ig' Token pair: (63, 61) Compression ratio: 0.66% 


 31%|███       | 120/389 [04:14<08:29,  1.89s/it]

Updated vocab size: 231 Updated sequence length: 1639760 New vocab: ' pour' Token pair: (118, 163) Compression ratio: 0.66% 


 31%|███       | 121/389 [04:16<08:26,  1.89s/it]

Updated vocab size: 232 Updated sequence length: 1637216 New vocab: 'ot' Token pair: (69, 74) Compression ratio: 0.66% 


 31%|███▏      | 122/389 [04:17<08:02,  1.81s/it]

Updated vocab size: 233 Updated sequence length: 1634692 New vocab: ' I' Token pair: (2, 35) Compression ratio: 0.67% 


 32%|███▏      | 123/389 [04:19<08:20,  1.88s/it]

Updated vocab size: 234 Updated sequence length: 1632181 New vocab: ' se' Token pair: (121, 59) Compression ratio: 0.67% 


 32%|███▏      | 124/389 [04:21<08:17,  1.88s/it]

Updated vocab size: 235 Updated sequence length: 1629678 New vocab: 'ist' Token pair: (124, 74) Compression ratio: 0.67% 


 32%|███▏      | 125/389 [04:23<07:55,  1.80s/it]

Updated vocab size: 236 Updated sequence length: 1627207 New vocab: 'ir' Token pair: (63, 72) Compression ratio: 0.67% 


 32%|███▏      | 126/389 [04:25<08:14,  1.88s/it]

Updated vocab size: 237 Updated sequence length: 1624749 New vocab: ' in' Token pair: (2, 125) Compression ratio: 0.68% 


 33%|███▎      | 127/389 [04:26<07:52,  1.80s/it]

Updated vocab size: 238 Updated sequence length: 1622306 New vocab: 'os' Token pair: (69, 73) Compression ratio: 0.68% 


 33%|███▎      | 128/389 [04:28<08:10,  1.88s/it]

Updated vocab size: 239 Updated sequence length: 1619876 New vocab: ' E' Token pair: (2, 31) Compression ratio: 0.68% 


 33%|███▎      | 129/389 [04:30<07:47,  1.80s/it]

Updated vocab size: 240 Updated sequence length: 1617455 New vocab: ' ch' Token pair: (123, 62) Compression ratio: 0.68% 


 33%|███▎      | 130/389 [04:32<08:04,  1.87s/it]

Updated vocab size: 241 Updated sequence length: 1615062 New vocab: ' e' Token pair: (2, 59) Compression ratio: 0.69% 


 34%|███▎      | 131/389 [04:34<07:42,  1.79s/it]

Updated vocab size: 242 Updated sequence length: 1612669 New vocab: ' pl' Token pair: (118, 66) Compression ratio: 0.69% 


 34%|███▍      | 132/389 [04:36<07:58,  1.86s/it]

Updated vocab size: 243 Updated sequence length: 1610324 New vocab: ' J' Token pair: (2, 36) Compression ratio: 0.69% 


 34%|███▍      | 133/389 [04:37<07:36,  1.78s/it]

Updated vocab size: 244 Updated sequence length: 1607981 New vocab: ' j' Token pair: (2, 64) Compression ratio: 0.69% 


 34%|███▍      | 134/389 [04:39<07:54,  1.86s/it]

Updated vocab size: 245 Updated sequence length: 1605690 New vocab: 'ale' Token pair: (55, 131) Compression ratio: 0.70% 


 35%|███▍      | 135/389 [04:41<07:32,  1.78s/it]

Updated vocab size: 246 Updated sequence length: 1603404 New vocab: 'les' Token pair: (66, 112) Compression ratio: 0.70% 


 35%|███▍      | 136/389 [04:43<07:50,  1.86s/it]

Updated vocab size: 247 Updated sequence length: 1601176 New vocab: 'ab' Token pair: (55, 56) Compression ratio: 0.70% 


 35%|███▌      | 137/389 [04:45<07:33,  1.80s/it]

Updated vocab size: 248 Updated sequence length: 1598963 New vocab: 'ac' Token pair: (55, 57) Compression ratio: 0.70% 


 35%|███▌      | 138/389 [04:47<07:52,  1.88s/it]

Updated vocab size: 249 Updated sequence length: 1596760 New vocab: 'mp' Token pair: (67, 70) Compression ratio: 0.71% 


 36%|███▌      | 139/389 [04:49<07:46,  1.87s/it]

Updated vocab size: 250 Updated sequence length: 1594601 New vocab: 'elle' Token pair: (151, 131) Compression ratio: 0.71% 


 36%|███▌      | 140/389 [04:50<07:42,  1.86s/it]

Updated vocab size: 251 Updated sequence length: 1592446 New vocab: ' pro' Token pair: (118, 167) Compression ratio: 0.71% 


 36%|███▌      | 141/389 [04:52<07:21,  1.78s/it]

Updated vocab size: 252 Updated sequence length: 1590294 New vocab: ' H' Token pair: (2, 34) Compression ratio: 0.71% 


 37%|███▋      | 142/389 [04:54<07:41,  1.87s/it]

Updated vocab size: 253 Updated sequence length: 1588145 New vocab: ' sur' Token pair: (121, 138) Compression ratio: 0.71% 


 37%|███▋      | 143/389 [04:56<07:18,  1.78s/it]

Updated vocab size: 254 Updated sequence length: 1586038 New vocab: 'ul' Token pair: (75, 66) Compression ratio: 0.72% 


 37%|███▋      | 144/389 [04:58<07:35,  1.86s/it]

Updated vocab size: 255 Updated sequence length: 1583954 New vocab: 'ren' Token pair: (72, 114) Compression ratio: 0.72% 


 37%|███▋      | 145/389 [04:59<07:13,  1.78s/it]

Updated vocab size: 256 Updated sequence length: 1581875 New vocab: 'ité' Token pair: (133, 94) Compression ratio: 0.72% 


 38%|███▊      | 146/389 [05:01<07:31,  1.86s/it]

Updated vocab size: 257 Updated sequence length: 1579819 New vocab: 'og' Token pair: (69, 61) Compression ratio: 0.72% 


 38%|███▊      | 147/389 [05:03<07:08,  1.77s/it]

Updated vocab size: 258 Updated sequence length: 1577786 New vocab: 'tu' Token pair: (74, 75) Compression ratio: 0.73% 


 38%|███▊      | 148/389 [05:05<07:24,  1.85s/it]

Updated vocab size: 259 Updated sequence length: 1575777 New vocab: 'op' Token pair: (69, 70) Compression ratio: 0.73% 


 38%|███▊      | 149/389 [05:07<07:04,  1.77s/it]

Updated vocab size: 260 Updated sequence length: 1573775 New vocab: 'ter' Token pair: (74, 122) Compression ratio: 0.73% 


 39%|███▊      | 150/389 [05:09<07:20,  1.84s/it]

Updated vocab size: 261 Updated sequence length: 1571787 New vocab: 'ic' Token pair: (63, 57) Compression ratio: 0.73% 


 39%|███▉      | 151/389 [05:10<06:58,  1.76s/it]

Updated vocab size: 262 Updated sequence length: 1569812 New vocab: 'ère' Token pair: (93, 119) Compression ratio: 0.73% 


 39%|███▉      | 152/389 [05:12<07:13,  1.83s/it]

Updated vocab size: 263 Updated sequence length: 1567858 New vocab: ' N' Token pair: (2, 40) Compression ratio: 0.74% 


 39%|███▉      | 153/389 [05:14<07:11,  1.83s/it]

Updated vocab size: 264 Updated sequence length: 1565917 New vocab: 'ait' Token pair: (55, 133) Compression ratio: 0.74% 


 40%|███▉      | 154/389 [05:16<06:52,  1.75s/it]

Updated vocab size: 265 Updated sequence length: 1563979 New vocab: '.\n' Token pair: (12, 1) Compression ratio: 0.74% 


 40%|███▉      | 155/389 [05:18<07:12,  1.85s/it]

Updated vocab size: 266 Updated sequence length: 1562045 New vocab: 'iv' Token pair: (63, 76) Compression ratio: 0.74% 


 40%|████      | 156/389 [05:19<06:50,  1.76s/it]

Updated vocab size: 267 Updated sequence length: 1560144 New vocab: ' qui' Token pair: (203, 63) Compression ratio: 0.75% 


 40%|████      | 157/389 [05:21<06:48,  1.76s/it]

Updated vocab size: 268 Updated sequence length: 1558265 New vocab: 'eu' Token pair: (59, 75) Compression ratio: 0.75% 


 41%|████      | 158/389 [05:23<06:59,  1.82s/it]

Updated vocab size: 269 Updated sequence length: 1556391 New vocab: 'ion' Token pair: (63, 115) Compression ratio: 0.75% 


 41%|████      | 159/389 [05:25<06:56,  1.81s/it]

Updated vocab size: 270 Updated sequence length: 1554518 New vocab: 'ff' Token pair: (60, 60) Compression ratio: 0.75% 


 41%|████      | 160/389 [05:26<06:53,  1.81s/it]

Updated vocab size: 271 Updated sequence length: 1552699 New vocab: ' h' Token pair: (2, 62) Compression ratio: 0.75% 


 41%|████▏     | 161/389 [05:28<06:34,  1.73s/it]

Updated vocab size: 272 Updated sequence length: 1550882 New vocab: 'ouv' Token pair: (127, 76) Compression ratio: 0.76% 


 42%|████▏     | 162/389 [05:30<06:49,  1.80s/it]

Updated vocab size: 273 Updated sequence length: 1549075 New vocab: 'ien' Token pair: (63, 114) Compression ratio: 0.76% 


 42%|████▏     | 163/389 [05:32<06:48,  1.81s/it]

Updated vocab size: 274 Updated sequence length: 1547270 New vocab: ' \n\n' Token pair: (2, 137) Compression ratio: 0.76% 


 42%|████▏     | 164/389 [05:33<06:28,  1.73s/it]

Updated vocab size: 275 Updated sequence length: 1545474 New vocab: 'ès' Token pair: (93, 73) Compression ratio: 0.76% 


 42%|████▏     | 165/389 [05:35<06:44,  1.81s/it]

Updated vocab size: 276 Updated sequence length: 1543678 New vocab: 'éri' Token pair: (94, 152) Compression ratio: 0.76% 


 43%|████▎     | 166/389 [05:37<06:41,  1.80s/it]

Updated vocab size: 277 Updated sequence length: 1541890 New vocab: 'ue' Token pair: (75, 59) Compression ratio: 0.77% 


 43%|████▎     | 167/389 [05:39<06:22,  1.72s/it]

Updated vocab size: 278 Updated sequence length: 1540113 New vocab: 'ces' Token pair: (57, 112) Compression ratio: 0.77% 


 43%|████▎     | 168/389 [05:41<06:38,  1.80s/it]

Updated vocab size: 279 Updated sequence length: 1538339 New vocab: ' o' Token pair: (2, 69) Compression ratio: 0.77% 


 43%|████▎     | 169/389 [05:42<06:19,  1.73s/it]

Updated vocab size: 280 Updated sequence length: 1536580 New vocab: 'tre' Token pair: (74, 119) Compression ratio: 0.77% 


 44%|████▎     | 170/389 [05:44<06:34,  1.80s/it]

Updated vocab size: 281 Updated sequence length: 1534831 New vocab: 'pe' Token pair: (70, 59) Compression ratio: 0.77% 


 44%|████▍     | 171/389 [05:46<06:16,  1.73s/it]

Updated vocab size: 282 Updated sequence length: 1533105 New vocab: 'né' Token pair: (68, 94) Compression ratio: 0.78% 


 44%|████▍     | 172/389 [05:48<06:32,  1.81s/it]

Updated vocab size: 283 Updated sequence length: 1531388 New vocab: ' Le' Token pair: (153, 59) Compression ratio: 0.78% 


 44%|████▍     | 173/389 [05:50<06:30,  1.81s/it]

Updated vocab size: 284 Updated sequence length: 1529675 New vocab: '201' Token pair: (201, 15) Compression ratio: 0.78% 


 45%|████▍     | 174/389 [05:51<06:11,  1.73s/it]

Updated vocab size: 285 Updated sequence length: 1527981 New vocab: 'vec' Token pair: (202, 57) Compression ratio: 0.78% 


 45%|████▍     | 175/389 [05:53<06:25,  1.80s/it]

Updated vocab size: 286 Updated sequence length: 1526289 New vocab: 'ique' Token pair: (63, 159) Compression ratio: 0.78% 


 45%|████▌     | 176/389 [05:55<06:07,  1.72s/it]

Updated vocab size: 287 Updated sequence length: 1524608 New vocab: 'and' Token pair: (116, 58) Compression ratio: 0.79% 


 46%|████▌     | 177/389 [05:57<06:19,  1.79s/it]

Updated vocab size: 288 Updated sequence length: 1522929 New vocab: "'un" Token pair: (7, 136) Compression ratio: 0.79% 


 46%|████▌     | 178/389 [05:58<06:01,  1.71s/it]

Updated vocab size: 289 Updated sequence length: 1521260 New vocab: 'ille' Token pair: (142, 131) Compression ratio: 0.79% 


 46%|████▌     | 179/389 [06:00<06:14,  1.78s/it]

Updated vocab size: 290 Updated sequence length: 1519612 New vocab: ' il' Token pair: (2, 142) Compression ratio: 0.79% 


 46%|████▋     | 180/389 [06:02<05:56,  1.71s/it]

Updated vocab size: 291 Updated sequence length: 1517969 New vocab: 'ap' Token pair: (55, 70) Compression ratio: 0.79% 


 47%|████▋     | 181/389 [06:04<06:10,  1.78s/it]

Updated vocab size: 292 Updated sequence length: 1516332 New vocab: '||' Token pair: (81, 81) Compression ratio: 0.80% 


 47%|████▋     | 182/389 [06:05<05:55,  1.72s/it]

Updated vocab size: 293 Updated sequence length: 1514707 New vocab: 'aire' Token pair: (55, 209) Compression ratio: 0.80% 


 47%|████▋     | 183/389 [06:07<05:53,  1.72s/it]

Updated vocab size: 294 Updated sequence length: 1513087 New vocab: ' K' Token pair: (2, 37) Compression ratio: 0.80% 


 47%|████▋     | 184/389 [06:09<06:04,  1.78s/it]

Updated vocab size: 295 Updated sequence length: 1511488 New vocab: '200' Token pair: (201, 14) Compression ratio: 0.80% 


 48%|████▊     | 185/389 [06:10<06:01,  1.77s/it]

Updated vocab size: 296 Updated sequence length: 1509895 New vocab: ' V' Token pair: (2, 48) Compression ratio: 0.80% 


 48%|████▊     | 186/389 [06:12<05:58,  1.77s/it]

Updated vocab size: 297 Updated sequence length: 1508310 New vocab: 'urs' Token pair: (138, 73) Compression ratio: 0.81% 


 48%|████▊     | 187/389 [06:14<05:43,  1.70s/it]

Updated vocab size: 298 Updated sequence length: 1506737 New vocab: 'mi' Token pair: (67, 63) Compression ratio: 0.81% 


 48%|████▊     | 188/389 [06:16<05:57,  1.78s/it]

Updated vocab size: 299 Updated sequence length: 1505165 New vocab: 'ge' Token pair: (61, 59) Compression ratio: 0.81% 


 49%|████▊     | 189/389 [06:17<05:54,  1.77s/it]

Updated vocab size: 300 Updated sequence length: 1503593 New vocab: ' que' Token pair: (2, 159) Compression ratio: 0.81% 


 49%|████▉     | 190/389 [06:19<05:36,  1.69s/it]

Updated vocab size: 301 Updated sequence length: 1502025 New vocab: 'ai' Token pair: (55, 63) Compression ratio: 0.81% 


 49%|████▉     | 191/389 [06:21<05:39,  1.71s/it]

Updated vocab size: 302 Updated sequence length: 1500459 New vocab: 'ort' Token pair: (135, 74) Compression ratio: 0.81% 


 49%|████▉     | 192/389 [06:23<05:49,  1.77s/it]

Updated vocab size: 303 Updated sequence length: 1498895 New vocab: 'ction' Token pair: (57, 145) Compression ratio: 0.82% 


 50%|████▉     | 193/389 [06:24<05:46,  1.77s/it]

Updated vocab size: 304 Updated sequence length: 1497340 New vocab: ' ex' Token pair: (240, 78) Compression ratio: 0.82% 


 50%|████▉     | 194/389 [06:26<05:43,  1.76s/it]

Updated vocab size: 305 Updated sequence length: 1495794 New vocab: ' avec' Token pair: (128, 284) Compression ratio: 0.82% 


 50%|█████     | 195/389 [06:28<05:28,  1.69s/it]

Updated vocab size: 306 Updated sequence length: 1494281 New vocab: ' r' Token pair: (2, 72) Compression ratio: 0.82% 


 50%|█████     | 196/389 [06:30<05:40,  1.76s/it]

Updated vocab size: 307 Updated sequence length: 1492768 New vocab: 'ques' Token pair: (130, 112) Compression ratio: 0.82% 


 51%|█████     | 197/389 [06:31<05:37,  1.76s/it]

Updated vocab size: 308 Updated sequence length: 1491273 New vocab: 'és' Token pair: (94, 73) Compression ratio: 0.83% 


 51%|█████     | 198/389 [06:33<05:22,  1.69s/it]

Updated vocab size: 309 Updated sequence length: 1489785 New vocab: 'ite' Token pair: (133, 59) Compression ratio: 0.83% 


 51%|█████     | 199/389 [06:35<05:36,  1.77s/it]

Updated vocab size: 310 Updated sequence length: 1488303 New vocab: ' su' Token pair: (121, 75) Compression ratio: 0.83% 


 51%|█████▏    | 200/389 [06:37<05:34,  1.77s/it]

Updated vocab size: 311 Updated sequence length: 1486833 New vocab: ' plus' Token pair: (241, 156) Compression ratio: 0.83% 


 52%|█████▏    | 201/389 [06:38<05:18,  1.70s/it]

Updated vocab size: 312 Updated sequence length: 1485370 New vocab: 'ux' Token pair: (75, 78) Compression ratio: 0.83% 


 52%|█████▏    | 202/389 [06:40<05:18,  1.71s/it]

Updated vocab size: 313 Updated sequence length: 1483934 New vocab: ' an' Token pair: (2, 116) Compression ratio: 0.84% 


 52%|█████▏    | 203/389 [06:42<05:29,  1.77s/it]

Updated vocab size: 314 Updated sequence length: 1482525 New vocab: 'ine' Token pair: (125, 59) Compression ratio: 0.84% 


 52%|█████▏    | 204/389 [06:44<05:25,  1.76s/it]

Updated vocab size: 315 Updated sequence length: 1481122 New vocab: '),' Token pair: (9, 10) Compression ratio: 0.84% 


 53%|█████▎    | 205/389 [06:45<05:11,  1.69s/it]

Updated vocab size: 316 Updated sequence length: 1479757 New vocab: 'ois' Token pair: (69, 124) Compression ratio: 0.84% 


 53%|█████▎    | 206/389 [06:47<05:24,  1.77s/it]

Updated vocab size: 317 Updated sequence length: 1478393 New vocab: ' son' Token pair: (121, 115) Compression ratio: 0.84% 


 53%|█████▎    | 207/389 [06:49<05:08,  1.69s/it]

Updated vocab size: 318 Updated sequence length: 1477032 New vocab: 'vi' Token pair: (76, 63) Compression ratio: 0.84% 


 53%|█████▎    | 208/389 [06:50<05:19,  1.76s/it]

Updated vocab size: 319 Updated sequence length: 1475675 New vocab: 'mb' Token pair: (67, 56) Compression ratio: 0.85% 


 54%|█████▎    | 209/389 [06:52<05:06,  1.70s/it]

Updated vocab size: 320 Updated sequence length: 1474322 New vocab: "'é" Token pair: (7, 94) Compression ratio: 0.85% 


 54%|█████▍    | 210/389 [06:54<05:17,  1.78s/it]

Updated vocab size: 321 Updated sequence length: 1472972 New vocab: 'ut' Token pair: (75, 74) Compression ratio: 0.85% 


 54%|█████▍    | 211/389 [06:55<05:02,  1.70s/it]

Updated vocab size: 322 Updated sequence length: 1471627 New vocab: 'age' Token pair: (208, 59) Compression ratio: 0.85% 


 54%|█████▍    | 212/389 [06:57<05:13,  1.77s/it]

Updated vocab size: 323 Updated sequence length: 1470284 New vocab: ' Ch' Token pair: (162, 62) Compression ratio: 0.85% 


 55%|█████▍    | 213/389 [06:59<04:56,  1.68s/it]

Updated vocab size: 324 Updated sequence length: 1468942 New vocab: 'ub' Token pair: (75, 56) Compression ratio: 0.85% 


 55%|█████▌    | 214/389 [07:01<05:06,  1.75s/it]

Updated vocab size: 325 Updated sequence length: 1467611 New vocab: 'ad' Token pair: (55, 58) Compression ratio: 0.86% 


 55%|█████▌    | 215/389 [07:03<05:03,  1.74s/it]

Updated vocab size: 326 Updated sequence length: 1466289 New vocab: 'ph' Token pair: (70, 62) Compression ratio: 0.86% 


 56%|█████▌    | 216/389 [07:04<04:49,  1.67s/it]

Updated vocab size: 327 Updated sequence length: 1464983 New vocab: 'st' Token pair: (73, 74) Compression ratio: 0.86% 


 56%|█████▌    | 217/389 [07:06<05:01,  1.75s/it]

Updated vocab size: 328 Updated sequence length: 1463706 New vocab: 'ill' Token pair: (142, 66) Compression ratio: 0.86% 


 56%|█████▌    | 218/389 [07:08<04:58,  1.75s/it]

Updated vocab size: 329 Updated sequence length: 1462432 New vocab: 'ay' Token pair: (55, 79) Compression ratio: 0.86% 


 56%|█████▋    | 219/389 [07:09<04:43,  1.67s/it]

Updated vocab size: 330 Updated sequence length: 1461162 New vocab: 'nes' Token pair: (68, 112) Compression ratio: 0.86% 


 57%|█████▋    | 220/389 [07:11<04:56,  1.76s/it]

Updated vocab size: 331 Updated sequence length: 1459894 New vocab: '18' Token pair: (15, 22) Compression ratio: 0.87% 


 57%|█████▋    | 221/389 [07:13<04:41,  1.68s/it]

Updated vocab size: 332 Updated sequence length: 1458631 New vocab: ' ||' Token pair: (2, 291) Compression ratio: 0.87% 


 57%|█████▋    | 222/389 [07:15<04:54,  1.76s/it]

Updated vocab size: 333 Updated sequence length: 1457373 New vocab: 'du' Token pair: (58, 75) Compression ratio: 0.87% 


 57%|█████▋    | 223/389 [07:16<04:39,  1.68s/it]

Updated vocab size: 334 Updated sequence length: 1456119 New vocab: 'che' Token pair: (172, 59) Compression ratio: 0.87% 


 58%|█████▊    | 224/389 [07:18<04:50,  1.76s/it]

Updated vocab size: 335 Updated sequence length: 1454869 New vocab: ' Il' Token pair: (232, 66) Compression ratio: 0.87% 


 58%|█████▊    | 225/389 [07:20<04:35,  1.68s/it]

Updated vocab size: 336 Updated sequence length: 1453620 New vocab: ' La' Token pair: (153, 55) Compression ratio: 0.87% 


 58%|█████▊    | 226/389 [07:21<04:46,  1.76s/it]

Updated vocab size: 337 Updated sequence length: 1452384 New vocab: ' ou' Token pair: (2, 127) Compression ratio: 0.87% 


 58%|█████▊    | 227/389 [07:23<04:34,  1.69s/it]

Updated vocab size: 338 Updated sequence length: 1451150 New vocab: ' «' Token pair: (2, 83) Compression ratio: 0.88% 


 59%|█████▊    | 228/389 [07:25<04:44,  1.77s/it]

Updated vocab size: 339 Updated sequence length: 1449936 New vocab: '199' Token pair: (178, 23) Compression ratio: 0.88% 


 59%|█████▉    | 229/389 [07:26<04:29,  1.68s/it]

Updated vocab size: 340 Updated sequence length: 1448726 New vocab: 'aux' Token pair: (179, 78) Compression ratio: 0.88% 


 59%|█████▉    | 230/389 [07:28<04:28,  1.69s/it]

Updated vocab size: 341 Updated sequence length: 1447517 New vocab: 'pp' Token pair: (70, 70) Compression ratio: 0.88% 


 59%|█████▉    | 231/389 [07:30<04:36,  1.75s/it]

Updated vocab size: 342 Updated sequence length: 1446318 New vocab: 'oc' Token pair: (69, 57) Compression ratio: 0.88% 


 60%|█████▉    | 232/389 [07:32<04:33,  1.74s/it]

Updated vocab size: 343 Updated sequence length: 1445129 New vocab: ' comm' Token pair: (210, 67) Compression ratio: 0.88% 


 60%|█████▉    | 233/389 [07:33<04:30,  1.73s/it]

Updated vocab size: 344 Updated sequence length: 1443941 New vocab: 'ard' Token pair: (126, 58) Compression ratio: 0.89% 


 60%|██████    | 234/389 [07:35<04:18,  1.67s/it]

Updated vocab size: 345 Updated sequence length: 1442755 New vocab: ' parti' Token pair: (175, 120) Compression ratio: 0.89% 


 60%|██████    | 235/389 [07:37<04:26,  1.73s/it]

Updated vocab size: 346 Updated sequence length: 1441582 New vocab: 'ous' Token pair: (127, 73) Compression ratio: 0.89% 


 61%|██████    | 236/389 [07:39<04:24,  1.73s/it]

Updated vocab size: 347 Updated sequence length: 1440411 New vocab: ' ,' Token pair: (2, 10) Compression ratio: 0.89% 


 61%|██████    | 237/389 [07:40<04:11,  1.65s/it]

Updated vocab size: 348 Updated sequence length: 1439249 New vocab: 'ib' Token pair: (63, 56) Compression ratio: 0.89% 


 61%|██████    | 238/389 [07:42<04:20,  1.73s/it]

Updated vocab size: 349 Updated sequence length: 1438098 New vocab: 'ance' Token pair: (116, 174) Compression ratio: 0.89% 


 61%|██████▏   | 239/389 [07:44<04:18,  1.73s/it]

Updated vocab size: 350 Updated sequence length: 1436950 New vocab: ' W' Token pair: (2, 49) Compression ratio: 0.90% 


 62%|██████▏   | 240/389 [07:45<04:06,  1.65s/it]

Updated vocab size: 351 Updated sequence length: 1435808 New vocab: ' pré' Token pair: (118, 157) Compression ratio: 0.90% 


 62%|██████▏   | 241/389 [07:47<04:16,  1.73s/it]

Updated vocab size: 352 Updated sequence length: 1434669 New vocab: 'rès' Token pair: (72, 274) Compression ratio: 0.90% 


 62%|██████▏   | 242/389 [07:49<04:03,  1.66s/it]

Updated vocab size: 353 Updated sequence length: 1433531 New vocab: ' sont' Token pair: (121, 192) Compression ratio: 0.90% 


 62%|██████▏   | 243/389 [07:51<04:13,  1.74s/it]

Updated vocab size: 354 Updated sequence length: 1432396 New vocab: ' ar' Token pair: (2, 126) Compression ratio: 0.90% 


 63%|██████▎   | 244/389 [07:52<04:00,  1.66s/it]

Updated vocab size: 355 Updated sequence length: 1431271 New vocab: 'tique' Token pair: (120, 159) Compression ratio: 0.90% 


 63%|██████▎   | 245/389 [07:54<04:11,  1.74s/it]

Updated vocab size: 356 Updated sequence length: 1430152 New vocab: ' »' Token pair: (2, 85) Compression ratio: 0.90% 


 63%|██████▎   | 246/389 [07:55<03:59,  1.68s/it]

Updated vocab size: 357 Updated sequence length: 1429034 New vocab: "'A" Token pair: (7, 27) Compression ratio: 0.91% 


 63%|██████▎   | 247/389 [07:57<04:07,  1.74s/it]

Updated vocab size: 358 Updated sequence length: 1427930 New vocab: 'ond' Token pair: (115, 58) Compression ratio: 0.91% 


 64%|██████▍   | 248/389 [07:59<03:54,  1.66s/it]

Updated vocab size: 359 Updated sequence length: 1426832 New vocab: 'um' Token pair: (75, 67) Compression ratio: 0.91% 


 64%|██████▍   | 249/389 [08:01<04:02,  1.73s/it]

Updated vocab size: 360 Updated sequence length: 1425742 New vocab: 'oire' Token pair: (69, 209) Compression ratio: 0.91% 


 64%|██████▍   | 250/389 [08:02<04:00,  1.73s/it]

Updated vocab size: 361 Updated sequence length: 1424654 New vocab: ' di' Token pair: (111, 63) Compression ratio: 0.91% 


 65%|██████▍   | 251/389 [08:04<03:58,  1.73s/it]

Updated vocab size: 362 Updated sequence length: 1423567 New vocab: 'ons' Token pair: (115, 73) Compression ratio: 0.91% 


 65%|██████▍   | 252/389 [08:06<03:46,  1.65s/it]

Updated vocab size: 363 Updated sequence length: 1422486 New vocab: 'entre' Token pair: (132, 119) Compression ratio: 0.91% 


 65%|██████▌   | 253/389 [08:08<03:54,  1.73s/it]

Updated vocab size: 364 Updated sequence length: 1421408 New vocab: 'ors' Token pair: (135, 73) Compression ratio: 0.92% 


 65%|██████▌   | 254/389 [08:09<03:52,  1.72s/it]

Updated vocab size: 365 Updated sequence length: 1420343 New vocab: 'teur' Token pair: (140, 138) Compression ratio: 0.92% 


 66%|██████▌   | 255/389 [08:11<03:41,  1.66s/it]

Updated vocab size: 366 Updated sequence length: 1419282 New vocab: ' comp' Token pair: (210, 70) Compression ratio: 0.92% 


 66%|██████▌   | 256/389 [08:13<03:49,  1.73s/it]

Updated vocab size: 367 Updated sequence length: 1418229 New vocab: 'rou' Token pair: (72, 127) Compression ratio: 0.92% 


 66%|██████▌   | 257/389 [08:14<03:38,  1.65s/it]

Updated vocab size: 368 Updated sequence length: 1417192 New vocab: 'ert' Token pair: (122, 74) Compression ratio: 0.92% 


 66%|██████▋   | 258/389 [08:16<03:46,  1.73s/it]

Updated vocab size: 369 Updated sequence length: 1416157 New vocab: 'uis' Token pair: (75, 124) Compression ratio: 0.92% 


 67%|██████▋   | 259/389 [08:18<03:44,  1.72s/it]

Updated vocab size: 370 Updated sequence length: 1415123 New vocab: 'ob' Token pair: (69, 56) Compression ratio: 0.92% 


 67%|██████▋   | 260/389 [08:19<03:33,  1.65s/it]

Updated vocab size: 371 Updated sequence length: 1414095 New vocab: 'ord' Token pair: (135, 58) Compression ratio: 0.93% 


 67%|██████▋   | 261/389 [08:21<03:41,  1.73s/it]

Updated vocab size: 372 Updated sequence length: 1413074 New vocab: 'end' Token pair: (114, 58) Compression ratio: 0.93% 


 67%|██████▋   | 262/389 [08:23<03:30,  1.65s/it]

Updated vocab size: 373 Updated sequence length: 1412056 New vocab: ' O' Token pair: (2, 41) Compression ratio: 0.93% 


 68%|██████▊   | 263/389 [08:25<03:37,  1.73s/it]

Updated vocab size: 374 Updated sequence length: 1411049 New vocab: 'ui' Token pair: (75, 63) Compression ratio: 0.93% 


 68%|██████▊   | 264/389 [08:26<03:36,  1.73s/it]

Updated vocab size: 375 Updated sequence length: 1410057 New vocab: 'im' Token pair: (63, 67) Compression ratio: 0.93% 


 68%|██████▊   | 265/389 [08:28<03:25,  1.66s/it]

Updated vocab size: 376 Updated sequence length: 1409068 New vocab: ' sa' Token pair: (121, 55) Compression ratio: 0.93% 


 68%|██████▊   | 266/389 [08:30<03:33,  1.73s/it]

Updated vocab size: 377 Updated sequence length: 1408086 New vocab: 'lis' Token pair: (66, 124) Compression ratio: 0.93% 


 69%|██████▊   | 267/389 [08:31<03:21,  1.65s/it]

Updated vocab size: 378 Updated sequence length: 1407105 New vocab: ' tra' Token pair: (154, 150) Compression ratio: 0.94% 


 69%|██████▉   | 268/389 [08:33<03:28,  1.72s/it]

Updated vocab size: 379 Updated sequence length: 1406124 New vocab: 'onn' Token pair: (115, 68) Compression ratio: 0.94% 


 69%|██████▉   | 269/389 [08:34<03:17,  1.65s/it]

Updated vocab size: 380 Updated sequence length: 1405146 New vocab: ' nom' Token pair: (171, 146) Compression ratio: 0.94% 


 69%|██████▉   | 270/389 [08:36<03:24,  1.72s/it]

Updated vocab size: 381 Updated sequence length: 1404175 New vocab: 'fé' Token pair: (60, 94) Compression ratio: 0.94% 


 70%|██████▉   | 271/389 [08:38<03:13,  1.64s/it]

Updated vocab size: 382 Updated sequence length: 1403204 New vocab: 'ris' Token pair: (72, 124) Compression ratio: 0.94% 


 70%|██████▉   | 272/389 [08:40<03:20,  1.71s/it]

Updated vocab size: 383 Updated sequence length: 1402234 New vocab: 'ct' Token pair: (57, 74) Compression ratio: 0.94% 


 70%|███████   | 273/389 [08:41<03:17,  1.70s/it]

Updated vocab size: 384 Updated sequence length: 1401265 New vocab: ' ra' Token pair: (2, 150) Compression ratio: 0.94% 


 70%|███████   | 274/389 [08:43<03:07,  1.63s/it]

Updated vocab size: 385 Updated sequence length: 1400300 New vocab: 'ette' Token pair: (129, 140) Compression ratio: 0.94% 


 71%|███████   | 275/389 [08:45<03:15,  1.71s/it]

Updated vocab size: 386 Updated sequence length: 1399335 New vocab: ')\n' Token pair: (9, 1) Compression ratio: 0.95% 


 71%|███████   | 276/389 [08:46<03:04,  1.64s/it]

Updated vocab size: 387 Updated sequence length: 1398372 New vocab: 'he' Token pair: (62, 59) Compression ratio: 0.95% 


 71%|███████   | 277/389 [08:48<03:11,  1.71s/it]

Updated vocab size: 388 Updated sequence length: 1397410 New vocab: ' aux' Token pair: (187, 78) Compression ratio: 0.95% 


 71%|███████▏  | 278/389 [08:50<03:01,  1.64s/it]

Updated vocab size: 389 Updated sequence length: 1396458 New vocab: 'lle' Token pair: (66, 131) Compression ratio: 0.95% 


 72%|███████▏  | 279/389 [08:51<03:08,  1.71s/it]

Updated vocab size: 390 Updated sequence length: 1395511 New vocab: 'ing' Token pair: (125, 61) Compression ratio: 0.95% 


 72%|███████▏  | 280/389 [08:53<02:57,  1.63s/it]

Updated vocab size: 391 Updated sequence length: 1394573 New vocab: ' pas' Token pair: (118, 189) Compression ratio: 0.95% 


 72%|███████▏  | 281/389 [08:55<03:04,  1.71s/it]

Updated vocab size: 392 Updated sequence length: 1393635 New vocab: 'mes' Token pair: (67, 112) Compression ratio: 0.95% 


 72%|███████▏  | 282/389 [08:56<03:02,  1.70s/it]

Updated vocab size: 393 Updated sequence length: 1392699 New vocab: 'la' Token pair: (66, 55) Compression ratio: 0.96% 


 73%|███████▎  | 283/389 [08:58<02:54,  1.65s/it]

Updated vocab size: 394 Updated sequence length: 1391769 New vocab: 'Le' Token pair: (38, 59) Compression ratio: 0.96% 


 73%|███████▎  | 284/389 [09:00<03:00,  1.72s/it]

Updated vocab size: 395 Updated sequence length: 1390842 New vocab: 'ranç' Token pair: (220, 92) Compression ratio: 0.96% 


 73%|███████▎  | 285/389 [09:01<02:50,  1.64s/it]

Updated vocab size: 396 Updated sequence length: 1389916 New vocab: 'oir' Token pair: (69, 235) Compression ratio: 0.96% 


 74%|███████▎  | 286/389 [09:03<02:56,  1.71s/it]

Updated vocab size: 397 Updated sequence length: 1389008 New vocab: 'remi' Token pair: (119, 297) Compression ratio: 0.96% 


 74%|███████▍  | 287/389 [09:05<02:47,  1.64s/it]

Updated vocab size: 398 Updated sequence length: 1388107 New vocab: 'iste' Token pair: (124, 140) Compression ratio: 0.96% 


 74%|███████▍  | 288/389 [09:07<02:52,  1.71s/it]

Updated vocab size: 399 Updated sequence length: 1387216 New vocab: ' for' Token pair: (148, 135) Compression ratio: 0.96% 


 74%|███████▍  | 289/389 [09:08<02:50,  1.70s/it]

Updated vocab size: 400 Updated sequence length: 1386338 New vocab: 'iè' Token pair: (63, 93) Compression ratio: 0.96% 


 75%|███████▍  | 290/389 [09:10<02:41,  1.63s/it]

Updated vocab size: 401 Updated sequence length: 1385467 New vocab: ' ét' Token pair: (185, 74) Compression ratio: 0.97% 


 75%|███████▍  | 291/389 [09:12<02:47,  1.71s/it]

Updated vocab size: 402 Updated sequence length: 1384599 New vocab: 'oy' Token pair: (69, 79) Compression ratio: 0.97% 


 75%|███████▌  | 292/389 [09:13<02:38,  1.63s/it]

Updated vocab size: 403 Updated sequence length: 1383731 New vocab: 'av' Token pair: (55, 76) Compression ratio: 0.97% 


 75%|███████▌  | 293/389 [09:15<02:44,  1.71s/it]

Updated vocab size: 404 Updated sequence length: 1382869 New vocab: 'lé' Token pair: (66, 94) Compression ratio: 0.97% 


 76%|███████▌  | 294/389 [09:16<02:35,  1.63s/it]

Updated vocab size: 405 Updated sequence length: 1382010 New vocab: 'Un' Token pair: (47, 68) Compression ratio: 0.97% 


 76%|███████▌  | 295/389 [09:18<02:40,  1.71s/it]

Updated vocab size: 406 Updated sequence length: 1381153 New vocab: 'ins' Token pair: (125, 73) Compression ratio: 0.97% 


 76%|███████▌  | 296/389 [09:20<02:38,  1.70s/it]

Updated vocab size: 407 Updated sequence length: 1380301 New vocab: 'féren' Token pair: (380, 254) Compression ratio: 0.97% 


 76%|███████▋  | 297/389 [09:21<02:29,  1.62s/it]

Updated vocab size: 408 Updated sequence length: 1379452 New vocab: 'oci' Token pair: (69, 198) Compression ratio: 0.97% 


 77%|███████▋  | 298/389 [09:23<02:34,  1.70s/it]

Updated vocab size: 409 Updated sequence length: 1378603 New vocab: 'id' Token pair: (63, 58) Compression ratio: 0.98% 


 77%|███████▋  | 299/389 [09:25<02:26,  1.63s/it]

Updated vocab size: 410 Updated sequence length: 1377760 New vocab: '198' Token pair: (178, 22) Compression ratio: 0.98% 


 77%|███████▋  | 300/389 [09:27<02:31,  1.70s/it]

Updated vocab size: 411 Updated sequence length: 1376920 New vocab: 'ure' Token pair: (75, 119) Compression ratio: 0.98% 


 77%|███████▋  | 301/389 [09:28<02:28,  1.69s/it]

Updated vocab size: 412 Updated sequence length: 1376080 New vocab: 'éc' Token pair: (94, 57) Compression ratio: 0.98% 


 78%|███████▊  | 302/389 [09:30<02:22,  1.64s/it]

Updated vocab size: 413 Updated sequence length: 1375240 New vocab: 'ier' Token pair: (63, 122) Compression ratio: 0.98% 


 78%|███████▊  | 303/389 [09:32<02:27,  1.71s/it]

Updated vocab size: 414 Updated sequence length: 1374402 New vocab: ' été' Token pair: (185, 216) Compression ratio: 0.98% 


 78%|███████▊  | 304/389 [09:33<02:18,  1.63s/it]

Updated vocab size: 415 Updated sequence length: 1373567 New vocab: ' ;' Token pair: (2, 25) Compression ratio: 0.98% 


 78%|███████▊  | 305/389 [09:35<02:22,  1.69s/it]

Updated vocab size: 416 Updated sequence length: 1373100 New vocab: '  ' Token pair: (2, 2) Compression ratio: 0.98% 


 79%|███████▊  | 306/389 [09:37<02:20,  1.69s/it]

Updated vocab size: 417 Updated sequence length: 1372272 New vocab: 'raph' Token pair: (150, 325) Compression ratio: 0.98% 


 79%|███████▉  | 307/389 [09:38<02:17,  1.68s/it]

Updated vocab size: 418 Updated sequence length: 1371446 New vocab: ' Les' Token pair: (153, 112) Compression ratio: 0.99% 


 79%|███████▉  | 308/389 [09:40<02:09,  1.60s/it]

Updated vocab size: 419 Updated sequence length: 1370630 New vocab: 'art' Token pair: (126, 74) Compression ratio: 0.99% 


 79%|███████▉  | 309/389 [09:42<02:13,  1.67s/it]

Updated vocab size: 420 Updated sequence length: 1369816 New vocab: 'rançais' Token pair: (394, 204) Compression ratio: 0.99% 


 80%|███████▉  | 310/389 [09:43<02:12,  1.67s/it]

Updated vocab size: 421 Updated sequence length: 1369021 New vocab: ' -' Token pair: (2, 11) Compression ratio: 0.99% 


 80%|███████▉  | 311/389 [09:45<02:05,  1.61s/it]

Updated vocab size: 422 Updated sequence length: 1368231 New vocab: ' av' Token pair: (128, 76) Compression ratio: 0.99% 


 80%|████████  | 312/389 [09:47<02:09,  1.68s/it]

Updated vocab size: 423 Updated sequence length: 1367443 New vocab: ' premi' Token pair: (118, 396) Compression ratio: 0.99% 


 80%|████████  | 313/389 [09:48<02:07,  1.68s/it]

Updated vocab size: 424 Updated sequence length: 1366656 New vocab: 'tin' Token pair: (120, 68) Compression ratio: 0.99% 


 81%|████████  | 314/389 [09:50<02:00,  1.61s/it]

Updated vocab size: 425 Updated sequence length: 1365870 New vocab: 'eur' Token pair: (59, 138) Compression ratio: 0.99% 


 81%|████████  | 315/389 [09:52<02:04,  1.69s/it]

Updated vocab size: 426 Updated sequence length: 1365087 New vocab: 'ations' Token pair: (193, 73) Compression ratio: 0.99% 


 81%|████████  | 316/389 [09:53<01:57,  1.61s/it]

Updated vocab size: 427 Updated sequence length: 1364316 New vocab: 'ru' Token pair: (72, 75) Compression ratio: 1.00% 


 81%|████████▏ | 317/389 [09:55<01:56,  1.62s/it]

Updated vocab size: 428 Updated sequence length: 1363546 New vocab: ' ce' Token pair: (123, 59) Compression ratio: 1.00% 


 82%|████████▏ | 318/389 [09:56<01:59,  1.68s/it]

Updated vocab size: 429 Updated sequence length: 1362777 New vocab: 'tif' Token pair: (120, 60) Compression ratio: 1.00% 


 82%|████████▏ | 319/389 [09:58<01:57,  1.68s/it]

Updated vocab size: 430 Updated sequence length: 1362010 New vocab: 'ec' Token pair: (59, 57) Compression ratio: 1.00% 


 82%|████████▏ | 320/389 [10:00<01:55,  1.67s/it]

Updated vocab size: 431 Updated sequence length: 1361246 New vocab: ' ac' Token pair: (128, 57) Compression ratio: 1.00% 


 83%|████████▎ | 321/389 [10:01<01:49,  1.61s/it]

Updated vocab size: 432 Updated sequence length: 1360483 New vocab: ' En' Token pair: (238, 68) Compression ratio: 1.00% 


 83%|████████▎ | 322/389 [10:03<01:53,  1.69s/it]

Updated vocab size: 433 Updated sequence length: 1359725 New vocab: ' cont' Token pair: (123, 192) Compression ratio: 1.00% 


 83%|████████▎ | 323/389 [10:05<01:51,  1.69s/it]

Updated vocab size: 434 Updated sequence length: 1358968 New vocab: 'ograph' Token pair: (256, 416) Compression ratio: 1.00% 


 83%|████████▎ | 324/389 [10:06<01:44,  1.61s/it]

Updated vocab size: 435 Updated sequence length: 1358211 New vocab: 'férences' Token pair: (406, 277) Compression ratio: 1.00% 


 84%|████████▎ | 325/389 [10:08<01:47,  1.68s/it]

Updated vocab size: 436 Updated sequence length: 1357459 New vocab: 'oi' Token pair: (69, 63) Compression ratio: 1.01% 


 84%|████████▍ | 326/389 [10:10<01:41,  1.61s/it]

Updated vocab size: 437 Updated sequence length: 1356712 New vocab: ' Par' Token pair: (183, 126) Compression ratio: 1.01% 


 84%|████████▍ | 327/389 [10:11<01:43,  1.67s/it]

Updated vocab size: 438 Updated sequence length: 1355975 New vocab: 'od' Token pair: (69, 58) Compression ratio: 1.01% 


 84%|████████▍ | 328/389 [10:13<01:41,  1.67s/it]

Updated vocab size: 439 Updated sequence length: 1355238 New vocab: 'ées' Token pair: (94, 112) Compression ratio: 1.01% 


 85%|████████▍ | 329/389 [10:15<01:40,  1.67s/it]

Updated vocab size: 440 Updated sequence length: 1354503 New vocab: ' comme' Token pair: (210, 188) Compression ratio: 1.01% 


 85%|████████▍ | 330/389 [10:16<01:34,  1.60s/it]

Updated vocab size: 441 Updated sequence length: 1353768 New vocab: '197' Token pair: (178, 21) Compression ratio: 1.01% 


 85%|████████▌ | 331/389 [10:18<01:36,  1.67s/it]

Updated vocab size: 442 Updated sequence length: 1353039 New vocab: 'ens' Token pair: (114, 73) Compression ratio: 1.01% 


 85%|████████▌ | 332/389 [10:20<01:34,  1.66s/it]

Updated vocab size: 443 Updated sequence length: 1352311 New vocab: "'une" Token pair: (287, 59) Compression ratio: 1.01% 


 86%|████████▌ | 333/389 [10:21<01:29,  1.60s/it]

Updated vocab size: 444 Updated sequence length: 1351586 New vocab: 'amp' Token pair: (215, 70) Compression ratio: 1.01% 


 86%|████████▌ | 334/389 [10:23<01:32,  1.68s/it]

Updated vocab size: 445 Updated sequence length: 1350861 New vocab: ' al' Token pair: (128, 66) Compression ratio: 1.02% 


 86%|████████▌ | 335/389 [10:25<01:30,  1.68s/it]

Updated vocab size: 446 Updated sequence length: 1350138 New vocab: ' deux' Token pair: (117, 311) Compression ratio: 1.02% 


 86%|████████▋ | 336/389 [10:26<01:24,  1.60s/it]

Updated vocab size: 447 Updated sequence length: 1349416 New vocab: ' entre' Token pair: (2, 362) Compression ratio: 1.02% 


 87%|████████▋ | 337/389 [10:28<01:27,  1.67s/it]

Updated vocab size: 448 Updated sequence length: 1348697 New vocab: 'ussi' Token pair: (156, 194) Compression ratio: 1.02% 


 87%|████████▋ | 338/389 [10:29<01:21,  1.60s/it]

Updated vocab size: 449 Updated sequence length: 1347978 New vocab: 'olog' Token pair: (180, 256) Compression ratio: 1.02% 


 87%|████████▋ | 339/389 [10:31<01:24,  1.68s/it]

Updated vocab size: 450 Updated sequence length: 1347260 New vocab: ' français' Token pair: (148, 419) Compression ratio: 1.02% 


 87%|████████▋ | 340/389 [10:33<01:23,  1.70s/it]

Updated vocab size: 451 Updated sequence length: 1346543 New vocab: ' commun' Token pair: (342, 136) Compression ratio: 1.02% 


 88%|████████▊ | 341/389 [10:34<01:17,  1.62s/it]

Updated vocab size: 452 Updated sequence length: 1345829 New vocab: 'cip' Token pair: (198, 70) Compression ratio: 1.02% 


 88%|████████▊ | 342/389 [10:36<01:19,  1.69s/it]

Updated vocab size: 453 Updated sequence length: 1345116 New vocab: 'rit' Token pair: (72, 133) Compression ratio: 1.02% 


 88%|████████▊ | 343/389 [10:38<01:17,  1.69s/it]

Updated vocab size: 454 Updated sequence length: 1344407 New vocab: ' am' Token pair: (128, 67) Compression ratio: 1.03% 


 88%|████████▊ | 344/389 [10:39<01:12,  1.61s/it]

Updated vocab size: 455 Updated sequence length: 1343699 New vocab: ' É' Token pair: (2, 87) Compression ratio: 1.03% 


 89%|████████▊ | 345/389 [10:41<01:14,  1.69s/it]

Updated vocab size: 456 Updated sequence length: 1342999 New vocab: ' app' Token pair: (128, 340) Compression ratio: 1.03% 


 89%|████████▉ | 346/389 [10:43<01:09,  1.61s/it]

Updated vocab size: 457 Updated sequence length: 1342301 New vocab: 'La' Token pair: (38, 55) Compression ratio: 1.03% 


 89%|████████▉ | 347/389 [10:44<01:10,  1.69s/it]

Updated vocab size: 458 Updated sequence length: 1341604 New vocab: 'alis' Token pair: (143, 124) Compression ratio: 1.03% 


 89%|████████▉ | 348/389 [10:46<01:05,  1.61s/it]

Updated vocab size: 459 Updated sequence length: 1340908 New vocab: 'all' Token pair: (143, 66) Compression ratio: 1.03% 


 90%|████████▉ | 349/389 [10:48<01:07,  1.68s/it]

Updated vocab size: 460 Updated sequence length: 1340213 New vocab: 'sent' Token pair: (73, 132) Compression ratio: 1.03% 


 90%|████████▉ | 350/389 [10:49<01:05,  1.67s/it]

Updated vocab size: 461 Updated sequence length: 1339519 New vocab: 'tions' Token pair: (145, 73) Compression ratio: 1.03% 


 90%|█████████ | 351/389 [10:51<01:00,  1.60s/it]

Updated vocab size: 462 Updated sequence length: 1338833 New vocab: ' cl' Token pair: (123, 66) Compression ratio: 1.03% 


 90%|█████████ | 352/389 [10:53<01:02,  1.68s/it]

Updated vocab size: 463 Updated sequence length: 1338150 New vocab: ' te' Token pair: (2, 140) Compression ratio: 1.03% 


 91%|█████████ | 353/389 [10:54<00:57,  1.61s/it]

Updated vocab size: 464 Updated sequence length: 1337467 New vocab: 'iens' Token pair: (272, 73) Compression ratio: 1.04% 


 91%|█████████ | 354/389 [10:56<00:56,  1.61s/it]

Updated vocab size: 465 Updated sequence length: 1336785 New vocab: 'ubli' Token pair: (323, 211) Compression ratio: 1.04% 


 91%|█████████▏| 355/389 [10:58<00:57,  1.68s/it]

Updated vocab size: 466 Updated sequence length: 1336105 New vocab: 'tat' Token pair: (74, 181) Compression ratio: 1.04% 


 92%|█████████▏| 356/389 [10:59<00:55,  1.67s/it]

Updated vocab size: 467 Updated sequence length: 1335428 New vocab: 'ence' Token pair: (114, 174) Compression ratio: 1.04% 


 92%|█████████▏| 357/389 [11:01<00:53,  1.67s/it]

Updated vocab size: 468 Updated sequence length: 1334755 New vocab: 'En' Token pair: (31, 68) Compression ratio: 1.04% 


 92%|█████████▏| 358/389 [11:02<00:49,  1.59s/it]

Updated vocab size: 469 Updated sequence length: 1334084 New vocab: ' déc' Token pair: (214, 57) Compression ratio: 1.04% 


 92%|█████████▏| 359/389 [11:04<00:50,  1.69s/it]

Updated vocab size: 470 Updated sequence length: 1333419 New vocab: 'aine' Token pair: (222, 59) Compression ratio: 1.04% 


 93%|█████████▎| 360/389 [11:06<00:48,  1.68s/it]

Updated vocab size: 471 Updated sequence length: 1332756 New vocab: 'ants' Token pair: (169, 73) Compression ratio: 1.04% 


 93%|█████████▎| 361/389 [11:07<00:44,  1.60s/it]

Updated vocab size: 472 Updated sequence length: 1332095 New vocab: 'ém' Token pair: (94, 67) Compression ratio: 1.04% 


 93%|█████████▎| 362/389 [11:09<00:45,  1.67s/it]

Updated vocab size: 473 Updated sequence length: 1331435 New vocab: ' mar' Token pair: (141, 126) Compression ratio: 1.05% 


 93%|█████████▎| 363/389 [11:11<00:43,  1.68s/it]

Updated vocab size: 474 Updated sequence length: 1330777 New vocab: 'ign' Token pair: (229, 68) Compression ratio: 1.05% 


 94%|█████████▎| 364/389 [11:12<00:40,  1.61s/it]

Updated vocab size: 475 Updated sequence length: 1330121 New vocab: 'ang' Token pair: (116, 61) Compression ratio: 1.05% 


 94%|█████████▍| 365/389 [11:14<00:38,  1.61s/it]

Updated vocab size: 476 Updated sequence length: 1329465 New vocab: 'tres' Token pair: (74, 206) Compression ratio: 1.05% 


 94%|█████████▍| 366/389 [11:16<00:38,  1.68s/it]

Updated vocab size: 477 Updated sequence length: 1328809 New vocab: 'alement' Token pair: (244, 168) Compression ratio: 1.05% 


 94%|█████████▍| 367/389 [11:17<00:36,  1.67s/it]

Updated vocab size: 478 Updated sequence length: 1328159 New vocab: 'rin' Token pair: (72, 125) Compression ratio: 1.05% 


 95%|█████████▍| 368/389 [11:19<00:35,  1.67s/it]

Updated vocab size: 479 Updated sequence length: 1327511 New vocab: ' Mar' Token pair: (176, 126) Compression ratio: 1.05% 


 95%|█████████▍| 369/389 [11:20<00:31,  1.60s/it]

Updated vocab size: 480 Updated sequence length: 1326863 New vocab: ' ses' Token pair: (121, 112) Compression ratio: 1.05% 


 95%|█████████▌| 370/389 [11:22<00:31,  1.67s/it]

Updated vocab size: 481 Updated sequence length: 1326215 New vocab: ' ti' Token pair: (2, 120) Compression ratio: 1.05% 


 95%|█████████▌| 371/389 [11:24<00:30,  1.68s/it]

Updated vocab size: 482 Updated sequence length: 1325570 New vocab: 'out' Token pair: (127, 74) Compression ratio: 1.05% 


 96%|█████████▌| 372/389 [11:25<00:27,  1.60s/it]

Updated vocab size: 483 Updated sequence length: 1324927 New vocab: "'in" Token pair: (7, 125) Compression ratio: 1.06% 


 96%|█████████▌| 373/389 [11:27<00:26,  1.68s/it]

Updated vocab size: 484 Updated sequence length: 1324286 New vocab: 'sion' Token pair: (194, 115) Compression ratio: 1.06% 


 96%|█████████▌| 374/389 [11:29<00:25,  1.69s/it]

Updated vocab size: 485 Updated sequence length: 1323646 New vocab: "'ar" Token pair: (7, 126) Compression ratio: 1.06% 


 96%|█████████▋| 375/389 [11:30<00:22,  1.61s/it]

Updated vocab size: 486 Updated sequence length: 1323008 New vocab: 'éd' Token pair: (94, 58) Compression ratio: 1.06% 


 97%|█████████▋| 376/389 [11:32<00:21,  1.69s/it]

Updated vocab size: 487 Updated sequence length: 1322371 New vocab: 'éric' Token pair: (275, 57) Compression ratio: 1.06% 


 97%|█████████▋| 377/389 [11:34<00:19,  1.62s/it]

Updated vocab size: 488 Updated sequence length: 1321739 New vocab: ' mais' Token pair: (141, 204) Compression ratio: 1.06% 


 97%|█████████▋| 378/389 [11:36<00:18,  1.71s/it]

Updated vocab size: 489 Updated sequence length: 1321108 New vocab: 'ture' Token pair: (257, 119) Compression ratio: 1.06% 


 97%|█████████▋| 379/389 [11:37<00:16,  1.62s/it]

Updated vocab size: 490 Updated sequence length: 1320479 New vocab: 'éné' Token pair: (94, 281) Compression ratio: 1.06% 


 98%|█████████▊| 380/389 [11:39<00:15,  1.69s/it]

Updated vocab size: 491 Updated sequence length: 1319851 New vocab: 'imp' Token pair: (63, 248) Compression ratio: 1.06% 


 98%|█████████▊| 381/389 [11:40<00:12,  1.60s/it]

Updated vocab size: 492 Updated sequence length: 1319227 New vocab: ' An' Token pair: (197, 68) Compression ratio: 1.06% 


 98%|█████████▊| 382/389 [11:42<00:11,  1.67s/it]

Updated vocab size: 493 Updated sequence length: 1318603 New vocab: 'én' Token pair: (94, 68) Compression ratio: 1.07% 


 98%|█████████▊| 383/389 [11:44<00:09,  1.66s/it]

Updated vocab size: 494 Updated sequence length: 1317980 New vocab: 'erm' Token pair: (122, 67) Compression ratio: 1.07% 


 99%|█████████▊| 384/389 [11:45<00:07,  1.58s/it]

Updated vocab size: 495 Updated sequence length: 1317357 New vocab: ' po' Token pair: (118, 69) Compression ratio: 1.07% 


 99%|█████████▉| 385/389 [11:47<00:06,  1.65s/it]

Updated vocab size: 496 Updated sequence length: 1316738 New vocab: 'ours' Token pair: (163, 73) Compression ratio: 1.07% 


 99%|█████████▉| 386/389 [11:48<00:04,  1.58s/it]

Updated vocab size: 497 Updated sequence length: 1316124 New vocab: ' ne' Token pair: (171, 59) Compression ratio: 1.07% 


 99%|█████████▉| 387/389 [11:50<00:03,  1.65s/it]

Updated vocab size: 498 Updated sequence length: 1315520 New vocab: ' peu' Token pair: (118, 267) Compression ratio: 1.07% 


100%|█████████▉| 388/389 [11:52<00:01,  1.65s/it]

Updated vocab size: 499 Updated sequence length: 1314917 New vocab: 'ét' Token pair: (94, 74) Compression ratio: 1.07% 


100%|██████████| 389/389 [11:53<00:00,  1.83s/it]


Updated vocab size: 500 Updated sequence length: 1314314 New vocab: 'ass' Token pair: (189, 73) Compression ratio: 1.07% 
Saving vocab...
Encoding...


100%|██████████| 9/9 [00:00<00:00, 1505.73it/s]

len original text: 40, len bpe text: 20
[[28, 115, 64, 163], [144], [141, 115, 207], [10], [210, 168], [184, 55], [154, 55], [243, 163, 68, 200], [0]]
Decoding...
Bonjour le monde, comment va ta journée<UNK>
CPU times: user 11min 53s, sys: 4.02 s, total: 11min 57s
Wall time: 11min 57s





In [15]:
if tokenizer_name == "MostCommonRegexTokenizer":
    counter = Counter(train_text)
    counter.most_common()

In [16]:
if tokenizer_name == "MostCommonRegexTokenizer":
    tokenized_text = tokenizer.encode(train_text)
    flat_tokenized_text = [item for sublist in tokenized_text for item in sublist]  # flatten code
    counter = Counter(flat_tokenized_text)
    print('\n'.join(f'key:{key:<8} str:{repr(tokenizer.int_to_text[key]):<10} count:{value}' for key, value in counter.most_common()))

Encoding...


100%|██████████| 557149/557149 [04:31<00:00, 2050.43it/s]


key:2        str:' '        count:33220
key:117      str:' de'      count:26341
key:10       str:','        count:25110
key:73       str:'s'        count:18508
key:55       str:'a'        count:16118
key:59       str:'e'        count:16074
key:112      str:'es'       count:14013
key:57       str:'c'        count:13260
key:63       str:'i'        count:13103
key:122      str:'er'       count:12889
key:74       str:'t'        count:12607
key:69       str:'o'        count:12014
key:134      str:' la'      count:11810
key:70       str:'p'        count:11781
key:12       str:'.'        count:11714
key:94       str:'é'        count:11611
key:113      str:' l'       count:11416
key:66       str:'l'        count:10687
key:115      str:'on'       count:10593
key:7        str:"'"        count:10443
key:75       str:'u'        count:10426
key:139      str:' et'      count:10232
key:121      str:' s'       count:10229
key:111      str:' d'       count:10169
key:61       str:'g'        count:9843
k