# Sources:
1 - https://karpathy.ai/zero-to-hero.html  
2 - https://www.youtube.com/watch?v=kCc8FmEb1nY  
3 - https://github.com/karpathy/minbpe/tree/master  

# Import libraries

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import time
import re

# Parameters

In [2]:
# Dataset parameters
train_ratio = 0.9
sample_size = 10

SEED = 15

# Load files

In [3]:
from datasets import load_dataset

# Define the path to the dataset
dataset_name = "20231101.fr"

# Load the dataset
raw_dataset = load_dataset("wikimedia/wikipedia", dataset_name)

  from .autonotebook import tqdm as notebook_tqdm
Resolving data files: 100%|██████████| 17/17 [00:02<00:00,  7.29it/s]


# Data preparation

In [4]:
raw_dataset['train'].num_rows

2564646

In [5]:
# Create training and evaluation datasets
if sample_size < 0:
    train_sample = round(raw_dataset['train'].num_rows * train_ratio)
    test_sample = round(raw_dataset['train'].num_rows * (1- train_ratio))
else:
    train_sample = round(sample_size * train_ratio)
    test_sample = round(sample_size * (1- train_ratio))

ds_train_test = raw_dataset['train'].train_test_split(train_size=train_sample, test_size=test_sample, seed=SEED)
train_text = ''.join([t['text'] for t in ds_train_test['train']])

In [6]:
# # Create the vocabulary from the training data
# def create_vocab(text, type="char"):
#     if type == "char":
#         sorted_text = sorted(list(set(text)))
#     elif type == "word":
#         sorted_text = sorted(list(set(text)))
#     else:
#         raise Exception("Invalid type")
    
#     text_to_int = {c: i for i, c in enumerate(sorted_text)}
#     int_to_text = {i: c for i, c in enumerate(sorted_text)}

#     return text_to_int, int_to_text
    

# def char_encoder(text, text_to_int):
#     return [text_to_int[char] for char in text]


# def get_most_common_token_pair(tokens, verbose=False, int_to_text=None):
#     d = {}
#     for pairs in tqdm(zip(tokens, tokens[1:])):
#         d[pairs] = d.get(pairs, 0) + 1

#     # Get the key having the highest value (higher frequency of occurrence)
#     most_freq = max(d, key=lambda k: d[k])
#     if verbose:
#         print(most_freq)
#         most_freq_text = "".join([int_to_text[i] for i in most_freq])
#         print(f'Most frequent word: {most_freq} --> "{most_freq_text}"')
#     return most_freq

In [7]:
# max_vocab_size = 150
# text_to_int, int_to_text = create_vocab(train_text, "char")
# train_tokens = char_encoder(train_text, text_to_int)    
# train_tokens_temp = train_tokens.copy()
# vocab_size = len(text_to_int)
# print("Vocab size:", vocab_size, "Sequence length:", len(train_tokens_temp))

# while vocab_size < max_vocab_size:
#     # Get the most frequent token pair
#     most_freq = get_most_common_token_pair(train_tokens_temp, True, int_to_text)
    
#     # Update the vocabulary
#     token_pair = (most_freq[0], most_freq[1])
#     str_pair = ''.join([int_to_text[most_freq[0]], int_to_text[most_freq[1]]])
#     text_to_int[str_pair] = vocab_size
#     int_to_text[vocab_size] = str_pair

#     # Replace the most frequent token pair with a new one
#     i = 0
#     new_train_token = []
#     while i < len(train_tokens_temp) - 1:
#         if (train_tokens_temp[i], train_tokens_temp[i+1]) == token_pair:
#             new_train_token.append(text_to_int[str_pair])
#             i += 2
#         else:
#             new_train_token.append(train_tokens_temp[i])
#             i += 1
    
#     # Print the statistics
#     vocab_size = len(text_to_int)
#     print("Vocab size:", vocab_size, "Sequence length:", len(new_train_token))
#     train_tokens_temp = new_train_token
    


In [8]:
class BasicTokenizer():
    def __init__(self, ):
        pass
        
    def train(self, text, max_vocab_size, verbose=False):
        self.text_to_int, self.int_to_text = self._create_vocab(text)
        train_tokens = self._char_encoder(text, self.text_to_int)    
        train_tokens_temp = train_tokens.copy()
        current_vocab_size = len(self.text_to_int)
        
        if verbose: 
            print("Vocab size:", current_vocab_size, "Sequence length:", len(train_tokens_temp))
        
        while current_vocab_size < max_vocab_size:
            # Get the most frequent token pair
            most_freq = self._get_most_common_token_pair(train_tokens_temp, True, self.int_to_text)
            
            # Update the vocabulary
            token_pair = (most_freq[0], most_freq[1])
            str_pair = ''.join([self.int_to_text[most_freq[0]], self.int_to_text[most_freq[1]]])
            self.text_to_int[str_pair] = current_vocab_size
            self.int_to_text[current_vocab_size] = str_pair

            # Replace the most frequent token pair with a new one
            i = 0
            new_train_token = []
            while i < len(train_tokens_temp) - 1:
                if (train_tokens_temp[i], train_tokens_temp[i+1]) == token_pair:
                    new_train_token.append(self.text_to_int[str_pair])
                    i += 2
                else:
                    new_train_token.append(train_tokens_temp[i])
                    i += 1
            
            # Print the statistics
            if verbose:
                current_vocab_size = len(self.text_to_int)
                print("Vocab size:", current_vocab_size, "Sequence length:", len(new_train_token))
                
            train_tokens_temp = new_train_token
            
    # Create the vocabulary from the training data
    def _create_vocab(self, text):
        sorted_text = sorted(list(set(text)))
        text_to_int = {c: i for i, c in enumerate(sorted_text)}
        int_to_text = {i: c for i, c in enumerate(sorted_text)}
        return text_to_int, int_to_text
        
    def _char_encoder(self, text, text_to_int):
        return [text_to_int[char] for char in text]

    def _get_most_common_token_pair(self, tokens, verbose=False, int_to_text=None):
        d = {}
        for pairs in tqdm(zip(tokens, tokens[1:])):
            d[pairs] = d.get(pairs, 0) + 1

        # Get the key having the highest value (higher frequency of occurrence)
        most_freq = max(d, key=lambda k: d[k])
        if verbose:
            print(most_freq)
            most_freq_text = "".join([int_to_text[i] for i in most_freq])
            print(f'Most frequent word: {most_freq} --> "{most_freq_text}"')
        return most_freq
    
    def encode(self, text):
        return self._char_encoder(text, self.text_to_int)
    
    def decode(self, tokens):
        return "".join([self.int_to_text[i] for i in tokens])

In [9]:
tokenizer = BasicTokenizer()
tokenizer.train(train_text, max_vocab_size=200, verbose=True)

Vocab size: 94 Sequence length: 35611


35610it [00:00, 2099923.59it/s]


(54, 2)
Most frequent word: (54, 2) --> "e "
Vocab size: 95 Sequence length: 34075


34074it [00:00, 1798305.27it/s]


(68, 2)
Most frequent word: (68, 2) --> "s "
Vocab size: 96 Sequence length: 33267


33266it [00:00, 1899318.24it/s]


(64, 63)
Most frequent word: (64, 63) --> "on"
Vocab size: 97 Sequence length: 32749


32748it [00:00, 1958354.49it/s]


(69, 2)
Most frequent word: (69, 2) --> "t "
Vocab size: 98 Sequence length: 32255


32254it [00:00, 2186145.91it/s]


(53, 94)
Most frequent word: (53, 94) --> "de "
Vocab size: 99 Sequence length: 31819


31818it [00:00, 3751668.86it/s]


(50, 63)
Most frequent word: (50, 63) --> "an"
Vocab size: 100 Sequence length: 31385


31384it [00:00, 2390130.31it/s]


(54, 63)
Most frequent word: (54, 63) --> "en"
Vocab size: 101 Sequence length: 30953


30952it [00:00, 2134003.41it/s]


(54, 95)
Most frequent word: (54, 95) --> "es "
Vocab size: 102 Sequence length: 30590


30589it [00:00, 1334965.87it/s]


(69, 58)
Most frequent word: (69, 58) --> "ti"
Vocab size: 103 Sequence length: 30271


30270it [00:00, 1537418.80it/s]


(6, 2)
Most frequent word: (6, 2) --> ", "
Vocab size: 104 Sequence length: 29972


29971it [00:00, 1416071.34it/s]


(58, 63)
Most frequent word: (58, 63) --> "in"
Vocab size: 105 Sequence length: 29675


29674it [00:00, 1858137.66it/s]


(54, 67)
Most frequent word: (54, 67) --> "er"
Vocab size: 106 Sequence length: 29386


29385it [00:00, 2923863.62it/s]


(50, 2)
Most frequent word: (50, 2) --> "a "
Vocab size: 107 Sequence length: 29099


29098it [00:00, 1315900.87it/s]


(66, 70)
Most frequent word: (66, 70) --> "qu"
Vocab size: 108 Sequence length: 28842


28841it [00:00, 2041652.69it/s]


(70, 67)
Most frequent word: (70, 67) --> "ur"
Vocab size: 109 Sequence length: 28602


28601it [00:00, 2851265.39it/s]


(70, 63)
Most frequent word: (70, 63) --> "un"
Vocab size: 110 Sequence length: 28372


28371it [00:00, 1981229.38it/s]

(69, 67)
Most frequent word: (69, 67) --> "tr"
Vocab size: 111 Sequence length: 28167



28166it [00:00, 2775378.62it/s]


(61, 106)
Most frequent word: (61, 106) --> "la "
Vocab size: 112 Sequence length: 27964


27963it [00:00, 2795502.87it/s]


(61, 94)
Most frequent word: (61, 94) --> "le "
Vocab size: 113 Sequence length: 27762


27761it [00:00, 1084254.34it/s]


(50, 67)
Most frequent word: (50, 67) --> "ar"
Vocab size: 114 Sequence length: 27562


27561it [00:00, 1761941.39it/s]


(2, 53)
Most frequent word: (2, 53) --> " d"
Vocab size: 115 Sequence length: 27373


27372it [00:00, 2725376.60it/s]


(58, 68)
Most frequent word: (58, 68) --> "is"
Vocab size: 116 Sequence length: 27185


27184it [00:00, 2717106.97it/s]


(54, 68)
Most frequent word: (54, 68) --> "es"
Vocab size: 117 Sequence length: 26998


26997it [00:00, 1242673.20it/s]


(94, 98)
Most frequent word: (94, 98) --> "e de "
Vocab size: 118 Sequence length: 26825


26824it [00:00, 2668975.91it/s]


(64, 67)
Most frequent word: (64, 67) --> "or"
Vocab size: 119 Sequence length: 26659


26658it [00:00, 1036493.68it/s]

(64, 62)
Most frequent word: (64, 62) --> "om"
Vocab size: 120 Sequence length: 26509



26508it [00:00, ?it/s]


(54, 61)
Most frequent word: (54, 61) --> "el"
Vocab size: 121 Sequence length: 26365


26364it [00:00, 2630382.04it/s]


(96, 2)
Most frequent word: (96, 2) --> "on "
Vocab size: 122 Sequence length: 26223


26222it [00:00, 2483359.82it/s]


(65, 67)
Most frequent word: (65, 67) --> "pr"
Vocab size: 123 Sequence length: 26084


26083it [00:00, 1362374.46it/s]


(54, 52)
Most frequent word: (54, 52) --> "ec"
Vocab size: 124 Sequence length: 25949


25948it [00:00, 1908193.22it/s]


(68, 58)
Most frequent word: (68, 58) --> "si"
Vocab size: 125 Sequence length: 25818


25817it [00:00, 1003301.70it/s]


(86, 2)
Most frequent word: (86, 2) --> "é "
Vocab size: 126 Sequence length: 25688


25687it [00:00, 999657.50it/s]


(50, 69)
Most frequent word: (50, 69) --> "at"
Vocab size: 127 Sequence length: 25558


25557it [00:00, 2520902.76it/s]


(50, 61)
Most frequent word: (50, 61) --> "al"
Vocab size: 128 Sequence length: 25429


25428it [00:00, 2540500.75it/s]


(50, 70)
Most frequent word: (50, 70) --> "au"
Vocab size: 129 Sequence length: 25303


25302it [00:00, 2528634.94it/s]


(54, 62)
Most frequent word: (54, 62) --> "em"
Vocab size: 130 Sequence length: 25184


25183it [00:00, 2517102.15it/s]


(1, 1)
Most frequent word: (1, 1) --> "

"
Vocab size: 131 Sequence length: 25066


25065it [00:00, 1247273.99it/s]


(54, 97)
Most frequent word: (54, 97) --> "et "
Vocab size: 132 Sequence length: 24949


24948it [00:00, 2374931.82it/s]


(50, 58)
Most frequent word: (50, 58) --> "ai"
Vocab size: 133 Sequence length: 24832


24831it [00:00, 1208250.34it/s]


(1, 2)
Most frequent word: (1, 2) --> "
 "
Vocab size: 134 Sequence length: 24719


24718it [00:00, 3565158.40it/s]


(64, 70)
Most frequent word: (64, 70) --> "ou"
Vocab size: 135 Sequence length: 24608


24607it [00:00, ?it/s]

(100, 97)
Most frequent word: (100, 97) --> "ent "
Vocab size: 136 Sequence length: 24497



24496it [00:00, 2413352.85it/s]


(99, 95)
Most frequent word: (99, 95) --> "ans "
Vocab size: 137 Sequence length: 24387


24386it [00:00, 236839.69it/s]


(58, 61)
Most frequent word: (58, 61) --> "il"
Vocab size: 138 Sequence length: 24277


24276it [00:00, 5729288.99it/s]


(100, 2)
Most frequent word: (100, 2) --> "en "
Vocab size: 139 Sequence length: 24170


24169it [00:00, 2661035.13it/s]


(94, 53)
Most frequent word: (94, 53) --> "e d"
Vocab size: 140 Sequence length: 24065


24064it [00:00, 1474553.78it/s]


(50, 102)
Most frequent word: (50, 102) --> "ati"
Vocab size: 141 Sequence length: 23961


23960it [00:00, 3639032.58it/s]


(61, 3)
Most frequent word: (61, 3) --> "l'"
Vocab size: 142 Sequence length: 23858


23857it [00:00, 2789537.80it/s]


(58, 69)
Most frequent word: (58, 69) --> "it"
Vocab size: 143 Sequence length: 23759


23758it [00:00, 1669849.59it/s]


(52, 57)
Most frequent word: (52, 57) --> "ch"
Vocab size: 144 Sequence length: 23663


23662it [00:00, 2334037.80it/s]


(82, 2)
Most frequent word: (82, 2) --> "à "
Vocab size: 145 Sequence length: 23567


23566it [00:00, 1785877.61it/s]


(8, 2)
Most frequent word: (8, 2) --> ". "
Vocab size: 146 Sequence length: 23472


23471it [00:00, 6614116.45it/s]


(86, 67)
Most frequent word: (86, 67) --> "ér"
Vocab size: 147 Sequence length: 23377


23376it [00:00, ?it/s]


(108, 2)
Most frequent word: (108, 2) --> "ur "
Vocab size: 148 Sequence length: 23291


23290it [00:00, 1372891.38it/s]


(61, 101)
Most frequent word: (61, 101) --> "les "
Vocab size: 149 Sequence length: 23205


23204it [00:00, 2283329.35it/s]


(61, 64)
Most frequent word: (61, 64) --> "lo"
Vocab size: 150 Sequence length: 23121


23120it [00:00, 1430227.85it/s]


(58, 107)
Most frequent word: (58, 107) --> "iqu"
Vocab size: 151 Sequence length: 23037


23036it [00:00, 1471318.08it/s]


(86, 61)
Most frequent word: (86, 61) --> "él"
Vocab size: 152 Sequence length: 22955


22954it [00:00, 786974.13it/s]


(109, 2)
Most frequent word: (109, 2) --> "un "
Vocab size: 153 Sequence length: 22876


22875it [00:00, 1748773.40it/s]


(65, 113)
Most frequent word: (65, 113) --> "par"
Vocab size: 154 Sequence length: 22798


22797it [00:00, 1064238.23it/s]

(8, 130)





Most frequent word: (8, 130) --> ".

"
Vocab size: 155 Sequence length: 22721


22720it [00:00, 1956125.03it/s]


(65, 64)
Most frequent word: (65, 64) --> "po"
Vocab size: 156 Sequence length: 22644


22643it [00:00, 1158487.24it/s]


(122, 64)
Most frequent word: (122, 64) --> "pro"
Vocab size: 157 Sequence length: 22567


22566it [00:00, 1877167.53it/s]


(109, 94)
Most frequent word: (109, 94) --> "une "
Vocab size: 158 Sequence length: 22491


22490it [00:00, 1930023.47it/s]


(67, 86)
Most frequent word: (67, 86) --> "ré"
Vocab size: 159 Sequence length: 22418


22417it [00:00, 1873019.64it/s]


(50, 62)
Most frequent word: (50, 62) --> "am"
Vocab size: 160 Sequence length: 22345


22344it [00:00, 2240438.17it/s]


(54, 103)
Most frequent word: (54, 103) --> "e, "
Vocab size: 161 Sequence length: 22277


22276it [00:00, 1765638.94it/s]


(2, 98)
Most frequent word: (2, 98) --> " de "
Vocab size: 162 Sequence length: 22209


22208it [00:00, 1735520.17it/s]


(52, 58)
Most frequent word: (52, 58) --> "ci"
Vocab size: 163 Sequence length: 22144


22143it [00:00, 2170522.18it/s]


(96, 95)
Most frequent word: (96, 95) --> "ons "
Vocab size: 164 Sequence length: 22080


22079it [00:00, 1479778.81it/s]


(52, 119)
Most frequent word: (52, 119) --> "com"
Vocab size: 165 Sequence length: 22016


22015it [00:00, 1835409.22it/s]


(116, 97)
Most frequent word: (116, 97) --> "est "
Vocab size: 166 Sequence length: 21953


21952it [00:00, 2194208.13it/s]


(105, 2)
Most frequent word: (105, 2) --> "er "
Vocab size: 167 Sequence length: 21890


21889it [00:00, 1719271.91it/s]


(50, 56)
Most frequent word: (50, 56) --> "ag"
Vocab size: 168 Sequence length: 21829


21828it [00:00, 1931951.88it/s]


(52, 96)
Most frequent word: (52, 96) --> "con"
Vocab size: 169 Sequence length: 21768


21767it [00:00, 1629060.10it/s]

(65, 61)
Most frequent word: (65, 61) --> "pl"
Vocab size: 170 Sequence length: 21707



21706it [00:00, 1921072.83it/s]


(99, 69)
Most frequent word: (99, 69) --> "ant"
Vocab size: 171 Sequence length: 21648


21647it [00:00, 2320200.83it/s]


(12, 10)
Most frequent word: (12, 10) --> "20"
Vocab size: 172 Sequence length: 21590


21589it [00:00, 16089344.18it/s]


(50, 71)
Most frequent word: (50, 71) --> "av"
Vocab size: 173 Sequence length: 21533


21532it [00:00, 1364061.05it/s]


(70, 2)
Most frequent word: (70, 2) --> "u "
Vocab size: 174 Sequence length: 21477


21476it [00:00, 1935763.28it/s]


(50, 52)
Most frequent word: (50, 52) --> "ac"
Vocab size: 175 Sequence length: 21421


21420it [00:00, 1391518.36it/s]


(67, 94)
Most frequent word: (67, 94) --> "re "
Vocab size: 176 Sequence length: 21365


21364it [00:00, 2133909.09it/s]


(99, 97)
Most frequent word: (99, 97) --> "ant "
Vocab size: 177 Sequence length: 21311


21310it [00:00, 1335454.26it/s]


(54, 108)
Most frequent word: (54, 108) --> "eur"
Vocab size: 178 Sequence length: 21257


21256it [00:00, 1751692.19it/s]

(107, 94)
Most frequent word: (107, 94) --> "que "
Vocab size: 179 Sequence length: 21203



21202it [00:00, 1311249.55it/s]


(54, 73)
Most frequent word: (54, 73) --> "ex"
Vocab size: 180 Sequence length: 21150


21149it [00:00, 1370855.77it/s]


(151, 123)
Most frequent word: (151, 123) --> "élec"
Vocab size: 181 Sequence length: 21097


21096it [00:00, 1673944.59it/s]


(52, 117)
Most frequent word: (52, 117) --> "ce de "
Vocab size: 182 Sequence length: 21044


21043it [00:00, 2099047.26it/s]


(180, 110)
Most frequent word: (180, 110) --> "électr"
Vocab size: 183 Sequence length: 20992


20991it [00:00, 2183813.75it/s]


(129, 135)
Most frequent word: (129, 135) --> "ement "
Vocab size: 184 Sequence length: 20941


20940it [00:00, 1918411.73it/s]


(50, 68)
Most frequent word: (50, 68) --> "as"
Vocab size: 185 Sequence length: 20890


20889it [00:00, 1856914.91it/s]

(58, 2)
Most frequent word: (58, 2) --> "i "
Vocab size: 186 Sequence length: 20841



20840it [00:00, 2147806.85it/s]


(64, 58)
Most frequent word: (64, 58) --> "oi"
Vocab size: 187 Sequence length: 20792


20791it [00:00, 2150259.51it/s]


(50, 65)
Most frequent word: (50, 65) --> "ap"
Vocab size: 188 Sequence length: 20743


20742it [00:00, 2165428.45it/s]


(153, 2)
Most frequent word: (153, 2) --> "par "
Vocab size: 189 Sequence length: 20695


20694it [00:00, 2227161.22it/s]


(136, 111)
Most frequent word: (136, 111) --> "ans la "
Vocab size: 190 Sequence length: 20647


20646it [00:00, 2130168.27it/s]


(71, 104)
Most frequent word: (71, 104) --> "vin"
Vocab size: 191 Sequence length: 20599


20598it [00:00, 2010478.31it/s]


(156, 190)
Most frequent word: (156, 190) --> "provin"
Vocab size: 192 Sequence length: 20553


20552it [00:00, 2261967.93it/s]


(70, 61)
Most frequent word: (70, 61) --> "ul"
Vocab size: 193 Sequence length: 20508


20507it [00:00, 2003507.77it/s]


(53, 58)
Most frequent word: (53, 58) --> "di"
Vocab size: 194 Sequence length: 20463


20462it [00:00, 1984458.20it/s]


(86, 69)
Most frequent word: (86, 69) --> "ét"
Vocab size: 195 Sequence length: 20418


20417it [00:00, 1919639.20it/s]


(164, 62)
Most frequent word: (164, 62) --> "comm"
Vocab size: 196 Sequence length: 20373


20372it [00:00, 1971353.85it/s]


(50, 51)
Most frequent word: (50, 51) --> "ab"
Vocab size: 197 Sequence length: 20330


20329it [00:00, 1722197.66it/s]


(55, 55)
Most frequent word: (55, 55) --> "ff"
Vocab size: 198 Sequence length: 20287


20286it [00:00, 1719874.90it/s]


(54, 56)
Most frequent word: (54, 56) --> "eg"
Vocab size: 199 Sequence length: 20244


20243it [00:00, 1803502.61it/s]

(191, 181)
Most frequent word: (191, 181) --> "province de "
Vocab size: 200 Sequence length: 20201





In [10]:
tokenizer.encode('Bonjour')

[25, 64, 63, 59, 64, 70, 67]

In [11]:
tokenizer.decode([25, 64, 63, 59, 64, 70, 67])

'Bonjour'