In [1]:
# What is Tokenization?
# Unstructured text data is difficult to understand and analyze. 
# Tokenization bridges this gap by breaking up the text into smaller units called tokens.
# These can be words, characters, or subwords, depending on the tokenization strategy.

In [2]:
# Different tokenization strategies:
# 1. Character Tokenization: every character of the input becomes its own token.
import torch
import torch.nn.functional as F

text = "Tokenizing text is very important in NLP Space."
tokenized_text = [char for char in text]
print("The tokenized text is: ", tokenized_text)

# A model needs integers rather than characters, so number the distinct
# characters in sorted order to build the vocabulary.
token2idx = dict(zip(sorted(set(tokenized_text)), range(len(set(tokenized_text)))))
print("This is the tokenized ids: ", token2idx)

# Look each character up in the vocabulary to get its unique integer id.
inputs_ids = list(map(token2idx.__getitem__, tokenized_text))
print("Mapping of each char to unique int: ", inputs_ids)

# Turn the ids into a tensor and expand every id into a one-hot vector whose
# width is the vocabulary size. `inputs_ids` is rebound from a list to a tensor here.
inputs_ids = torch.tensor(inputs_ids)
one_hot_encodings = F.one_hot(inputs_ids, num_classes=len(token2idx))
print(one_hot_encodings.shape)

# Each of the 47 input tokens is now a 24-dimensional one-hot vector;
# inspect the first token, its id, and its encoding.
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {inputs_ids[0]}")
print(f"One-hot: {one_hot_encodings[0]}")

The tokenized text is:  ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'v', 'e', 'r', 'y', ' ', 'i', 'm', 'p', 'o', 'r', 't', 'a', 'n', 't', ' ', 'i', 'n', ' ', 'N', 'L', 'P', ' ', 'S', 'p', 'a', 'c', 'e', '.']
This is the tokenized ids:  {' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'S': 5, 'T': 6, 'a': 7, 'c': 8, 'e': 9, 'g': 10, 'i': 11, 'k': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'r': 17, 's': 18, 't': 19, 'v': 20, 'x': 21, 'y': 22, 'z': 23}
Mapping of each char to unique int:  [6, 15, 12, 9, 14, 11, 23, 11, 14, 10, 0, 19, 9, 21, 19, 0, 11, 18, 0, 20, 9, 17, 22, 0, 11, 13, 16, 15, 17, 19, 7, 14, 19, 0, 11, 14, 0, 3, 2, 4, 0, 5, 16, 7, 8, 9, 1]
torch.Size([47, 24])
Token: T
Tensor index: 6
One-hot: tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [3]:
# Word Tokenization: split the sentence on whitespace so that each word is a token.
# Punctuation stays attached to its word (the recorded output ends with 'Space.').
tokenized_text = str.split(text)
print("Tokenized text: ", tokenized_text)

Tokenized text:  ['Tokenizing', 'text', 'is', 'very', 'important', 'in', 'NLP', 'Space.']


In [4]:
# Subword Tokenization: Breaks the text into subwords.
# Example: "Tokenizing" -> "Token" + "izing"

# Tokenizer from scratch

In [7]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Load the 'unshuffled_deduplicated_la' configuration of the 'oscar' dataset.
# Despite the name `df`, the displayed result is a DatasetDict with a single
# 'train' split, not a DataFrame.
df = datasets.load_dataset(
    path="oscar",
    name="unshuffled_deduplicated_la",
    trust_remote_code=True,
)

Downloading data: 100%|██████████| 81.0/81.0 [00:00<00:00, 34.4kB/s]
Downloading data: 100%|██████████| 3.42M/3.42M [00:01<00:00, 2.60MB/s]
Generating train split: 100%|██████████| 18808/18808 [00:00<00:00, 48476.58 examples/s]


In [12]:
# Display the loaded object to inspect its splits, features, and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 18808
    })
})

In [13]:
# Peek at the first record of the train split; the 'text' field contains
# embedded newline characters, as the displayed record shows.
df['train'][0]

{'id': 0,
 'text': 'Hæ sunt generationes Noë: Noë vir justus atque perfectus fuit in generationibus suis; cum Deo ambulavit.\nEcce ego adducam aquas diluvii super terram, ut interficiam omnem carnem, in qua spiritus vitæ est subter cælum: universa quæ in terra sunt, consumentur.\nTolles igitur tecum ex omnibus escis, quæ mandi possunt, et comportabis apud te: et erunt tam tibi, quam illis in cibum.'}

In [19]:
from tqdm.auto import tqdm

# Number of samples written to each plain-text shard file.
CHUNK_SIZE = 5000

text_data = []
file_count = 0

for sample in tqdm(df['train']):
    # Flatten each sample onto a single line: inside a shard file, '\n' is
    # used as the separator between samples.
    text_data.append(sample['text'].replace('\n', ' '))
    if len(text_data) == CHUNK_SIZE:
        # A full chunk has accumulated: write it out and start the next one.
        with open(f'oscar_text_data_{file_count}.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Flush whatever is left after the last full chunk. Skip the write when
# nothing remains (dataset size an exact multiple of CHUNK_SIZE), so that no
# empty shard file is created and later matched by the shard glob.
if text_data:
    with open(f'oscar_text_data_{file_count}.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(text_data))

100%|██████████| 18808/18808 [00:00<00:00, 29895.78it/s]


In [20]:
# Training the tokenizer
# Step 1: collect the text shards in the working directory as plain string paths.
from pathlib import Path

# Path.glob yields matches in no guaranteed order; sorting keeps the file list
# (and therefore the order in which files are handed to the trainer)
# reproducible across runs and platforms.
paths = sorted(str(shard) for shard in Path().glob("oscar_text_data_*.txt"))
paths

['oscar_text_data_0.txt',
 'oscar_text_data_1.txt',
 'oscar_text_data_2.txt',
 'oscar_text_data_3.txt']

In [21]:
from tokenizers import ByteLevelBPETokenizer

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Train it on the text shards listed in `paths`. The special tokens are given
# in a fixed order; the encoded example later in this notebook shows ids
# 0, 1 and 2 for "<s>", "<pad>" and "</s>" respectively.
tokenizer.train(
    files=paths,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    min_frequency=2,
    vocab_size=50_000,
)

In [22]:
# Persist the trained tokenizer. The target directory is created first (no
# error if it already exists); save_model then returns the paths it wrote
# (vocab.json and merges.txt in the recorded output).
import os

tokenizer_dir = "oscar_tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save_model(tokenizer_dir)

['oscar_tokenizer\\vocab.json', 'oscar_tokenizer\\merges.txt']

In [23]:
# Using the tokenizer
# Reload the files saved under "oscar_tokenizer" through the transformers
# fast-tokenizer class.
# NOTE: this rebinds `tokenizer`, which until now held the ByteLevelBPETokenizer
# trained above; from here on the name refers to the RobertaTokenizerFast.
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("oscar_tokenizer")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [24]:
# Sample Latin-like passage used to try out the trained tokenizer.
# Built sentence by sentence and joined with single spaces.
lorem_ipsum = " ".join([
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"
    " incididunt ut labore et dolore magna aliqua.",
    "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut"
    " aliquip ex ea commodo consequat.",
    "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu"
    " fugiat nulla pariatur.",
    "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia"
    " deserunt mollit anim id est laborum.",
])

In [25]:
# Encode the sample passage with the usual padding/truncation settings:
# pad up to max_length and truncate anything longer than it.
tokenizer(
    lorem_ipsum,
    truncation=True,
    padding="max_length",
    max_length=512,
)

{'input_ids': [0, 3835, 655, 1597, 463, 1790, 16, 2624, 3718, 3093, 16, 399, 3705, 13722, 3733, 16061, 330, 2225, 290, 1909, 1544, 1640, 18, 1297, 413, 320, 10160, 1919, 16, 634, 13292, 23610, 39958, 9074, 695, 330, 20797, 353, 508, 7510, 10485, 18, 14878, 37618, 40180, 1597, 285, 2065, 285, 1600, 1262, 361, 17113, 1909, 2514, 2072, 1088, 2527, 18, 15399, 28646, 910, 24426, 30158, 312, 20788, 16, 338, 285, 2529, 367, 2571, 3044, 17737, 581, 464, 297, 3562, 18, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## GG!
### We built a custom tokenizer from scratch, trained on the Latin subset of the huge OSCAR dataset.