# Tokenization for South African Languages

This notebook demonstrates the process of tokenizing text data from various South African languages using a language-optimized Byte Pair Encoding (BPE) tokenizer.

## Setup and Imports

We start by importing the necessary libraries and setting up the environment.

In [2]:
from collections import Counter
from itertools import islice
from pathlib import Path

from tokenizers import Tokenizer, trainers, pre_tokenizers, decoders
from tokenizers.models import BPE
from tqdm import tqdm

# Load and process text data


*Load text data and prepend language-specific tokens.

    Args:
        data_paths (dict): Dictionary with data file paths for each language and split.
        language_tokens (dict): Mapping of languages to language-specific tokens.
        max_train_sequences (int, optional): Maximum number of sequences for training. Defaults to None.
        max_val_sequences (int, optional): Maximum number of sequences for validation. Defaults to None.
        max_test_sequences (int, optional): Maximum number of sequences for testing. Defaults to None.

    Returns:
        tuple: Dictionaries containing loaded text data for each split (train, validation, test).*

In [3]:
def read_texts(data_paths, language_tokens, max_train_sequences=None, max_val_sequences=None, max_test_sequences=None):
    """Load text data for every split and prepend the language token to each line.

    Args:
        data_paths (dict): Maps a split name ("train", "validation" or "test")
            to a ``{language: file_path}`` dict. Splits with any other name are
            skipped because there is no return slot for them.
        language_tokens (dict): Maps each language to the token that is
            prepended (followed by a single space) to every line.
        max_train_sequences (int, optional): Maximum number of lines read per
            training file. ``None`` (or 0) means no limit.
        max_val_sequences (int, optional): Same cap for validation files.
        max_test_sequences (int, optional): Same cap for test files.

    Returns:
        tuple: ``(train_texts, val_texts, test_texts)``. Each element is a
        ``{language: [text, ...]}`` dict; a split that is absent from
        ``data_paths`` yields an empty dict.

    Raises:
        KeyError: If a language in ``data_paths`` has no entry in ``language_tokens``.
        OSError: If a data file cannot be opened.
    """
    limits = {
        "train": max_train_sequences,
        "validation": max_val_sequences,
        "test": max_test_sequences,
    }
    texts_by_split = {split_name: {} for split_name in limits}

    for split, split_paths in data_paths.items():
        if split not in limits:
            # Without this guard an unknown split would either hit an unbound
            # variable or silently reuse lines left over from the previous file.
            continue

        limit = limits[split]
        split_texts = {}
        for language, file_path in split_paths.items():
            language_token = language_tokens[language]
            with Path(file_path).open("r", encoding="utf-8") as f:
                if limit and limit > 0:
                    # Stop reading once the cap is reached instead of loading
                    # the whole file and discarding most of it.
                    lines = list(islice(f, limit))
                else:
                    lines = f.readlines()
                    if limit:
                        # Negative cap: keep plain slice semantics.
                        lines = lines[:limit]

            split_texts[language] = [language_token + " " + line.strip() for line in lines]

        texts_by_split[split] = split_texts

    return texts_by_split["train"], texts_by_split["validation"], texts_by_split["test"]

*We define the paths for the training, validation, and test datasets and specify the language tokens.*

In [7]:
# Language name -> token prepended to every line of that language.
language_tokens = {
    "sesotho": "<st>",
    "setswana": "<tn>",
    "xhosa": "<xh>",
    "xitsonga": "<ts>",
    "zulu": "<zu>",
}

# Single place to edit when the dataset is mounted somewhere else.
DATA_DIR = "/kaggle/input/tokenization/South_African_Languages_Preprocessed_Datasets/South_African_Languages_Preprocessed_Datasets"
SPLITS = ("train", "validation", "test")

# Every file is named "<language>_<split>.txt" inside DATA_DIR.
data_files = {
    split: {language: f"{DATA_DIR}/{language}_{split}.txt" for language in language_tokens}
    for split in SPLITS
}

# Report every missing file at once instead of failing on the first one.
missing_files = [
    file_path
    for split_paths in data_files.values()
    for file_path in split_paths.values()
    if not Path(file_path).is_file()
]
if missing_files:
    raise FileNotFoundError(
        "Dataset files not found (check DATA_DIR):\n" + "\n".join(missing_files)
    )

# Per-file caps on the number of lines loaded for each split.
train_sequences = 100000
validation_sequences = 1000
test_sequences = 1000

train_texts, validation_texts, test_texts = read_texts(
    data_files,
    language_tokens,
    max_train_sequences=train_sequences,
    max_val_sequences=validation_sequences,
    max_test_sequences=test_sequences,
)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/tokenization/South_African_Languages_Preprocessed_Datasets/South_African_Languages_Preprocessed_Datasets/sesotho_train.txt'

# Identify common words in the corpus

*Identify common words in the training corpus, filtering out language-specific tokens
    and ensuring a minimum word length.

    Args:
        train_texts (dict): Dictionary of training texts for each language.
        num_common_words (int): Number of common words to identify.
        language_tokens (list): List of language-specific tokens to exclude.
        min_word_length (int, optional): Minimum number of characters a word must contain. Defaults to 2.

    Returns:
        list: List of common words.*

In [None]:
def identify_common_words(train_texts, num_common_words, language_tokens, min_word_length=2):
    """Return the most frequent whitespace-separated words of the training corpus.

    Words are ranked by frequency over all languages. Language tokens and words
    shorter than ``min_word_length`` are skipped *before* the cut-off is
    applied, so up to ``num_common_words`` eligible words are returned.

    Args:
        train_texts (dict): ``{language: [text, ...]}`` training texts.
        num_common_words (int): Maximum number of words to return. ``None``
            returns every eligible word.
        language_tokens (list or dict): Language-specific tokens to exclude.
            When a ``{language: token}`` mapping is given, its values (the
            token strings) are the ones excluded.
        min_word_length (int, optional): Minimum number of characters a word
            must contain. Defaults to 2.

    Returns:
        list: Eligible words ordered from most to least frequent.
    """
    # A mapping's membership test looks at its keys (language names), which
    # would let the tokens themselves through, so compare against its values.
    if isinstance(language_tokens, dict):
        excluded = set(language_tokens.values())
    else:
        excluded = set(language_tokens)

    counter = Counter()
    for texts in train_texts.values():
        for text in texts:
            counter.update(text.split())

    common_words = []
    for word, _ in counter.most_common():
        if num_common_words is not None and len(common_words) >= num_common_words:
            break
        if word not in excluded and len(word) >= min_word_length:
            common_words.append(word)
    return common_words

In [None]:
num_of_common_words = 50  # Number of common words
min_word_length = 4  # Minimum number of characters a word must contain

# Pass the token strings themselves ("<st>", "<zu>", ...). `language_tokens` is a
# language -> token mapping, and a membership test on the dict would only
# compare words against the language names.
common_words = identify_common_words(
    train_texts,
    num_of_common_words,
    list(language_tokens.values()),
    min_word_length,
)
    

# Initialize and configure the tokenizer with common words

*Initialize and configure the tokenizer with special tokens and common words.

    Args:
        special_tokens (list): List of special tokens to add to the tokenizer.
        common_words (list): List of common words to initialize the vocabulary with.

    Returns:
        Tokenizer: Configured tokenizer instance.*

In [None]:
def initialize_tokenizer(special_tokens, common_words):
    """Build an untrained BPE tokenizer pre-loaded with special tokens and common words.

    Args:
        special_tokens (list): Tokens registered through ``add_special_tokens``.
        common_words (list): Words registered through ``add_tokens``.

    Returns:
        Tokenizer: The configured tokenizer instance.
    """
    bpe_tokenizer = Tokenizer(BPE())

    # Pipeline components: text is split on whitespace before the model runs.
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    # NOTE(review): a WordPiece decoder is attached to a BPE model here;
    # confirm that decoding reconstructs text as intended.
    bpe_tokenizer.decoder = decoders.WordPiece()

    # Special tokens go in first, then the common words.
    bpe_tokenizer.add_special_tokens(special_tokens)
    bpe_tokenizer.add_tokens(common_words)

    return bpe_tokenizer


In [None]:
# Language tokens first, followed by the generic control tokens.
special_tokens = list(language_tokens.values()) + ["<unk>", "<pad>", "<mask>", "<s>", "</s>", "[UNK]"]

# Create the tokenizer that the training, saving and encoding cells below use;
# without this assignment those cells fail with a NameError on a fresh kernel.
BPE_Optimized_Tokenizer = initialize_tokenizer(special_tokens, common_words)


# Train Tokenizer

*Train the tokenizer with the provided training texts.

    Args:
        tokenizer (Tokenizer): The tokenizer to be trained.
        train_texts (dict): Dictionary of training texts for each language.
        special_tokens (list): List of special tokens.
        max_vocab_size (int): Maximum vocabulary size.
        min_frequency (int): Minimum frequency for BPE merges.
        num_epochs (int, optional): Number of epochs for training. Defaults to 1.*

In [None]:
def train_tokenizer(tokenizer, train_texts, special_tokens, max_vocab_size, min_frequency, num_epochs=1):
    """Train ``tokenizer`` in place on the texts of every language.

    Args:
        tokenizer (Tokenizer): The tokenizer to be trained.
        train_texts (dict): ``{language: [text, ...]}`` training texts.
        special_tokens (list): Special tokens handed to the BPE trainer.
        max_vocab_size (int): Vocabulary size passed to the trainer.
        min_frequency (int): Minimum frequency passed to the trainer.
        num_epochs (int, optional): Number of training passes. Defaults to 1.

    Returns:
        None
    """
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=max_vocab_size,
        min_frequency=min_frequency,
        special_tokens=special_tokens,
        show_progress=True,
    )

    def _all_texts():
        # Flatten {language: [text, ...]} into a single stream of texts.
        for lang_texts in train_texts.values():
            yield from lang_texts

    # NOTE(review): every pass calls train_from_iterator again on the same
    # data; confirm that num_epochs > 1 is meaningful for BPE training.
    progress = tqdm(total=num_epochs, desc="Training", unit="epoch")
    with progress:
        for _ in range(num_epochs):
            tokenizer.train_from_iterator(_all_texts(), trainer=bpe_trainer)
            progress.update(1)


In [None]:
# Training hyper-parameters.
vocab_size = 1000
merging_threshold = 2

train_tokenizer(
    tokenizer=BPE_Optimized_Tokenizer,
    train_texts=train_texts,
    special_tokens=special_tokens,
    max_vocab_size=vocab_size,
    min_frequency=merging_threshold,
)


# Save the trained tokenizer

*Save the trained tokenizer to the specified path.

    Args:
        tokenizer (Tokenizer): The tokenizer to save.
        path (str): The path to save the tokenizer to.*

In [None]:

def save_tokenizer(tokenizer, path):
    """Save the trained tokenizer to ``path``.

    Args:
        tokenizer (Tokenizer): The tokenizer to save; only its ``save`` method is used.
        path (str or pathlib.Path): Destination file. A ``Path`` is converted
            to a plain string before being handed to ``tokenizer.save``; a
            string is passed through unchanged.

    Returns:
        None
    """
    target = Path(path)
    # Make sure the destination folder exists so saving does not fail on it.
    target.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(path if isinstance(path, str) else str(target))

In [None]:
 save_dir = Path("/kaggle/working/")
    save_dir.mkdir(parents=True, exist_ok=True)
    BPE_Optimized_Tokenizer.save(str(save_dir / "Tokenizer(Language_Optimized)"))

# Encoding a Text

*We demonstrate the tokenization process with an example text.*

In [None]:
# Encode one sample sentence and show the tokens it is split into.
text = "Sanibonani Emakhaya."
encoding = BPE_Optimized_Tokenizer.encode(text)
tokenized_text = encoding.tokens

print("Original sequence:", text)
print("Tokenized sequence:", tokenized_text)