In [1]:
import os
from datasets import load_dataset, concatenate_datasets
from tokenizers.models import WordLevel, BPE
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
import re
import os
import hashlib
import time
import multiprocessing
import argparse
from datasets import load_from_disk
from lingua import Language, LanguageDetectorBuilder
import epitran
from functools import lru_cache
from transformers import PreTrainedTokenizerFast

In [2]:
# Load the "wikitext-103-raw-v1" configuration of the "wikitext" dataset and
# display its splits (train / validation / test, one 'text' column each).
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [3]:
# Number of worker processes handed to Dataset.map(num_proc=...).
# Leave one core free for the notebook itself, but never go below 1:
# on a single-core machine cpu_count() - 1 would be 0, which is not a valid
# worker count.
num_processes = max(1, multiprocessing.cpu_count() - 1)
# Exact duplicate removal (on individual sentences/paragraphs)
def remove_exact_duplicates(examples):
    """Drop repeated texts from one batch, keeping the first occurrence of each.

    Args:
        examples: batch dict with a "text" list of strings.

    Returns:
        A dict with a "text" list holding the unique strings in their original
        order. Uniqueness is decided on the MD5 digest of the encoded text.

    Note:
        The lookup table is rebuilt on every call, so duplicates are only
        detected inside a single batch, not across batches.
    """
    first_seen = {}
    for text in examples["text"]:
        digest = hashlib.md5(text.encode()).hexdigest()
        # setdefault keeps the earliest text for a digest; dicts preserve
        # insertion order, so the output order matches the input order.
        first_seen.setdefault(digest, text)
    return {"text": list(first_seen.values())}


@lru_cache(maxsize=None)
def _get_language_detector():
    """Build the English/French lingua detector once per process and reuse it."""
    return LanguageDetectorBuilder.from_languages(Language.ENGLISH, Language.FRENCH).build()


def filter_by_language(examples):
    """Keep only the texts that lingua classifies as English.

    Args:
        examples: batch dict with a "text" list of strings.

    Returns:
        A dict with a "text" list containing the entries whose detected
        language equals ``Language.ENGLISH``; everything else is dropped.
    """
    # Reuse one detector instead of rebuilding it for every batch.
    detector = _get_language_detector()
    return {
        "text": [
            sentence for sentence in examples["text"] if detector.detect_language_of(sentence) == Language.ENGLISH
        ]
    }

In [4]:
# Basic text cleaning
def clean_text(examples):
    """Normalise whitespace and strip URLs and special characters from a batch.

    Case is left unchanged. Only ASCII letters, digits, spaces and the
    punctuation ``,.!?;:'"`` survive.

    Args:
        examples: batch dict with a "text" list of strings.

    Returns:
        A dict with a "text" list of cleaned strings, one per input string.
    """
    cleaned_text = []
    for sentence in examples["text"]:
        # Collapse every whitespace run (tabs, newlines, ...) into one space
        sentence = re.sub(r"\s+", " ", sentence)
        # Remove URLs
        sentence = re.sub(r"http\S+", "", sentence)
        # Remove special characters
        sentence = re.sub(r"[^a-zA-Z0-9,.!?;:\'\" ]+", "", sentence)
        # The two removals above can leave adjacent spaces behind
        # (e.g. "a @ b" -> "a  b"), so collapse them once more.
        sentence = re.sub(r" {2,}", " ", sentence)
        cleaned_text.append(sentence.strip())
    return {"text": cleaned_text}

def clean(dataset):
    """Run the cleaning steps over ``dataset`` and return the result.

    The steps are applied in this order, each as a batched ``map`` using
    ``num_processes`` workers: exact-duplicate removal, language filtering,
    text cleaning.
    """
    pipeline = (remove_exact_duplicates, filter_by_language, clean_text)
    for step in pipeline:
        dataset = dataset.map(step, batched=True, num_proc=num_processes)
    return dataset


In [5]:
# NOTE(review): the clean() pipeline is not applied here — despite its name,
# `dataset_cleaned` is just another reference to the raw `dataset`.
# If cleaning was meant to run, this would be `clean(dataset)`; confirm intent.
dataset_cleaned = dataset

In [6]:
# Display the split summary of the dataset used for the following steps.
dataset_cleaned

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [7]:
# Shared epitran converter configured with the "eng-Latn" language/script code.
epi = epitran.Epitran("eng-Latn")

@lru_cache(maxsize=None)
def xsampa_list(word: str) -> list:
    """Return ``epi.xsampa_list(word)``, memoised per distinct ``word``.

    The cache is unbounded (``maxsize=None``), so each distinct word is
    converted only once per process. The same list object is handed back on
    every cache hit, so callers should treat it as read-only.
    """
    return epi.xsampa_list(word)

def translate_sentence(sentence: str) -> str:
    """Convert a sentence into space-separated phonetic symbols.

    The sentence is split on whitespace; each word becomes its
    ``xsampa_list`` symbols followed by a ``[WORD]`` boundary marker.

    A word that yields no symbols contributes just ``[WORD]``. Symbols and
    marker are joined in one pass so that such words do not introduce a
    leading or doubled space into the result.

    Args:
        sentence: raw text to translate.

    Returns:
        The translated string; empty when ``sentence`` contains no words.
    """
    return ' '.join(' '.join([*xsampa_list(word), '[WORD]']) for word in sentence.split())

def translate_function(examples):
    """Batched ``map`` callback: translate every entry of ``examples["text"]``.

    Returns a dict with a "text" list of the same length as the input, each
    item being ``translate_sentence`` applied to the corresponding text.
    """
    translated = list(map(translate_sentence, examples["text"]))
    return {"text": translated}


In [8]:
# Translate every split to its phonetic form in parallel batches, then show the result.
# NOTE(review): the lru_cache behind xsampa_list lives in process memory, so each
# worker presumably fills its own cache — confirm if cross-worker reuse matters.
dataset_translated = dataset_cleaned.map(translate_function, batched=True, num_proc=num_processes)
dataset_translated

Map (num_proc=15):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [9]:
# Peek at the first translated training row (the recorded output is an empty string).
dataset_translated["train"][0]

{'text': ''}

In [10]:
def get_training_corpus(dataset, batch_size=1000):
    """Yield the texts of all splits in batches, for tokenizer training.

    The "train", "test" and "validation" splits are concatenated in that
    order and walked through in consecutive slices.

    Args:
        dataset: mapping with "train", "test" and "validation" datasets, each
            exposing a "text" column.
        batch_size: number of rows per yielded batch (default 1000).

    Yields:
        Lists of strings, each at most ``batch_size`` long.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    combined = concatenate_datasets([dataset["train"], dataset["test"], dataset["validation"]])
    for start in range(0, len(combined), batch_size):
        yield combined[start : start + batch_size]["text"]

In [19]:
def train_tokenizer(dataset, vocab_size=1000):
    """Train a word-level tokenizer on ``dataset`` and wrap it for transformers.

    Args:
        dataset: mapping of splits passed on to ``get_training_corpus``.
        vocab_size: maximum vocabulary size given to the trainer (default 1000).

    Returns:
        A ``PreTrainedTokenizerFast`` with pad/mask/cls/sep/unk tokens registered.
    """
    # Single source of truth for the special tokens.
    special_tokens = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[WORD]", "[UNK]"]

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    tokenizer.train_from_iterator(get_training_corpus(dataset), trainer)

    # Attach the post-processor after training and look the ids up in the
    # trained vocabulary, so the template never goes out of sync with the
    # order of ``special_tokens``.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
    fast_tokenizer.add_special_tokens(
        {
            "pad_token": "[PAD]",
            "mask_token": "[MASK]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "unk_token": "[UNK]",
        }
    )
    return fast_tokenizer

In [20]:
# Train the word-level tokenizer on the phonetic (translated) corpus.
tokenizer = train_tokenizer(dataset_translated)



In [23]:
# Inspect the learned vocabulary: its size, then the full token -> id mapping.
# NOTE(review): printing the whole vocab dict produces a very long output line;
# consider showing only a sample.
print(tokenizer.vocab_size)
print(tokenizer.vocab)

159
{'k': 19, 'Y': 94, 'S': 41, '4': 66, '@': 9, 'i_X': 99, 'm': 24, 't_': 126, 'G_w': 130, 'z_G': 127, 'y': 59, 'g_w': 119, '_h': 113, 'n_w': 120, 'Q': 65, 'f_w': 150, 'l_': 153, 's_h': 138, 'p_G': 155, 'I': 13, 'i_k': 152, 'M': 85, '[MASK]': 3, 'n_G': 101, 'k_w': 125, 'a_': 52, 's': 14, '>': 93, 'tS': 44, 'z': 23, '{:': 114, 'n_': 154, 'O': 36, 'i_G': 77, 't_w': 140, 'G': 74, 'r': 10, 'l': 17, '@:': 69, '\\:': 132, '|\\': 115, '}': 111, 'S_w': 146, '[WORD]': 4, 'D': 27, 's_G': 80, 'u_t': 110, 'R_w': 145, 'g_w_h': 151, '9': 64, 'b_': 107, '?\\': 72, '\\': 15, '5': 83, 'k_h': 137, 'n': 11, 'V': 31, '!\\': 128, '~:': 92, 'K_': 144, 'W': 109, 'p': 28, 's_w': 139, 'A': 29, 'N': 40, 'f': 33, 'B_G': 143, 'e_X': 102, 'g_': 136, 'h': 39, 'v': 30, '_T': 95, 'B': 58, '[SEP]': 2, 'j': 18, '_w': 133, "'": 63, 'i_': 60, '4_G': 73, '[UNK]': 5, 'd': 16, 'e_': 55, '&': 142, '?\\:': 122, 'P': 84, '_L': 118, ']': 8, '_': 97, 'u_k': 157, '`': 68, '@\\': 123, 'b': 32, 'g': 42, 'T_': 147, 't_h': 156, "'=\

In [24]:
# Persist the tokenizer files to disk.
# NOTE(review): absolute, user-specific path — consider a configurable base directory.
tokenizer.save_pretrained("/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel")

('/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/tokenizer_config.json',
 '/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/special_tokens_map.json',
 '/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/tokenizer.json')

In [35]:
def chunked_text(examples, chunk_size=200):
    """Split every text of a batch into chunks of at most ``chunk_size`` tokens.

    Tokens are the whitespace-separated pieces of each text. A text without
    any token produces no chunk, so the output may hold more or fewer rows
    than the input.

    Args:
        examples: batch dict with a "text" list of strings.
        chunk_size: maximum number of tokens per chunk (default 200).

    Returns:
        A dict with a "text" list of chunks, each re-joined with single spaces.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    chunks = []
    for sentence in examples["text"]:
        words = sentence.split()
        chunks.extend(
            " ".join(words[start : start + chunk_size])
            for start in range(0, len(words), chunk_size)
        )
    return {"text": chunks}

In [36]:
# Split every translated text into chunks, flatten the indices, then drop rows
# whose text is empty.
# NOTE(review): chunked_text emits nothing for a text with no words, so the final
# filter looks like a safety net — the recorded row counts are the same before
# and after it.
dataset_chunked = (
    dataset_translated.map(chunked_text, batched=True, num_proc=num_processes)
    .flatten_indices()
    .filter(lambda x: len(x["text"]) > 0)
)

dataset_chunked

Map (num_proc=15):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/3760 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/7395 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/3059184 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/6440 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7395 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3059184 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6440 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 7395
    })
    train: Dataset({
        features: ['text'],
        num_rows: 3059184
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 6440
    })
})

In [37]:
# Persist the chunked dataset to disk.
# NOTE(review): absolute, user-specific path — consider a configurable base directory.
dataset_chunked.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/phonetic_WordLevel_wikitext")

Saving the dataset (0/1 shards):   0%|          | 0/7395 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/3059184 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6440 [00:00<?, ? examples/s]

In [25]:
# Size of the tokenizer as reported by len(); the recorded value matches vocab_size.
len(tokenizer)

159

In [29]:
# Sanity check: translate a sample sentence, print the phonetic form, then
# encode it with the trained tokenizer.
s = translate_sentence("Hello, how are you?")
print(s)
tokenizer.encode(s)

h @ l o w [WORD] h a w [WORD] A r\ [WORD] j u [WORD]


[1, 39, 9, 17, 37, 20, 4, 39, 34, 20, 4, 29, 10, 15, 4, 18, 38, 4, 2]

In [34]:
# Show the first five chunked training rows.
# NOTE(review): this cell's execution count (34) is lower than that of the cell
# that builds `dataset_chunked` (36), so this output came from an earlier kernel
# state — re-run the notebook top to bottom to confirm it.
print(dataset_chunked["train"][:5])

{'text': ['[WORD] v { l k I r\\ i @ [WORD] k r\\ A n @ k @ l z [WORD] a j I i [WORD] [WORD]', 's E n dZ o [WORD] n o w [WORD] v { l k I r\\ i @ [WORD] [WORD] [WORD] V n r\\ I k O r\\ d I d [WORD] k r\\ A n @ k @ l z [WORD] [WORD] dZ { p @ n i z [WORD] [WORD] [WORD] [WORD] l I t [WORD] [WORD] v { l k I r\\ i @ [WORD] V v [WORD] D @ [WORD] b { t @ l f i l d [WORD] [WORD] [WORD] [WORD] k A m @ n l i [WORD] r\\ @ f r\\= d [WORD] t @ [WORD] { z [WORD] v { l k I r\\ i @ [WORD] k r\\ A n @ k @ l z [WORD] a j I i [WORD] a w t s a j d [WORD] dZ @ p { n [WORD] [WORD] I z [WORD] @ [WORD] t { k t I k @ l [WORD] r\\ o w l [WORD] [WORD] p l e j I N [WORD] v I d i o w [WORD] g e j m [WORD] d I v E l @ p t [WORD] b a j [WORD] s i g @ [WORD] { n d [WORD] m i d i @ v I Z @ n [WORD] f O r\\ [WORD] D @ [WORD] p l e j s t e j S @ n [WORD] p O r\\ t @ b @ l [WORD] [WORD] r\\ i l i s t [WORD] I n', '[WORD] dZ { n j u E r\\ i [WORD] [WORD] I n [WORD] dZ @ p { n [WORD] [WORD] I t [WORD] I z [WORD] D @ [WORD] T