In [1]:
import os
from datasets import load_dataset, concatenate_datasets
from tokenizers.models import WordLevel, BPE
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
import re
import os
import hashlib
import time
import multiprocessing
import argparse
from datasets import load_from_disk
from lingua import Language, LanguageDetectorBuilder
import epitran
from functools import lru_cache
from transformers import PreTrainedTokenizerFast

In [2]:
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [3]:
num_processes = multiprocessing.cpu_count() - 1
# Exact duplication removal (on individual sentences/paragraphs)
def remove_exact_duplicates(examples):
    seen = set()
    deduped_examples = []
    for sentence in examples["text"]:
        hash_val = hashlib.md5(sentence.encode()).hexdigest()
        if hash_val not in seen:
            seen.add(hash_val)
            deduped_examples.append(sentence)
    return {"text": deduped_examples}


def filter_by_language(examples):
    detector = LanguageDetectorBuilder.from_languages(Language.ENGLISH, Language.FRENCH).build()
    return {
        "text": [
            sentence for sentence in examples["text"] if detector.detect_language_of(sentence) == Language.ENGLISH
        ]
    }

In [4]:
# Basic text cleaning
def clean_text(examples):
    cleaned_text = []
    for sentence in examples["text"]:
        # Lowercase
        #sentence = sentence.lower()
        # Remove extra spaces
        sentence = re.sub(r"\s+", " ", sentence)
        # Remove URLs
        sentence = re.sub(r"http\S+", "", sentence)
        # Remove special characters
        sentence = re.sub(r"[^a-zA-Z0-9,.!?;:\'\" ]+", "", sentence)
        cleaned_text.append(sentence.strip())
    return {"text": cleaned_text}

def clean(dataset):
    dataset = dataset.map(remove_exact_duplicates, batched=True, num_proc=num_processes)
    dataset = dataset.map(filter_by_language, batched=True, num_proc=num_processes)
    dataset = dataset.map(clean_text, batched=True, num_proc=num_processes)
    return dataset


In [5]:
dataset_cleaned = dataset

In [6]:
dataset_cleaned

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [7]:
epi = epitran.Epitran("eng-Latn")

@lru_cache(maxsize=None)
def xsampa_list(word: str) -> list:
    return epi.xsampa_list(word)

def translate_sentence(sentence: str) -> str:
    return ' '.join(' '.join(xsampa_list(word)) for word in sentence.split())

def translate_function(examples):
    return {"text": [translate_sentence(sentence) for sentence in examples["text"]]}


In [8]:
dataset_translated = dataset_cleaned.map(translate_function, batched=True, num_proc=num_processes)
dataset_translated

Map (num_proc=15):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [11]:
dataset_translated["train"][:5]

{'text': ['',
  ' v { l k I r\\ i @ k r\\ A n @ k @ l z a j I i ',
  '',
  's E n dZ o n o w v { l k I r\\ i @   V n r\\ I k O r\\ d I d k r\\ A n @ k @ l z  dZ { p @ n i z    l I t  v { l k I r\\ i @ V v D @ b { t @ l f i l d    k A m @ n l i r\\ @ f r\\= d t @ { z v { l k I r\\ i @ k r\\ A n @ k @ l z a j I i a w t s a j d dZ @ p { n  I z @ t { k t I k @ l r\\ o w l  p l e j I N v I d i o w g e j m d I v E l @ p t b a j s i g @ { n d m i d i @ v I Z @ n f O r\\ D @ p l e j s t e j S @ n p O r\\ t @ b @ l  r\\ i l i s t I n dZ { n j u E r\\ i  I n dZ @ p { n  I t I z D @ T r\\= d g e j m I n D @ v { l k I r\\ i @ s I r\\ i z  E m p l o j I N D @ s e j m f j u Z @ n V v t { k t I k @ l { n d r\\ i l  t a j m g A m p l e j { z I t s p r\\ E d @ s E s r\\= z  D @ s t O r\\ i r\\ V n z p E r\\ @ l E l t @ D @ f r\\= s t g e j m { n d f A l o w z D @  n e j m l @ s   @ p i n @ l m I l @ t E r\\ i j u n @ t s r\\= v I N D @ n e j S @ n V v g { l i @ d U r\\ I N D @ s E k @ n d j U r\\ o w p

In [12]:
def get_training_corpus(dataset):
    dataset = concatenate_datasets([dataset["train"], dataset["test"], dataset["validation"]])
    for i in range(0, len(dataset), 1000):
        samples = dataset[i : i + 1000]
        yield samples["text"]

In [13]:
def train_tokenizer(dataset):
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordLevelTrainer(vocab_size=1000 ,special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
    tokenizer.train_from_iterator(get_training_corpus(dataset), trainer)
    tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
    tokenizer.add_special_tokens(
        {
            "pad_token": "[PAD]",
            "mask_token": "[MASK]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "unk_token": "[UNK]",
        }
    )
    return tokenizer

In [14]:
tokenizer = train_tokenizer(dataset_translated)



In [15]:
print(tokenizer.vocab_size)
print(tokenizer.vocab)

155
{'e_': 51, 'g_w_h': 147, 'T_': 143, 'a_X': 50, 'i_G': 73, 'i_X': 95, 'f': 29, 'S_w': 142, 'u_t': 106, 'a_': 48, 'B': 54, 'p_G': 151, 'tS': 40, '`:': 145, 'd_': 104, 'N': 36, 'e_X': 98, 'n': 7, '|\\': 111, 'k_h': 133, '7': 108, '&': 138, '1': 71, 'h': 35, 'm': 20, 't': 8, '9': 60, 'X': 57, '!\\': 124, 'k_w': 121, 'u_X': 74, 'l_G': 96, 't_h': 152, '{:': 110, 'O': 32, 's_w': 135, 'l_': 149, 'D': 23, 'j': 14, 'A': 25, 's_G': 76, 'g_': 132, "'": 59, 'o': 33, 'f_G': 100, 'u': 34, 'z_G': 123, 's_h': 134, '@:': 65, '|\\|\\': 117, 's_': 102, 'n_G': 97, 'u_k': 153, '[UNK]': 4, ':': 53, 'i_k': 148, '_h': 109, 'E': 22, '2': 49, 'V': 27, '8': 92, 'S': 37, 'G': 70, 'R_w': 141, '\\`': 99, 'g': 38, 'm_G': 85, 'v_w': 154, 'z': 19, "`'": 130, 'H': 112, 'f_w': 146, 'K': 75, 'P': 80, '@\\': 119, '[SEP]': 2, 'J': 67, '?': 66, '_H': 113, "'=\\": 86, 'k': 15, 'v': 26, '{': 17, '~:': 88, '}': 107, 'y': 55, 'i': 18, '_T': 91, 'o_X': 72, '>': 89, '_': 93, 'b_h': 131, 'n_w': 116, 'R': 58, 'g_w': 115, 'l': 13

In [16]:
tokenizer.save_pretrained("/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel")

('/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/tokenizer_config.json',
 '/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/special_tokens_map.json',
 '/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel/tokenizer.json')

In [17]:
def chunked_text(examples):
    chunked_text = []
    for sentence in examples["text"]:
        words = sentence.split()
        chunks = [words[i : i + 200] for i in range(0, len(words), 200)]
        chunked_text.extend([" ".join(chunk) for chunk in chunks])
    return {"text": chunked_text}

In [18]:
dataset_chunked = (
    dataset_translated.map(chunked_text, batched=True, num_proc=num_processes)
    .flatten_indices()
    .filter(lambda x: len(x["text"]) > 0)
)

dataset_chunked

Map (num_proc=15):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/3760 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/6226 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2567227 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/5417 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6226 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2567227 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5417 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 6226
    })
    train: Dataset({
        features: ['text'],
        num_rows: 2567227
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5417
    })
})

In [20]:
dataset_chunked.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/phoneme_wikitext")

Saving the dataset (0/1 shards):   0%|          | 0/6226 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/2567227 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5417 [00:00<?, ? examples/s]

In [21]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("/home/toure215/BERT_phonetic/tokenizers/tokenizer_phonetic_WordLevel")

In [22]:
s = translate_sentence("Hello, how are you?")
print(s)
e = tokenizer.encode(s)
d = tokenizer.decode(e)
print(e)
print(d)

h @ l o w h a w A r\ j u
[1, 35, 5, 13, 33, 16, 35, 30, 16, 25, 6, 11, 14, 34, 2]
[CLS] h @ l o w h a w A r \ j u [SEP]


In [23]:
print(dataset_chunked["train"][:5])

{'text': ['v { l k I r\\ i @ k r\\ A n @ k @ l z a j I i', 's E n dZ o n o w v { l k I r\\ i @ V n r\\ I k O r\\ d I d k r\\ A n @ k @ l z dZ { p @ n i z l I t v { l k I r\\ i @ V v D @ b { t @ l f i l d k A m @ n l i r\\ @ f r\\= d t @ { z v { l k I r\\ i @ k r\\ A n @ k @ l z a j I i a w t s a j d dZ @ p { n I z @ t { k t I k @ l r\\ o w l p l e j I N v I d i o w g e j m d I v E l @ p t b a j s i g @ { n d m i d i @ v I Z @ n f O r\\ D @ p l e j s t e j S @ n p O r\\ t @ b @ l r\\ i', 'l i s t I n dZ { n j u E r\\ i I n dZ @ p { n I t I z D @ T r\\= d g e j m I n D @ v { l k I r\\ i @ s I r\\ i z E m p l o j I N D @ s e j m f j u Z @ n V v t { k t I k @ l { n d r\\ i l t a j m g A m p l e j { z I t s p r\\ E d @ s E s r\\= z D @ s t O r\\ i r\\ V n z p E r\\ @ l E l t @ D @ f r\\= s t g e j m { n d f A l o w z D @ n e j m l @ s @ p i n @ l m I l @ t E r\\ i j u n @ t s r\\= v I N D @ n e j S @ n V v g { l i @', 'd U r\\ I N D @ s E k @ n d j U r\\ o w p { n w O r\\ h u p r\\= f O r\\