In [1]:
import re
import epitran
from datasets import load_dataset
from transformers import (
    PreTrainedTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizerFast,
    DataCollatorWithPadding,
)
from tokenizers import Tokenizer
from tokenizers.models import WordLevel, WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer, WordPieceTrainer
from transformers import DataCollatorWithPadding

In [20]:
epi = epitran.Epitran('eng-Latn')

In [4]:
translated = epi.transliterate('hello')
print(translated)

h…ôlow


In [5]:
s = "hello to you! and you"
words = re.findall( r'\w+|[^\s\w]+', s)
print(words)
res = " ".join(map(lambda x: "".join(epi.xsampa_list(x)), words))
print(res)

['hello', 'to', 'you', '!', 'and', 'you']
h@low t@ ju  {nd ju


In [8]:
def xsampa(sentence):
    words = re.findall( r'\w+|[^\s\w]+', sentence)
    return " ".join(map(lambda x: "".join(epi.xsampa_list(x)), words))

def translate_to_phonetic(dataset_dict):
    sentence1 = xsampa(dataset_dict['sentence1'])
    sentence2 = xsampa(dataset_dict['sentence2'])
    return {'sentence1': sentence1, 'sentence2': sentence2}

In [14]:
wiki = load_dataset('wikitext', 'wikitext-2-v1')
wiki

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [15]:
phonetic_wiki = wiki.map(lambda x: {"text": xsampa(x["text"])})
phonetic_wiki

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [16]:
# save phonetic_wiki to a file
with open('phonetic_wiki.txt', 'w') as f:
    for split in ['train', 'validation', 'test']:
        split_dataset = phonetic_wiki[split]
        for example in split_dataset:
            f.write(f"{example['text']}\n")

In [17]:
!wc -l phonetic_wiki.txt

44836 phonetic_wiki.txt


In [32]:
task = 'mrpc'
dataset = load_dataset('glue', task)
dataset

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [33]:
phonetic_dataset = dataset.map(translate_to_phonetic)
phonetic_dataset

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [34]:
print(dataset['train'][0])
print(phonetic_dataset['train'][0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': '{mr\\owzi @kjuzd hIz br\\VDr\\=  hum hi kOld  D@ wItn@s   Vv dIlIbr\\=@tli dIstOr\\tIN hIz Ev@d@ns ', 'sentence2': 'r\\Ifr\\=IN t@ hIm {z ownli  D@ wItn@s   {mr\\owzi @kjuzd hIz br\\VDr\\= Vv dIlIbr\\=@tli dIstOr\\tIN hIz Ev@d@ns ', 'label': 1, 'idx': 0}


In [None]:
# save phonetic_dataset to a file
# with open('phonetic_dataset.txt', 'w') as f:
#     for split in ['train', 'validation', 'test']:
#         split_dataset = phonetic_dataset[split]
#         for example in split_dataset:
#             f.write(f"{example['sentence1']}\n{example['sentence2']}\n")


In [None]:
#!wc -l phonetic_dataset.txt

11602 phonetic_dataset.txt


In [2]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
print(r"\ "[0])
x_sampa_tokens = [
    "a", "b", "b_<", "c", "d", "d`", "d_<", "e", "f", "g", "g_<", "h", "h\\",
    "i", "j", "j\\", "k", "l", "l`", "m", "n", "n`", "o", "p", "p\\", "q", "r",
    "r\\", "r\\`", "s", "s`", "t", "t`", "u", "v", "v\\", "w", "x", "x\\", "y",
    "z", "z`", "z\\", "A", "B", "B\\", "C", "D", "E", "F", "G", "G\\", "G\\_<",
    "H", "H\\", "I", "I\\", "J", "J\\", "J\\_<", "K", "K\\", "L", "L\\", "M",
    "N", "N\\", "O", "O\\", "P", "Q", "R", "R\\", "S", "T", "U", "U\\", "V",
    "W", "X", "X\\", "Y", "Z", ".", '"', "%", "'", ":", ":\\", "-", "@", "@\\",
    "@`", "{", "}", "1", "2", "3", "3\\", "4", "5", "6", "7", "8", "9", "&",
    "?", "?\\", "*", "/", "<", "<\\", ">", ">\\", "^", "!", "|", "|\\", "||",
    "|\\|\\", "=\\", "-\\", '_"', "_+", "_-", "_/", "_0", "_<", "=", "_>", "_?\\",
    "_\\", "_^", "_}", "`", "~", "_~", "_A", "_B", "_a", "_B_L", "_c", "_d", "_e",
    "<F>", "_F", "_G", "_H", "_H_T", "_h", "_j", "')", "_k", "_L", "_l", "_M",
    "_m", "_N", "_n", "_O", "_o", "_q", "<R>", "<_R_F>", "_r", "_T", "_t", "_v",
    "_w", "_X", "_x", "ts", "dz", "tS", "dZ", "ts\\", "dz\\", "tK", "kp", "gb",
    "Nm", "b_<", "d_<", "J\_<", "g_<", "G\_<", "_>", "p_>", "t_>", "k_>", "s_v"
]
print(len(x_sampa_tokens))
print(x_sampa_tokens)
x_sampa_tokens = list(set(x_sampa_tokens))
print(len(x_sampa_tokens))
print(x_sampa_tokens)
print("j\\")

\
191
['a', 'b', 'b_<', 'c', 'd', 'd`', 'd_<', 'e', 'f', 'g', 'g_<', 'h', 'h\\', 'i', 'j', 'j\\', 'k', 'l', 'l`', 'm', 'n', 'n`', 'o', 'p', 'p\\', 'q', 'r', 'r\\', 'r\\`', 's', 's`', 't', 't`', 'u', 'v', 'v\\', 'w', 'x', 'x\\', 'y', 'z', 'z`', 'z\\', 'A', 'B', 'B\\', 'C', 'D', 'E', 'F', 'G', 'G\\', 'G\\_<', 'H', 'H\\', 'I', 'I\\', 'J', 'J\\', 'J\\_<', 'K', 'K\\', 'L', 'L\\', 'M', 'N', 'N\\', 'O', 'O\\', 'P', 'Q', 'R', 'R\\', 'S', 'T', 'U', 'U\\', 'V', 'W', 'X', 'X\\', 'Y', 'Z', '.', '"', '%', "'", ':', ':\\', '-', '@', '@\\', '@`', '{', '}', '1', '2', '3', '3\\', '4', '5', '6', '7', '8', '9', '&', '?', '?\\', '*', '/', '<', '<\\', '>', '>\\', '^', '!', '|', '|\\', '||', '|\\|\\', '=\\', '-\\', '_"', '_+', '_-', '_/', '_0', '_<', '=', '_>', '_?\\', '_\\', '_^', '_}', '`', '~', '_~', '_A', '_B', '_a', '_B_L', '_c', '_d', '_e', '<F>', '_F', '_G', '_H', '_H_T', '_h', '_j', "')", '_k', '_L', '_l', '_M', '_m', '_N', '_n', '_O', '_o', '_q', '<R>', '<_R_F>', '_r', '_T', '_t', '_v', '_w', '_X',

In [21]:
phonetic_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
phonetic_tokenizer.pre_tokenizer = Whitespace()
phonetic_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)
phonetic_tokenizer.add_tokens(x_sampa_tokens)
trainer = WordPieceTrainer(vocab_size=36000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
files = ["phonetic_wiki.txt"]
phonetic_tokenizer.train(files, trainer)


In [22]:
print(phonetic_tokenizer.get_vocab())
print(phonetic_tokenizer.get_vocab_size())

{'gnejt': 11526, 'nzlowkejS': 11719, 'Nst': 11934, 'Itun': 6241, '##owvic': 14127, 'wAkAbAjASi': 13887, 'lajtr': 5909, 'inejs': 16328, 'IntImIdejtIN': 12763, 'lowvz': 16083, 'IkspEld': 7381, 'lowni': 2445, 'blISIz': 8500, 'owfawnd': 5949, 'mtv': 3212, 'InfEstejS': 15183, 'ImwAndZIn': 17511, 'VlEvaEl': 12608, 'tSaj': 10746, '##tw': 398, '!': 100, 'bETlIhEm': 11206, 'lEtIN': 7782, 'gk': 15414, 'InvItejS': 5039, 'IgAr': 6396, 'bEnzin': 15206, 'mdz': 11994, 'tejnm': 2355, 'owbAt': 5678, 'ImpVlsIv': 12432, 'hejli': 14740, '##tr': 105, 'kEn': 2271, 'fEstejS': 7425, 'sIlvEstr': 8841, 'tejstIN': 15069, 'bAbkejdZIn': 11714, 'iSVf': 17193, 'kAntEnt': 2812, 'mAntownr': 16805, 'IndVldZd': 17268, '##awm': 4519, 'dEntS': 2658, 'hajbr': 5915, 'sImplIstIk': 13686, '_j': 131, 'pAsup': 10396, 'bejs': 725, 'majn': 993, 'vind': 12348, '##i': 59, '##EktejS': 5556, 'nEm': 8609, 'gu': 4607, '##fajr': 4020, 'ntIks': 10185, 'swAnsi': 7606, 'tEndInajt': 16662, 'dIfVNkt': 9793, 'aw': 183, 'dIzni': 6381, 'ndfOl':

In [23]:
# Now, you can tokenize phonetic sentences
sentence = 'hello people'
phonetic_sentence = xsampa(sentence)
print(phonetic_sentence)
encoded = phonetic_tokenizer.encode(phonetic_sentence)
print(encoded)
print(encoded.tokens)
print(encoded.ids)
print(max(phonetic_tokenizer.get_vocab().values()))

h@low pip@l
Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', 'h', '@', 'l', 'o', 'w', 'p', 'i', 'p', '@', 'l', '[SEP]']
[1, 159, 62, 60, 43, 82, 61, 164, 61, 62, 60, 2]
17604


In [25]:
# Create FastTokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_object=phonetic_tokenizer)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [26]:
# save tokenizer
!rm -rf tokenizer
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

In [38]:
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, max_length=512)

tokenized_dataset = phonetic_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [39]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model.resize_token_embeddings(len(tokenizer))

Embedding(17737, 768, padding_idx=0)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5, # default 5e-5
    num_train_epochs=3,
    weight_decay=3e-5,
    logging_dir='./logs',
    logging_steps=10,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_dataset['test'])
preds, label_ids, metrics = predictions
preds = preds.argmax(-1)
print(preds)

[1 1 1 ... 1 1 1]


In [None]:
# evaluate
import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.664927536231884, 'f1': 0.7987465181058496}

In [None]:
# load tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained('tokenizer')

In [None]:
# load model
model = BertForSequenceClassification.from_pretrained('./results/checkpoint-345')

In [27]:
!zip -r models.zip ./phonetic_wiki.txt ./tokenizer

  adding: phonetic_wiki.txt (deflated 67%)
  adding: tokenizer/ (stored 0%)
  adding: tokenizer/tokenizer_config.json (deflated 96%)
  adding: tokenizer/special_tokens_map.json (deflated 36%)
  adding: tokenizer/tokenizer.json (deflated 73%)


In [28]:
!du -h models.zip

4.0M	models.zip
