Recommended instance spec for data cleaning:
- ml.m7i.8xlarge

In [None]:
%%capture
! pip install tokenizers transformers ipywidgets pandas datasets wandb huggingface_hub tqdm

In [None]:
! pip install accelerate -U
# ! pip install transformers[torch]

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login from inside the notebook.
# NOTE(review): presumably only needed for gated/private Hub resources or for
# pushing artifacts — confirm whether the cells below actually require it.
notebook_login()

In [None]:
import pandas as pd
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch
from torch.utils.data.dataset import Dataset

import os
import math

from huggingface_hub import HfFolder, notebook_login

In [None]:
%%time 
paths = [str(x) for x in Path("/home/ec2-user/SageMaker/monolingual").glob("A_filtered_deduped/*.csv")]

In [None]:
# Vocabulary-size notes:
# - Reference points: openpecha/Madlad-v1 has 256000; sangjeedondrub/tibetan-roberta-base has 52000.
# - When set to 52000, BPE generated 52000 tokens.
# - When set to 256000, BPE generated 86761 tokens.
# - 86761 seems optimal because the tokenizer training step stops at 86761 with
#   min_frequency=2 even though it is allowed to generate more.
# - However, tokenizer_G (86761 tokens) contains too many nonsensical tokens, while
#   tokenizer A (vocab size 52000) has fewer of those.

VOCAB_SIZE = 256000  # passed as vocab_size to tokenizer.train
MAX_LEN    = 512     # max sequence length: used for truncation and when loading the fast tokenizer

In [None]:
%%time
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=VOCAB_SIZE, min_frequency=2, # tried 1 gives more tokens, try 3 may give better tokens
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])

In [None]:
# Marker printed when execution reaches this cell (presumably to signal that the
# training cell above has finished).
print('done')

In [None]:
# Save the trained tokenizer files to disk.
tokenizer_folder = 'tokenizer_A_f_d'
# os.makedirs(..., exist_ok=True) is idempotent: re-running this cell does not
# fail with "File exists" the way a shell `mkdir` does, and it needs no shell.
os.makedirs(tokenizer_folder, exist_ok=True)
tokenizer.save_model(tokenizer_folder)

In [None]:
# Rebuild the byte-level BPE tokenizer from the vocab/merges files in `tokenizer_folder`.
tokenizer_folder = 'tokenizer_A_f_d'

vocab_path = os.path.abspath(os.path.join(tokenizer_folder, 'vocab.json'))
merges_path = os.path.abspath(os.path.join(tokenizer_folder, 'merges.txt'))
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [None]:
# Attach a post-processor built from the </s> and <s> tokens, then cap encoded
# sequences at MAX_LEN tokens.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)
tokenizer.enable_truncation(max_length=MAX_LEN)

In [None]:
from transformers import RobertaTokenizerFast

# Load the saved tokenizer files as a Hugging Face fast tokenizer.
#
# add_prefix_space=True:
#   Set this when you want the tokenizer to work with syllables obtained via
#   text.split('་'). Useful for NER / POS / word chunking.
#   Pass is_split_into_words=True when calling the tokenizer to use this.
#
# `model_max_length` is the documented keyword for the maximum sequence length;
# `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, model_max_length=MAX_LEN)

In [None]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

In [None]:
# Load the MADLAD-400 tokenizer from the Hub; later cells run it on the same
# sample text as the tokenizer trained in this notebook.
from transformers import AutoTokenizer

madlad_checkpoint = "google/madlad400-3b-mt"
md_tokenizer = AutoTokenizer.from_pretrained(madlad_checkpoint)

In [None]:
# Display the MADLAD tokenizer's repr as loaded, before special tokens are added.
md_tokenizer

In [None]:
# Register <s>, </s>, <pad> and <mask> as the MADLAD tokenizer's special tokens
# (bos/cls share "<s>", eos/sep share "</s>").
special_tokens_dict = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "sep_token": "</s>",
    "pad_token": "<pad>",
    "cls_token": "<s>",
    "mask_token": "<mask>",
}
md_tokenizer.add_special_tokens(special_tokens_dict)
# Use the shared MAX_LEN constant rather than a second hard-coded 512, so both
# tokenizers stay in sync if the limit changes.
md_tokenizer.model_max_length = MAX_LEN

In [None]:
# Display the MADLAD tokenizer's repr again, after the special tokens were added.
md_tokenizer

In [None]:
# Decode two raw token IDs with md_tokenizer to see which tokens they map to.
# NOTE(review): despite the singular name, this is a list of two IDs, not a
# single beginning-of-sentence ID. 256000 is presumably the first ID past the
# original vocabulary (i.e. a newly added special token) — confirm.
token_id = [118443,256000]  # token IDs to inspect
tok = md_tokenizer.decode(token_id, skip_special_tokens=False)
# The "_" / "-" delimiters make leading or trailing whitespace in the decoded text visible.
print(f"_{tok}-")

In [None]:
# Sample Tibetan text used to inspect the tokenizers' output.
text = 'ཁ་ཤས་སོ་སོས་དོ་སྣང་བྱས་ཏེ་ཉམས་ལེན་བྱེད་མཁན་དེ་འདྲ་ཡོང་གི་འདུག་གང་ལྟར་སྔོན་མ་ཡིན་ན་ཆོས་ཁྲིམས་པས་སྐོར་བ་བརྒྱབ་བྱས། མ་སྡོད་ཉལ་ནས་སྡོད་ན་ཆད་པ་ཡིན།'

# Tokenize once and reuse the IDs for all three outputs.
roberta_ids = tokenizer(text)['input_ids']
print(len(text), len(roberta_ids))  # number of characters vs. number of tokens

print(roberta_ids)
# Decode each ID separately and join with '_' so token boundaries are visible.
'_'.join([tokenizer.decode(i) for i in roberta_ids])

In [None]:
# Tokenize the sample text once with the MADLAD tokenizer and reuse the IDs.
madlad_ids = md_tokenizer(text)['input_ids']
print(len(text), len(madlad_ids))  # number of characters vs. number of tokens

print(madlad_ids)
# Decode each ID separately and join with '_' so token boundaries are visible.
'_'.join([md_tokenizer.decode(i) for i in madlad_ids])

In [None]:
# Tokenize once; the length, round-trip decode and per-token view all use these IDs.
madlad_ids = md_tokenizer(text)['input_ids']
# A notebook cell only displays its last expression, so the intermediate
# results are printed explicitly instead of being silently discarded.
print(len(text), len(madlad_ids))
print(md_tokenizer.decode(madlad_ids))
# Same text padded to a fixed length of 100 with truncation disabled.
print(md_tokenizer(text, padding='max_length', truncation=False, max_length=100)['input_ids'])
# Per-token decode with special tokens kept, joined with '_' to show boundaries.
'_'.join([md_tokenizer.decode(i, skip_special_tokens=False) for i in madlad_ids])

In [None]:
import json

# Load the saved vocabulary JSON; its values are passed to tokenizer.decode as
# token IDs further down. The context manager closes the file handle, and the
# encoding is stated explicitly so the read does not depend on the locale default.
with open('/home/ec2-user/SageMaker/RoBERTa/tokenizer_A_f_d/vocab.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Print the decoded form of every BPE token in the vocabulary, one per line.
# NOTE(review): sep=' ' has no effect when print() receives a single argument;
# if space-separated output on one line was intended, end=' ' is needed — confirm.
for vocab_id in data.values():
    print(tokenizer.decode(vocab_id), sep=' ')