### Refer to this [blog](https://medium.com/analytics-vidhya/create-a-tokenizer-and-train-a-huggingface-roberta-model-from-scratch-f3ed1138180c) and this [code](https://github.com/edumunozsala/RoBERTa_Encoder_Decoder_Product_Names/blob/03c0456f03d8cff62e2d1b04f03029130694e18b/RoBERTa%20MLM%20and%20Tokenizer%20train%20for%20Text%20generation.ipynb)


Recommended spec for training
- ml.g5.4xlarge
- ml.g5.2xlarge

In [None]:
%%capture
! pip install tokenizers transformers ipywidgets pandas datasets wandb huggingface_hub tqdm

In [None]:
! pip install accelerate -U
# ! pip install transformers[torch]

In [None]:
# aws s3 sync s3://monolingual.data/A/ /home/ec2-user/SageMaker/monolingual/A/ --no-sign-request

# aws s3 sync s3://openpecha.cleaned/tokenized_raw_text/ /home/ec2-user/SageMaker/monolingual/gold/ --no-sign-request

In [None]:
import os

# Point the Hugging Face cache environment variables at directories under
# /home/ec2-user/SageMaker/cache.
os.environ.update({
    'HF_HOME': '/home/ec2-user/SageMaker/cache',
    'HF_DATASETS_CACHE': '/home/ec2-user/SageMaker/cache/datasets',
})

In [None]:
!echo $HF_HOME
!echo $HF_DATASETS_CACHE

In [2]:
from huggingface_hub import notebook_login

# Show the Hugging Face login prompt inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import wandb

# Log in to Weights & Biases; the TrainingArguments further down use report_to=['wandb'].
wandb.login()

In [None]:
import pandas as pd
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch
from torch.utils.data.dataset import Dataset

import os
import math

from huggingface_hub import HfFolder, notebook_login

In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizerFast

# Load the dataset by name, using 24 processes.
dataset = load_dataset('spsither/tibetan_monolingual_S_cleaned_train_test', num_proc=24)

VOCAB_SIZE = 52000
MAX_LEN = 512
tokenizer_folder = "tokenizer_S_b"
# Load the tokenizer stored in tokenizer_folder, passing MAX_LEN as max_len.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)

def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the module-level ``tokenizer``.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``, and is asked to return the special-tokens mask and
    PyTorch tensors.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.
    """
    # Each keyword may be passed only once; repeating one is a SyntaxError.
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        return_tensors="pt",
        return_special_tokens_mask=True,
    )

# Tokenize dataset in batches; the "text" column is removed from the result.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Upload the tokenized dataset under the given repository name.
tokenized_dataset.push_to_hub('spsither/tibetan_monolingual_S_cleaned_train_test_tokenized')

In [None]:
# Vocabulary-size notes:
# - openpecha/Madlad-v1 has 256000 tokens; sangjeedondrub/tibetan-roberta-base has 52000.
# - When the size was set to 52000, BPE generated 52000 tokens.
# - When the size was set to 256000, BPE generated 86761 tokens.
# 86761 seems optimal because the tokenizer training step uses 86761 even though it has
# the option to generate more for min_frequency 2.
# However, tokenizer_G (using 86761) has too many nonsensical tokens, while tokenizer A
# (vocab size 52000) has fewer of those.

VOCAB_SIZE = 52000
MAX_LEN    = 512

In [None]:
tokenizer_folder = 'tokenizer_S_b'

# Absolute paths of the vocabulary and merges files inside the tokenizer folder.
vocab_file, merges_file = (
    os.path.abspath(os.path.join(tokenizer_folder, file_name))
    for file_name in ('vocab.json', 'merges.txt')
)

tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [None]:
# Attach a BERT-style post-processor. The first pair is the "</s>" token with its id,
# the second pair is the "<s>" token with its id; the order of the two pairs matters.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Truncate encoded sequences to at most MAX_LEN.
tokenizer.enable_truncation(max_length=MAX_LEN)

In [None]:
from transformers import RobertaConfig, RobertaForMaskedLM

# Architecture settings for our RoBERTa model.
# The inline notes list the "Medium | Large" alternatives for each value.
config = RobertaConfig(
    vocab_size=VOCAB_SIZE,
    hidden_size=768,         # 768 Medium | 1024 Large
    num_hidden_layers=6,     #   6 Medium | 24 Large
    num_attention_heads=12,  #  12 Medium | 16 Large
    max_position_embeddings=MAX_LEN + 2,
    type_vocab_size=1,
)

# Build the masked-LM model from the configuration only (no pretrained weights).
model = RobertaForMaskedLM(config=config)
param_count = model.num_parameters()
print('Num parameters: ', param_count)

In [None]:
from transformers import RobertaTokenizerFast

# add_prefix_space=True
# Set this when you want the tokenizer to work with syllables obtained via text.split('་'). Useful for NER/POS/word chunking.
# Pass is_split_into_words=True when calling the tokenizer to use this.
tokenizer_folder = 'tokenizer_S_b'
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)

In [None]:
tokenizer

In [None]:
from datasets import load_dataset
# Load the dataset with 48 processes; cache_dir is the same path that HF_DATASETS_CACHE is set to earlier in the notebook.
dataset = load_dataset('spsither/tibetan_monolingual_S_cleaned_train_test', cache_dir="/home/ec2-user/SageMaker/cache/datasets", num_proc=48)

In [None]:
dataset

In [None]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

Whatever value you see for vocab_size, consider using it for VOCAB_SIZE, i.e. the smallest value required.

In [None]:
import pandas as pd

class CustomDataset(Dataset):
    """Dataset that tokenizes one text row each time an item is requested.

    Args:
        dataset: Object whose ``'text'`` entry is loaded into a DataFrame.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: ``max_length`` passed to the tokenizer for every row.
    """

    def __init__(self, dataset, tokenizer, max_len=MAX_LEN):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = pd.DataFrame(dataset['text'])

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """Return the ``input_ids`` and ``attention_mask`` for row ``i``."""
        row_text = self.df.iloc[i, 0]
        encoded = self.tokenizer.encode_plus(
            row_text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        # Index 0 picks the single encoded sequence out of each returned tensor.
        return {
            'input_ids': encoded.input_ids[0],
            'attention_mask': encoded.attention_mask[0],
        }

In [None]:
eval_dataset = CustomDataset(dataset['test'], tokenizer)

In [None]:
train_dataset = CustomDataset(dataset['train'], tokenizer)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator used by the Trainer: masked-language-modeling mode with mlm_probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

In [None]:
# continue training
# from transformers import RobertaForMaskedLM
# model = RobertaForMaskedLM.from_pretrained('/media/monlamai/SSD/RoBERTa/checkpoint-480288')

In [None]:
from transformers import Trainer, TrainingArguments
# Batch size of 60 worked on ml.g5.4xlarge with gradient_checkpointing & fp16 True. group_by_length takes extra time.
# Running on ml.g5.12xlarge uses data parallelism on 4 GPUs,
# but the max batch size it takes is 8, so it is not worth the additional cost. Also not sure if it actually works.

training_args = TrainingArguments(
    output_dir = '/media/monlamai/SSD/RoBERTa',
    overwrite_output_dir = False,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 50,
    learning_rate = 1e-4,          # library default: 5e-5
    warmup_steps = 500,
    weight_decay = 0.01,
    per_device_train_batch_size = 60, # 59-61 $ 60-1gpu TTF # 30-1gpu FFF # 4-4gpus TTT 
    per_device_eval_batch_size  = 60, # can be larger than per_device_train_batch_size, no need for grad
    gradient_checkpointing = True,   # default False Saves a lot of mem
    fp16                   = True,   # default False Saves some mem
    group_by_length        = False,  # default False # takes time
    gradient_accumulation_steps = 1, # default 1
    logging_strategy = "steps",
    logging_steps = 100,
    save_total_limit = 40,
    report_to = ['wandb'],
)
# Create the trainer for our model.
# Train on the train split and evaluate on the test split; passing eval_dataset
# for both would train on the evaluation data and make the eval loss meaningless.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #prediction_loss_only=True,
)

In [None]:
print(f"parallel_mode: {training_args.parallel_mode} \nn_gpus: {training_args.n_gpu}")

In [None]:
trainer.train()
# trainer.train(resume_from_checkpoint=True)  # continue training

In [None]:
# Run evaluation, then report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Save the trained model to model_folder.
# NOTE(review): model_folder is not assigned in any cell visible in this notebook — define it before running this cell.
trainer.save_model(model_folder)

In [None]:
# Save our tokenizer and create a model card
repository_id = 'spsither/tibetan-RoBERTa'
# repository_id is used here as the save path for the tokenizer files.
# NOTE(review): trainer.push_to_hub() below is not given repository_id — confirm the tokenizer files end up in the pushed repository.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

In [None]:
# model.config.to_json_file(f"{tokenizer_folder}/config.json")

In [None]:
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

# Reload a tokenizer from the tokenizer_f_d folder and a model from a saved checkpoint path.
tokenizer_folder = 'tokenizer_f_d'
MAX_LEN    = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)
model = RobertaForMaskedLM.from_pretrained('/home/ec2-user/SageMaker/RoBERTa/checkpoint-1445231')

In [None]:
# Push the model and the tokenizer under the same repository name.
model.push_to_hub('tibetan_RoBERTa_Afd_e1')
tokenizer.push_to_hub('tibetan_RoBERTa_Afd_e1')

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from a checkpoint under model_folder and the tokenizer folder.
# NOTE(review): model_folder is not assigned in any cell visible in this notebook — define it before running this cell.
fill_mask = pipeline(
    "fill-mask",
    model=f"{model_folder}/checkpoint-110120",
    tokenizer=tokenizer_folder
)

In [None]:
# The comment line below is the reference sentence; the call masks one syllable of it.
# NOTE(review): besides the masked syllable, the prompt is also missing two '་' separators
# that the reference has (one mid-sentence, one at the end) — confirm this is intentional.
#          སེམས་ཀྱི་རང་བཞིན་འོད་གསལ་བ་ཟེར་ཡ་དེ་
fill_mask("སེམས་ཀྱི་རང་བཞིནའོད་<mask>་བ་ཟེར་ཡ་དེ")

In [None]:
# One prompt per line, each ending in a <mask> token for the model to fill in.
# The last prompt previously began with a stray ASCII apostrophe, which was
# sent to the model as part of the input; it has been removed.
samples = """རིན་ <mask>
ཆོས་ཀྱི་ <mask>
རྫོགས་པའི་ <mask>
གངས་རིའི་ <mask>
མེ་ལོང་ <mask>
བདེན་པའི་ <mask>
འབྱུང་ <mask>""".splitlines()

# Print each numbered prompt followed by every prediction returned for it.
for idx, sample in enumerate(samples, start=1):
    outputs = fill_mask(sample)
    print(idx, sample)
    for output in outputs:
        print(output)

In [None]:
text = "དེ་ནས་ཤར་ཕྱོགས་སུ་ནགས་སྟུག་པོ་བརྒྱུད་དེ་རྒྱང་གྲགས་ཉིས་བརྒྱ་བགྲོད་པ་ན་ཨི་ར་ན་བྷ་ཏའི་ཡུལ་ལོ། །ཐ་གྲུར་རྒྱང་གྲགས་སུམ་སྟོང་ལྷག་པ། ལྟེ་བའི་མཁར་ཆེན་ནི།"
ground = text.split('་')

# For every position, replace that piece with <mask>, rejoin the text, and record
# whether any returned prediction's token_str equals the hidden piece.
corrects = []
for position, hidden in enumerate(ground):
    masked_pieces = ground[:position] + ['<mask>'] + ground[position + 1:]
    predictions = fill_mask('་'.join(masked_pieces))
    corrects.append(any(pred['token_str'] == hidden for pred in predictions))

# Number of positions recovered, then the total number of positions.
print(sum(corrects), len(corrects))

In [None]:
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

# Reload the tokenizer from the tokenizer_S_b folder and a model from a saved checkpoint path.
tokenizer_folder = 'tokenizer_S_b'
MAX_LEN    = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)
model = RobertaForMaskedLM.from_pretrained('/home/ec2-user/SageMaker/RoBERTa/RoBERTa/checkpoint-480288')

In [None]:
# Push the model and the tokenizer under the same repository name.
model.push_to_hub('tibetan_RoBERTa_S_e6')
tokenizer.push_to_hub('tibetan_RoBERTa_S_e6')