### Refer to this [blog](https://medium.com/analytics-vidhya/create-a-tokenizer-and-train-a-huggingface-roberta-model-from-scratch-f3ed1138180c) and this [code](https://github.com/edumunozsala/RoBERTa_Encoder_Decoder_Product_Names/blob/03c0456f03d8cff62e2d1b04f03029130694e18b/RoBERTa%20MLM%20and%20Tokenizer%20train%20for%20Text%20generation.ipynb)


Recommended instance types for training:
- ml.g5.4xlarge
- ml.g5.2xlarge

In [None]:
%%capture
# Install the tokenizer, modelling, dataset and experiment-tracking packages used in this notebook.
! pip install tokenizers transformers ipywidgets pandas datasets wandb huggingface_hub tqdm

In [None]:
# Upgrade accelerate to its latest release (-U).
! pip install accelerate -U
# ! pip install transformers[torch]

In [None]:
import os

# Point the Hugging Face caches at the SageMaker cache directory.
_cache_root = '/home/ec2-user/SageMaker/cache'
os.environ.update({
    'HF_HOME': _cache_root,
    'HF_DATASETS_CACHE': f'{_cache_root}/datasets',
})

In [None]:
# Print the cache-related environment variables as seen by the shell.
!echo $HF_HOME
!echo $HF_DATASETS_CACHE

In [None]:
from huggingface_hub import notebook_login
# SECURITY: an access token used to be pasted here in plain text. It has been removed;
# treat it as compromised and revoke it in the Hugging Face account settings.
# Enter the token in the interactive prompt below instead of storing it in the notebook.
notebook_login()

In [None]:
import wandb

# Authenticate with Weights & Biases; the training arguments later report to 'wandb'.
wandb.login()

In [None]:
import pandas as pd
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch
from torch.utils.data.dataset import Dataset

import os
import math

from huggingface_hub import HfFolder, notebook_login

In [None]:
%%time 
paths = [str(x) for x in Path("/home/ec2-user/SageMaker/monolingual").glob("A/*.txt")]

In [None]:
# Vocabulary sizes of existing tokenizers: openpecha/Madlad-v1 has 256000,
# sangjeedondrub/tibetan-roberta-base has 52000.
# With vocab_size set to 52000, BPE generated 52000 tokens.
# With vocab_size set to 256000, BPE generated 86761 tokens.
# 86761 seems optimal because tokenizer training stops at 86761 for min_frequency=2
# even when it is allowed to generate more.
# However, tokenizer_G (86761 tokens) contains too many nonsensical tokens, while
# tokenizer A with a vocab size of 52000 has fewer of them.

# Target vocabulary size passed to tokenizer training and to the model config.
VOCAB_SIZE = 52000
# Maximum sequence length (in tokens) used for truncation.
MAX_LEN    = 512

In [None]:
%%time
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=VOCAB_SIZE, min_frequency=2, # tried 1 gives more tokens
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])

In [None]:
# Save the tokenizer files to disk.
tokenizer_folder = 'tokenizer'
# Create the folder from Python instead of `! mkdir`: with exist_ok=True the cell
# can be re-run without an error when the folder already exists.
os.makedirs(tokenizer_folder, exist_ok=True)
tokenizer.save_model(tokenizer_folder)

In [None]:
tokenizer_folder = 'tokenizer'

# Rebuild the tokenizer from the vocabulary and merges files saved in tokenizer_folder.
vocab_file = os.path.abspath(os.path.join(tokenizer_folder, 'vocab.json'))
merges_file = os.path.abspath(os.path.join(tokenizer_folder, 'merges.txt'))
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [None]:
# Attach the "</s>" / "<s>" post-processor and cap encoded sequences at MAX_LEN tokens.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)
tokenizer.enable_truncation(max_length=MAX_LEN)

In [None]:
from transformers import RobertaConfig, RobertaForMaskedLM

# RoBERTa architecture: "Medium" values are in use, "Large" alternatives are noted alongside.
config = RobertaConfig(
    vocab_size=VOCAB_SIZE,
    hidden_size=768,              # 1024 Large
    num_hidden_layers=6,          # 24 Large
    num_attention_heads=12,       # 16 Large
    max_position_embeddings=514,
    type_vocab_size=1,
)

# Build the model from the configuration alone, i.e. without pretrained weights.
model = RobertaForMaskedLM(config=config)
param_count = model.num_parameters()
print('Num parameters: ', param_count)

In [None]:
from transformers import RobertaTokenizerFast

# Optional: add_prefix_space=True
# Set this when you want the tokenizer to work on syllables obtained with text.split('་').
# Useful for NER / POS tagging / word chunking.
# Pass is_split_into_words=True when calling the tokenizer to make use of it.

# Reload the saved tokenizer files as a transformers fast tokenizer.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=MAX_LEN)

In [None]:
# Display the tokenizer object.
tokenizer

In [None]:
# Number of corpus files collected.
len(paths)

In [None]:
import random

# Shuffle the file list before it is split into train and test sets.
# The list is sorted first and shuffled with a fixed seed, so the split is identical
# on every run regardless of the order in which glob() returned the files.
SPLIT_SEED = 42
paths.sort()
random.Random(SPLIT_SEED).shuffle(paths)

In [None]:
# The first TRAIN_FILE_COUNT shuffled files go to training, the remainder to testing.
TRAIN_FILE_COUNT = 53000
paths_train, paths_test = paths[:TRAIN_FILE_COUNT], paths[TRAIN_FILE_COUNT:]
# For a quick smoke test, use a tiny split instead:
# paths_train = paths[0:100]
# paths_test  = paths[100:110]

In [None]:
# Percentage of files held out for testing.
len(paths_test) / len(paths) * 100

In [None]:
from datasets import load_dataset

# Load the text files as a dataset with "train" and "test" splits.
split_files = {"train": paths_train, "test": paths_test}
dataset = load_dataset("text", data_files=split_files)

In [None]:
# Upload the un-merged dataset to the Hugging Face Hub under this repository id.
dataset.push_to_hub("spsither/tibetan_monolingual_A")

In [None]:
CONTEXT_LINES = 7 # try 5 next time: some sentences are very long; also try a mixture with fewer sentences


def merge_text_lines(examples, context_lines = CONTEXT_LINES):
    """Join consecutive text lines into longer examples.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.
        context_lines: Number of consecutive lines joined into one example.

    Returns:
        A dict whose 'text' entry lists the merged strings. Each is built from
        up to `context_lines` lines joined by a single space; a shorter final
        group is kept rather than dropped.
    """
    lines = examples['text']
    group_starts = range(0, len(lines), context_lines)
    return {'text': [' '.join(lines[start:start + context_lines]) for start in group_starts]}

In [None]:
%%time
# Apply merge_text_lines in batches of CONTEXT_LINES rows, so each batch collapses into one merged example.
merged_dataset = dataset.map(merge_text_lines, batched=True, batch_size=CONTEXT_LINES)

In [None]:
# Upload the merged dataset to the Hugging Face Hub under this repository id.
merged_dataset.push_to_hub("spsither/tibetan_monolingual_A_merged_7_lines")

In [None]:
from datasets import load_dataset
# Load the merged dataset from the Hub, caching it in the SageMaker datasets cache directory.
merged_dataset = load_dataset('spsither/tibetan_monolingual_A_merged_7_lines', cache_dir="/home/ec2-user/SageMaker/cache/datasets", num_proc=8)

In [None]:
# Display the merged dataset object.
merged_dataset

In [None]:
# merged_dataset.save_to_disk('merged_6_dataset')

In [None]:
# from datasets import load_from_disk
# merged_dataset = load_from_disk('merged_6_dataset')

In [None]:
# Check that PyTorch can see a CUDA device.
import torch
torch.cuda.is_available()

Whatever value you see for `vocab_size`, consider using it for `VOCAB_SIZE`, i.e. the smallest value required.

In [None]:
import pandas as pd

class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text example per item, on access.

    Args:
        dataset: Object whose 'text' entry yields the raw strings.
        tokenizer: Tokenizer exposing `encode_plus`.
        max_len: Length each example is truncated / padded to.
    """

    def __init__(self, dataset, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Column 0 of the frame holds the text of each example.
        self.df = pd.DataFrame(dataset['text'])

    def __len__(self):
        n_rows, _ = self.df.shape
        return n_rows

    def __getitem__(self, i):
        text = self.df.iloc[i, 0]
        encoded = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        # The encoder output is indexed with [0] to return one row per field.
        return {
            'input_ids': encoded.input_ids[0],
            'attention_mask': encoded.attention_mask[0],
        }

In [None]:
# Evaluation examples: the 'test' split of the merged dataset.
eval_dataset = CustomDataset(merged_dataset['test'], tokenizer)

In [None]:
# Training examples: the 'train' split of the merged dataset.
train_dataset = CustomDataset(merged_dataset['train'], tokenizer)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Define the data collator for masked language modelling (mlm=True) with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments
# Define the training arguments
training_args = TrainingArguments(
    output_dir = 'RoBERTa',        # checkpoints are written under this folder
    overwrite_output_dir = False,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 50,         # number of epochs to train (library default: 3)
    learning_rate = 1e-4,          # learning rate (library default: 5e-5)
    warmup_steps = 500,
    weight_decay = 0.01,
    per_device_train_batch_size = 24, # 32 is too big. 24 is hitting 78.4% GPU memory usage
    per_device_eval_batch_size  = 24, # can be larger than per_device_train_batch_size, no need for grad
    logging_strategy = "steps",
    logging_steps = 100,
    save_total_limit = 40,
    report_to = ['wandb'],
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    # Train on the training split. This was previously `eval_dataset`, which trained the
    # model on the held-out data and made the evaluation loss / perplexity meaningless.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #prediction_loss_only=True,
)

In [None]:
# Train the model; evaluation and checkpointing follow the per-epoch strategies set in training_args.
trainer.train()

In [None]:
# Evaluate on the evaluation set and report perplexity, i.e. exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# `model_folder` was not defined anywhere in this notebook, so this cell raised NameError.
# It is set to the Trainer's output directory, which matches the
# f"{model_folder}/checkpoint-..." path that the fill-mask pipeline cell loads from.
model_folder = training_args.output_dir
trainer.save_model(model_folder)

In [None]:
# Save our tokenizer and create a model card
repository_id = 'spsither/tibetan-RoBERTa'
# trainer.push_to_hub() uploads the Trainer's output directory, so the tokenizer files
# must be written there to be published with the model. They were previously saved to a
# separate local folder named after the repository id, which is not part of the upload.
tokenizer.save_pretrained(training_args.output_dir)
trainer.create_model_card()
# NOTE(review): training_args sets no hub_model_id, so the target repository name is
# presumably derived from output_dir rather than from repository_id — confirm this is intended.
# Push the results to the hub
trainer.push_to_hub()

In [None]:
# model.config.to_json_file(f"{tokenizer_folder}/config.json")

In [None]:
from transformers import pipeline

# `model_folder` was not defined anywhere in this notebook. Checkpoints are written under
# the Trainer's output_dir ('RoBERTa'), so it is set here to keep this cell runnable
# in a fresh session without re-creating the Trainer.
model_folder = 'RoBERTa'

# Fill-mask pipeline built from a specific saved checkpoint and the tokenizer folder.
fill_mask = pipeline(
    "fill-mask",
    model=f"{model_folder}/checkpoint-110120",
    tokenizer=tokenizer_folder
)

In [None]:
# Reference sentence for the masked input below:
#          སེམས་ཀྱི་རང་བཞིན་འོད་གསལ་བ་ཟེར་ཡ་དེ་
# NOTE(review): the input differs from the reference by more than the <mask>
# (a tsheg is missing before "འོད" and at the end) — confirm whether that is intentional.
fill_mask("སེམས་ཀྱི་རང་བཞིནའོད་<mask>་བ་ཟེར་ཡ་དེ")

In [None]:
# Short prompts, one per line, each ending in a <mask> for the model to fill.
# The last prompt previously started with a stray apostrophe, which was fed to the
# model as part of the text; it has been removed.
samples = """རིན་ <mask>
ཆོས་ཀྱི་ <mask>
རྫོགས་པའི་ <mask>
གངས་རིའི་ <mask>
མེ་ལོང་ <mask>
བདེན་པའི་ <mask>
འབྱུང་ <mask>""".splitlines()

# Print each numbered prompt followed by the pipeline's candidate completions.
for idx, sample in enumerate(samples, start=1):
    outputs = fill_mask(sample)
    print(idx, sample)
    for output in outputs:
        print(output)

In [None]:
# Mask each tsheg-delimited piece of the passage in turn and record whether any of the
# pipeline's candidates reproduces the original piece exactly.
text = "དེ་ནས་ཤར་ཕྱོགས་སུ་ནགས་སྟུག་པོ་བརྒྱུད་དེ་རྒྱང་གྲགས་ཉིས་བརྒྱ་བགྲོད་པ་ན་ཨི་ར་ན་བྷ་ཏའི་ཡུལ་ལོ། །ཐ་གྲུར་རྒྱང་གྲགས་སུམ་སྟོང་ལྷག་པ། ལྟེ་བའི་མཁར་ཆེན་ནི།"
ground = text.split('་')
corrects = []
for position, target in enumerate(ground):
    masked_pieces = list(ground)
    masked_pieces[position] = '<mask>'
    predictions = fill_mask('་'.join(masked_pieces))
    corrects.append(any(p['token_str'] == target for p in predictions))

# Number of pieces recovered, followed by the total number of pieces.
print(sum(corrects), len(corrects))