<a href="https://colab.research.google.com/github/OpenPecha-dev/models/blob/main/models/lm/Classical_Bo_BERT_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers
!pip install transformers
!pip install datasets

Collecting tokenizers
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 19.4 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.6
Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 26.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.3 MB/s 
Installing collected packages: pyyaml, sacremoses, huggin

In [None]:
from typing import List
from pathlib import Path

from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
def _mkdir(path: Path) -> Path:
  path.mkdir(exist_ok=True, parents=True)
  return path

# Project root on Google Drive.
# NOTE(review): no Drive mount is visible in this notebook — presumably it is
# mounted by hand first; confirm, since `_mkdir` creates missing folders.
BASE_PATH = Path("/content/drive/MyDrive/OpenPecha/ML/LM")
DATA_PATH = BASE_PATH / "data"  # input corpus; read only, never created here
MODELS_PATH = _mkdir(BASE_PATH / "models" / "transformers")

# The tokenizer files and the model weights share one directory.
tokenizer_path = _mkdir(MODELS_PATH / "RoBERTaMLM_classical_bo")
lm_path = _mkdir(MODELS_PATH / "RoBERTaMLM_classical_bo")

In [None]:
def get_text_paths(path) -> List[str]:
    """Return the paths of the raw text files stored one level below ``path``.

    ``path`` is scanned for sub-directories; loose files at the top level are
    skipped. Inside each sub-directory every regular file is kept unless its
    stem contains ``"tokenized"``.

    Args:
        path: ``pathlib.Path`` of the corpus root directory.

    Returns:
        The selected file paths as strings, sorted so that repeated runs
        return them in the same order.
    """
    files = []
    for pecha_path in tqdm(list(path.iterdir())):
        if not pecha_path.is_dir():
            continue
        for fn in pecha_path.iterdir():
            # Nested directories cannot be read as training text, so only
            # regular files are collected.
            if not fn.is_file() or 'tokenized' in fn.stem:
                continue
            files.append(str(fn))
    return sorted(files)

In [None]:
# Collect the text files of the "classical_bo" corpus folder for tokenizer training.
paths = get_text_paths(DATA_PATH / "classical_bo")

100%|██████████| 417/417 [00:08<00:00, 47.88it/s] 


## Train Tokenizer

In [None]:
# Initialize a byte-level BPE tokenizer with no vocabulary or merges loaded yet.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the BPE vocabulary on all collected corpus files.
# The special tokens are passed explicitly so they are part of the vocabulary;
# their order here matches the RoBERTa convention (<s>, <pad>, </s>, <unk>, <mask>).
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [None]:
# Save the trained tokenizer to disk (writes vocab.json and merges.txt into tokenizer_path).
tokenizer.save_model(str(tokenizer_path))

['/content/drive/MyDrive/OpenPecha/ML/LM/models/transformers/RoBERTaMLM_classical_bo/vocab.json',
 '/content/drive/MyDrive/OpenPecha/ML/LM/models/transformers/RoBERTaMLM_classical_bo/merges.txt']

In [None]:
!head ./classical_bo-vocab.json

head: cannot open './classical_bo-vocab.json' for reading: No such file or directory


In [None]:
!head ./classical_bo-merges.txt

head: cannot open './classical_bo-merges.txt' for reading: No such file or directory


In [None]:
# Rebuild the tokenizer from the saved vocab/merges files, replacing the
# in-memory instance that was trained above.
tokenizer = ByteLevelBPETokenizer(
    str(tokenizer_path / "vocab.json"),
    str(tokenizer_path / "merges.txt"),
)

In [None]:
# Wrap every encoding as "<s> ... </s>". BertProcessing takes the closing
# (separator) token first and the opening (classifier) token second, each as a
# (token, id) pair.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [None]:
# Truncate encodings that are longer than 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Sanity check: encode a Tibetan phrase and list its tokens. The tokens are
# byte-level symbols, so they do not print as readable Tibetan.
print(tokenizer.encode("མངོན་པར་རྟོགས་པའི་རྒྱན་འགྲེལ་བ་དང་བཅས་པའི་དཀའ་བའི་").tokens)

['<s>', 'à½ĺà½Ħ', 'à½¼', 'à½ĵ', 'à¼ĭ', 'à½Ķà½¢', 'à¼ĭ', 'à½¢', 'à¾Łà½¼', 'à½Ĥà½¦', 'à¼ĭ', 'à½Ķà½ł', 'à½²à¼ĭ', 'à½¢', 'à¾Ĵà¾±', 'à½ĵ', 'à¼ĭ', 'à½łà½Ĥ', 'à¾²à½º', 'à½£', 'à¼ĭ', 'à½ĸ', 'à¼ĭ', 'à½ĳà½Ħ', 'à¼ĭ', 'à½ĸà½ħà½¦', 'à¼ĭ', 'à½Ķà½ł', 'à½²à¼ĭ', 'à½ĳà½Ģà½ł', 'à¼ĭ', 'à½ĸà½ł', 'à½²à¼ĭ', '</s>']


## Train Language Model

In [None]:
from datasets import load_dataset

In [None]:
# Raw text file (base layer v001 of OpenPecha P000241) used as the language-model dataset.
dataset_url = "https://raw.githubusercontent.com/OpenPecha/P000241/master/P000241.opf/base/v001.txt"

In [None]:
# Download the file and load it with the generic "text" builder as a single
# "train" split; each record exposes its content under the "text" key.
dataset = load_dataset("text", data_files=dataset_url, split="train")

Using custom data configuration default-6530a1aaf979a5fd


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-6530a1aaf979a5fd/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/195k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-6530a1aaf979a5fd/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [None]:
# Hold out 20% of the records for evaluation. The fixed seed makes the split,
# and therefore the reported loss and perplexity, reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training record.
dataset["train"][0]

{'text': 'ཕྱི་རོལ་དོན་གྱི་རྣམ་པ་སྣང་བའི་ཤེས་པ་ཆོས་ཅན། གཅིག་ཏུ་ཡོད་པ་མ་ཡིན་ཏེ། སྣ་ཚོགས་སུ་སྣང་བའི་ཕྱིར་ཞེས་པའོ། །རྟགས་འདི་ལ་བརྟེན་པའི་ཚད་མ་ནི་རྣམ་རྫུན་པ་ལ་ཡང་ཡོད་དེ། ཇི་སྐད་དུ། རང་བཞིན་མཐོང་མེད་ཕྱིར་གཅིག་མིན། །ཞེས་གསུངས་པའི་ཕྱིར་རོ། །'}

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizerFast

In [None]:
# Set a configuration for our RoBERTa model
config = RobertaConfig(
    # Must cover every id the tokenizer can emit. The tokenizer in this notebook
    # is trained with vocab_size=52_000; a smaller embedding table fails with an
    # index error as soon as a larger token id is looked up.
    vocab_size=52_000,
    # 512 usable positions plus the 2 positions RoBERTa reserves for padding.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize the model from a configuration without pretrained weights
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  49816064


In [None]:
# Load the trained vocab/merges as a fast RoBERTa tokenizer for use with the model.
# `model_max_length` is the documented name of the length limit; `max_len` is
# only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(str(tokenizer_path), model_max_length=512)

In [None]:
def encode(sentence):
    """Tokenize the ``"text"`` field of a batch, truncating and padding to the maximum length."""
    texts = sentence["text"]
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [None]:
# Tokenize both splits, passing the records to `encode` in batches.
dataset_encoded = dataset.map(encode, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForLanguageModeling
import math

# The tokenizer already has its own "<pad>" token (it is one of the special
# tokens the vocabulary was trained with, and `encode` pads with it), so the pad
# token is deliberately NOT re-pointed at "</s>". Doing so would drop "<pad>"
# from the tokenizer's special tokens, and the collator would then pick padded
# positions for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Training hyper-parameters: a single epoch with an evaluation pass at the end
# of each epoch; run artifacts are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Wire the model, the tokenized train/test splits and the masking collator
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    data_collator=data_collator,
)

In [None]:
# Run training with the arguments configured above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1176
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 147


Epoch,Training Loss,Validation Loss
1,No log,2.090354


The following columns in the evaluation set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 294
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=147, training_loss=2.4134770607461733, metrics={'train_runtime': 221.5674, 'train_samples_per_second': 5.308, 'train_steps_per_second': 0.663, 'total_flos': 153642489348096.0, 'train_loss': 2.4134770607461733, 'epoch': 1.0})

In [None]:
# Evaluate on the held-out split and report perplexity, computed as exp(eval loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text. If text are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 294
  Batch size = 8


Perplexity: 866.45


In [None]:
# Save the trained model (configuration and weights) into the model directory.
trainer.save_model(lm_path)

Saving model checkpoint to models/RoBERTaMLM_classical_bo
Configuration saved in models/RoBERTaMLM_classical_bo/config.json
Model weights saved in models/RoBERTaMLM_classical_bo/pytorch_model.bin


## Checking the trained model using a Pipeline

In [None]:
# `pipeline` is imported here because the notebook's own import of it only
# appears in a later cell, so this cell raised NameError on a fresh kernel.
from transformers import pipeline

# NOTE(review): "text-generation" is a causal-LM task, while `model` as built
# earlier in this notebook is a RobertaForMaskedLM — confirm whether this cell
# was meant for the GPT-2 experiment further down.
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Display the current tokenizer object to inspect it.
tokenizer

AttributeError: ignored

In [None]:
# Try the generation pipeline on a short Tibetan prompt.
generator("།དོན་ཤེས་དོན་སྟོན་རབ་ཏུ་བྱེ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 74, but ``max_length`` is set to 50. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


RuntimeError: ignored

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from the files saved on disk (not from the
# in-memory objects), which also checks that the saved model and tokenizer
# can be loaded back.
fill_mask = pipeline(
    "fill-mask",
    model=str(lm_path),
    tokenizer=str(tokenizer_path)
)

loading configuration file models/RoBERTaMLM_classical_bo/config.json
Model config RobertaConfig {
  "_name_or_path": "models/RoBERTaMLM_classical_bo",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 8192
}

loading configuration file models/RoBERTaMLM_classical_bo/config.json
Model config RobertaConfig {
  "_name_or_path": "models/RoBERTaMLM_classical_bo",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attentio

In [None]:
# Ask the model for its top candidates for the masked position.
fill_mask("སྨོན་<mask>དག་པའི་ཤིང་")

[{'score': 0.05201738327741623,
  'sequence': 'སྨོན་་དག་པའི་ཤིང་',
  'token': 263,
  'token_str': '་'},
 {'score': 0.008260619826614857,
  'sequence': 'སྨོན་སདག་པའི་ཤིང་',
  'token': 264,
  'token_str': 'ས'},
 {'score': 0.008035602048039436,
  'sequence': 'སྨོན་ི་དག་པའི་ཤིང་',
  'token': 273,
  'token_str': 'ི་'},
 {'score': 0.007207084912806749,
  'sequence': 'སྨོན་ནདག་པའི་ཤིང་',
  'token': 270,
  'token_str': 'ན'},
 {'score': 0.006591159850358963,
  'sequence': 'སྨོན་པདག་པའི་ཤིང་',
  'token': 274,
  'token_str': 'པ'}]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pretrained distilgpt2 tokenizer and causal language model.
# NOTE(review): this rebinds `tokenizer` and `model`, replacing the RoBERTa
# objects used earlier in the notebook — cells re-run after this one will see
# the GPT-2 objects instead.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/distilgpt2/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpx4jzzki3


Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
creating metadata file for /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
creating metadata file for /root/.cache/huggingface/transformers/55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
https://huggingface.co/distilgpt2/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp5d64xvkv


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/distilgpt2/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp2isu_ur8


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/accb287b5a5396b2597382916b6cc939fdab1366e89475a92338d3971b3d02b7.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
creating metadata file for /root/.cache/huggingface/transformers/accb287b5a5396b2597382916b6cc939fdab1366e89475a92338d3971b3d02b7.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/distilgpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/55051ac97dcc32f0a736d21a32a4d42b0d9b90f117ca7c38e65038b04bd5c3f5.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/distilgpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/9dfb299b74cdf7601ba7cd3a8073dbdac351caec0ed7ab5849b098b3c8ae3d57.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/distilgpt2/re

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

storing https://huggingface.co/distilgpt2/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/43a212e83e76bcb07f45be584cf100676bdbbbe9c13f9e5c1c050049143a832f.a83d881ec4d624fd4b5826dd026e315246c48c67504ff91c0500570e291a54ba
creating metadata file for /root/.cache/huggingface/transformers/43a212e83e76bcb07f45be584cf100676bdbbbe9c13f9e5c1c050049143a832f.a83d881ec4d624fd4b5826dd026e315246c48c67504ff91c0500570e291a54ba
loading weights file https://huggingface.co/distilgpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/43a212e83e76bcb07f45be584cf100676bdbbbe9c13f9e5c1c050049143a832f.a83d881ec4d624fd4b5826dd026e315246c48c67504ff91c0500570e291a54ba
All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel 

In [None]:
# Re-run weight initialization on the loaded model.
# NOTE(review): presumably meant to discard the pretrained distilgpt2 weights
# and start from scratch — confirm the intent.
model.init_weights()