<a href="https://colab.research.google.com/github/OpenPecha-dev/models/blob/main/models/lm/Classical_Bo_GPT_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers
!pip install transformers
!pip install datasets
!pip install -U openpecha

Collecting openpecha
  Downloading openpecha-0.7.83-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 18.0 MB/s 
Installing collected packages: openpecha
  Attempting uninstall: openpecha
    Found existing installation: openpecha 0.7.82
    Uninstalling openpecha-0.7.82:
      Successfully uninstalled openpecha-0.7.82
Successfully installed openpecha-0.7.83


In [None]:
from typing import List
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

### Paths

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project paths defined below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def _mkdir(path: Path) -> Path:
  path.mkdir(exist_ok=True, parents=True)
  return path

# All artefacts live under one Google Drive folder; each sub-directory is
# created on the spot by _mkdir if it does not exist yet.
BASE_PATH = Path("/content/drive/MyDrive/OpenPecha/ML/LM")
DATA_PATH = _mkdir(BASE_PATH.joinpath("data"))
MODELS_PATH = _mkdir(BASE_PATH.joinpath("models"))

# The tokenizer files and the language model point at the same directory.
tokenizer_path = _mkdir(MODELS_PATH.joinpath("GPT_classical_bo"))
lm_path = _mkdir(MODELS_PATH.joinpath("GPT_classical_bo"))

## Download the corpus

In [None]:
from openpecha.corpus.download import download_corpus

In [None]:
# Download the "classical_bo" corpus into DATA_PATH and keep the helper's return value.
corpus_path = download_corpus("classical_bo", DATA_PATH)

INFO: Downloading PC7518B4E...
INFO: Downloading PEAE1BEB5...
INFO: Downloading P9AAABA99...
INFO: Downloading PE08B39D4...
INFO: Downloading P660CE4D0...
INFO: Downloading P4A2FAFEA...
INFO: Downloading PC55A2EBF...
INFO: Downloading PC78343A6...
INFO: Downloading P406E0054...
INFO: Downloading PC22CB91F...
INFO: Downloading PB94F36D6...
INFO: Downloading P4D4F7C47...
INFO: Downloading PD6E5AF10...
INFO: Downloading P143DB3B5...
INFO: Downloading PDF4FD415...
INFO: Downloading PED98E392...
INFO: Downloading PE99138DF...
INFO: Downloading PC74743EF...
INFO: Downloading P02D29113...
INFO: Downloading P95488BBE...
INFO: Downloading P16A78071...
INFO: Downloading PE997BD16...
INFO: Downloading P0680C678...
INFO: Downloading P9224D7A7...
INFO: Downloading PAD105AAA...
INFO: Downloading P3C42B0C4...
INFO: Downloading P42A41338...
INFO: Downloading PD0D6CD6A...
INFO: Downloading PA0C4F688...
INFO: Downloading P94CE90D9...
INFO: Downloading PAF6381F7...
INFO: Downloading PB35A1867...
INFO: Do

KeyboardInterrupt: ignored

In [None]:
# Point corpus_path at the corpus directory directly. This rebinding replaces
# whatever the download cell assigned, so the download can be skipped when the
# data is already on disk.
corpus_path = DATA_PATH / "classical_bo"

In [None]:
def get_text_paths(path) -> List[str]:
  """Collect the files stored one level below the corpus root.

  The root is assumed to contain one sub-directory per pecha, each holding
  that pecha's text files.

  Args:
    path: Corpus root directory, given as ``str`` or ``Path``.

  Returns:
    Sorted list of file paths (as strings), one per regular file found
    directly inside a sub-directory of ``path``. Nothing on disk is modified.
  """
  files = []
  for pecha_path in sorted(Path(path).iterdir()):
    # Top-level entries that are not directories cannot be iterated
    # (NotADirectoryError), so they are skipped.
    if not pecha_path.is_dir():
      continue
    # Directories holding a ".txt." entry are skipped. The earlier attempt to
    # unlink() the directory itself could never succeed, because unlink() only
    # removes files, so nothing is deleted here.
    if (pecha_path / ".txt.").is_file():
      continue
    # Keep regular files only; nested directories are not text files.
    files.extend(str(fn) for fn in sorted(pecha_path.iterdir()) if fn.is_file())
  return files

## Train Tokenizer

In [None]:
# Fresh byte-level BPE tokenizer; it is trained on the corpus files in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the BPE tokenizer on every file returned by get_text_paths and
# register the five special tokens in the order given.
tokenizer.train(
    files=get_text_paths(corpus_path),
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
 # Write the trained tokenizer files into tokenizer_path; the call returns the written file paths.
 tokenizer.save_model(str(tokenizer_path))

['/content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/vocab.json',
 '/content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/merges.txt']

## Train Language Model

In [None]:
from datasets import load_dataset

In [None]:
def get_dataset(path, test_size=0.2, seed=42):
  """Load the corpus files with the "text" loader and split them into train/test.

  Args:
    path: Corpus root directory, forwarded to ``get_text_paths``.
    test_size: Share of the examples held out for evaluation. Defaults to
      0.2, the value that used to be hard-coded.
    seed: Seed for the shuffle done by the split, so that re-running the
      notebook yields the same partition. Pass ``None`` for an unseeded split.

  Returns:
    The result of ``train_test_split``, containing a ``train`` and a ``test``
    split.
  """
  dataset = load_dataset("text", data_files=get_text_paths(path), split="train")
  return dataset.train_test_split(test_size=test_size, seed=seed)

In [None]:
# Build the train/test splits from the corpus directory.
dataset = get_dataset(corpus_path)

Resolving data files:   0%|          | 0/4516 [00:00<?, ?it/s]

Using custom data configuration default-72dd1ce9ee4ca6c5


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-72dd1ce9ee4ca6c5/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-72dd1ce9ee4ca6c5/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import GPT2Config, GPT2TokenizerFast, GPT2LMHeadModel

In [None]:
# Small GPT-2 configuration: 6 layers, 12 attention heads, 514 positions.
# The keyword is `n_positions`; the former spelling `n_position` is not a
# GPT2Config parameter, so the context size silently kept the library default.
# Tokenized inputs must not be longer than `n_positions` tokens.
# NOTE(review): `tokenizer` must support len() here -- confirm which tokenizer
# object is bound when this cell runs top to bottom.
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=514,
    n_head=12,
    n_layer=6,
)

In [None]:
# Build the model from the configuration alone: weights are freshly
# initialized, no pretrained checkpoint is loaded.
model = GPT2LMHeadModel(config=config)
# Report the size of the network that was just created.
print('Num parameters: ',model.num_parameters())

Num parameters:  53345280


In [None]:
# Sanity check: the token-embedding matrix has one row per tokenizer entry.
# NOTE(review): in top-to-bottom order `tokenizer` is still the
# ByteLevelBPETokenizer at this point (the GPT2TokenizerFast is loaded in a
# later cell) -- confirm that len() works on it.
assert model.transformer.wte.weight.shape[0] == len(tokenizer)

In [None]:
# Reload the saved tokenizer files as a GPT-2 fast tokenizer (this rebinds
# `tokenizer`) and declare which entries play the special-token roles.
tokenizer = GPT2TokenizerFast.from_pretrained(str(tokenizer_path))
tokenizer.add_special_tokens(dict(
  eos_token="</s>",
  bos_token="<s>",
  unk_token="<unk>",
  pad_token="<pad>",
  mask_token="<mask>",
))

Didn't find file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/tokenizer.json. We won't load it.
Didn't find file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/added_tokens.json. We won't load it.
Didn't find file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/special_tokens_map.json. We won't load it.
Didn't find file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/tokenizer_config.json. We won't load it.
loading file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/vocab.json
loading file /content/drive/MyDrive/OpenPecha/ML/LM/models/GPT_classical_bo/merges.txt
loading file None
loading file None
loading file None
loading file None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Assigning </s> to the eos_token key of the tokenizer
Assigning <s> to the bos_token key of the tokenizer
Assigning <unk> to the unk_token key of the tokenizer
As

0

In [None]:
def encode(sentence):
    """Tokenize a batch of dataset rows for language-model training.

    Args:
        sentence: Mapping with a ``"text"`` entry; with
            ``Dataset.map(..., batched=True)`` that entry is a list of strings.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), with each
        sequence truncated to the model's number of positions.
    """
    # `truncation=True` needs a length to truncate to. The tokenizer is loaded
    # from vocab/merges only and appears to carry no maximum length of its
    # own, so without `max_length` over-long lines stay intact and would
    # overflow the model's position embeddings.
    # No padding here: the data collator pads each batch to its longest row.
    return tokenizer(
        sentence["text"],
        truncation=True,
        max_length=model.config.n_positions,
    )

dataset_encoded = dataset.map(encode, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-72dd1ce9ee4ca6c5/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-8440e3debbe329cd.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/text/default-72dd1ce9ee4ca6c5/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-231ae2ae333a878a.arrow


In [None]:
# Drop the raw strings so that only the tokenizer outputs remain.
# NOTE(review): not idempotent -- a second run of this cell presumably fails
# because the "text" column is already gone.
dataset_encoded = dataset_encoded.remove_columns(["text"])

In [None]:
from transformers import DataCollatorForLanguageModeling
import math

# Pad with the end-of-sequence token; this replaces the "<pad>" token assigned
# when the tokenizer was loaded.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: no masked-language-modelling objective (GPT-style training).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training hyper-parameters: a single epoch, with evaluation run per epoch.
# Outputs go to ./results, which is relative to the working directory rather
# than the Drive folder used for the other artefacts.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Wire the model, the training arguments, both dataset splits and the
# collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    data_collator=data_collator,
)

In [None]:
# Run training as configured in training_args.
trainer.train()

***** Running training *****
  Num examples = 244704
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 30588


Epoch,Training Loss,Validation Loss


In [None]:
# Show the splits, their columns and their row counts.
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 244704
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 61176
    })
})

In [None]:
# Token count of the first training example.
len(dataset_encoded["train"][0]["input_ids"])

303