<a href="https://colab.research.google.com/github/OpenPecha-dev/models/blob/main/models/lm/Classical_Bo_GPT_LM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers
!pip install transformers
!pip install datasets



In [7]:
from typing import List
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

### Paths

In [8]:
def _mkdir(path: Path) -> Path:
  path.mkdir(exist_ok=True, parents=True)
  return path

BASE_PATH = Path.home() / ".models"
DATA_PATH = _mkdir(BASE_PATH / "data")
MODELS_PATH = _mkdir(BASE_PATH / "models")

tokenizer_path = _mkdir(MODELS_PATH / "GPT_classical_bo")
lm_path = _mkdir(MODELS_PATH / "GPT_classical_bo")
corpus_path = DATA_PATH / "classical_bo"

In [9]:
def get_text_paths(path) -> List[str]:
    files = []
    for pecha_path in path.iterdir():
        for fn in pecha_path.iterdir():
            if 'tokenized' in fn.name: continue
            files.append(str(fn))
    return files

## Train Tokenizer

In [10]:
# tokenizer = ByteLevelBPETokenizer()

In [11]:
# # Customize training
# tokenizer.train(files=get_text_paths(corpus_path), vocab_size=52_000, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])

In [12]:
 # tokenizer.save_model(str(tokenizer_path))

## Train Language Model

In [18]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import GPT2Config, GPT2TokenizerFast, GPT2LMHeadModel

In [19]:
def get_dataset(path):
  dataset = load_dataset("text", data_files=get_text_paths(path), split="train")
  dataset = dataset.train_test_split(test_size=0.2)
  return dataset

In [15]:
dataset = get_dataset(corpus_path)

Resolving data files:   0%|          | 0/4219 [00:00<?, ?it/s]

Using custom data configuration default-d06951f21216b50b
Reusing dataset text (/home/studio-lab-user/.cache/huggingface/datasets/text/default-d06951f21216b50b/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


In [20]:
tokenizer = GPT2TokenizerFast.from_pretrained(str(tokenizer_path))
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0

In [21]:
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_position=514,
    n_head=12,
    n_layer=6
)

In [22]:
# Initialize the model from a configuration without pretrained weights
model = GPT2LMHeadModel(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  48330240


In [23]:
assert model.transformer.wte.weight.shape[0] == len(tokenizer)

In [24]:
def encode(sentence):
    return tokenizer(sentence["text"], truncation=True, padding='max_length')

dataset_encoded = dataset.map(encode, batched=True)

  0%|          | 0/344 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/86 [00:00<?, ?ba/s]

In [25]:
dataset_encoded = dataset_encoded.remove_columns(["text"])

In [26]:
from transformers import DataCollatorForLanguageModeling
import math

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [27]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    data_collator=data_collator,
)

In [29]:
trainer.train()

***** Running training *****
  Num examples = 343508
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 42939


IndexError: index out of range in self