In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%cd /content/drive/MyDrive/AI_VIETNAM/AIO2023/Module 09/[Project]-RLHF

/content/drive/MyDrive/AI_VIETNAM/AIO2023/Module 09/[Project]-RLHF


In [3]:
!pip install -q transformers==4.38.2 tokenizers==0.15.2

In [4]:
!pip install -q accelerate -U

##**Dataset**

In [5]:
from datasets import load_dataset

In [26]:
oscar_en = load_dataset("nthngdy/oscar-small", language='en')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading data:   0%|          | 0.00/76.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [27]:
oscar_en

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 595810
    })
})

In [28]:
oscar_en['train']['text'][0]

'Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi. Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help.'

In [29]:
with open("oscar.en.txt", "w") as f:
    for text in oscar_en['train']['text']:
        f.write(text)


##**Tokenizer**

In [30]:
from tokenizers import ByteLevelBPETokenizer

paths = ['./oscar.en.txt']

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [31]:
# Save tokenizer
!mkdir oscar_en
tokenizer.save_model("oscar_en")

['oscar_en/vocab.json', 'oscar_en/merges.txt']

In [6]:
#Load vocabulary

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./oscar_en/vocab.json",
    "./oscar_en/merges.txt",
)

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.encode("I go to school.")

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [9]:
tokenizer.encode("I go to school.").tokens

['<s>', 'I', 'Ġgo', 'Ġto', 'Ġschool', '.', '</s>']

##**LM Training from Scratch**

In [11]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=512,
    num_attention_heads=6,
    num_hidden_layers=4,
    type_vocab_size=1,
)

In [12]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./oscar_en", max_len=512)

In [13]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [14]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-3): 4 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [15]:
model.num_parameters()

69327136

In [16]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.en.txt",
    block_size=128,
)



In [17]:
next(iter(dataset))

{'input_ids': tensor([    0,    49,  1714, 14288, 11276,   417,  5872,   423,   269,  5084,
           293,  7452, 33637,   379,    94,   298,  1027,    16,   562,   353,
          4610,   348,  2081,  1691,  3402,   461,  1341,   516,   765,  1447,
           289, 39542,    18,  7452, 33637, 30804,   269, 13001,   683,   324,
           262,  1207,   289, 29981,   291,  1144,   324,   269, 41572,   291,
         12216,  1652,   380,  6534,    39,    13,   292, 39542,    16,   291,
          2081,  5572,   289,   748,    18, 35500,   292,  7179,   293,  2081,
           998,  9298,    93,   387,    87,  3562,    16, 14400,  1691,  3402,
           461,    16,   414,  1891,  1207,   319,   747, 10438,   289,   269,
          1691,  3402,   461,  1366,    18, 24837,  1691,  3402,   461,    16,
           373, 30277, 24837,   359,   730,   319,   529,  7382,  6568,   289,
           401,   350,  1714, 14288,    16,  6009,   325,   892,  1341,   269,
          5113,  1976,    16,   291,   

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [33]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./oscar_en",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    save_strategy='epoch',
    save_total_limit=2,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [34]:
%%time
trainer.train()

Step,Training Loss


CPU times: user 33 s, sys: 8.81 s, total: 41.9 s
Wall time: 1min


TrainOutput(global_step=30, training_loss=6.829693603515625, metrics={'train_runtime': 60.1741, 'train_samples_per_second': 27.088, 'train_steps_per_second': 0.499, 'total_flos': 36299794759680.0, 'train_loss': 6.829693603515625, 'epoch': 10.0})

##**Mask Token Prediction**

In [36]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

In [37]:
fill_mask("I go <mask> school.")

[{'score': 0.027589479461312294,
  'token': 269,
  'token_str': ' the',
  'sequence': 'I go the school.'},
 {'score': 0.02498016692698002,
  'token': 293,
  'token_str': ' of',
  'sequence': 'I go of school.'},
 {'score': 0.023255636915564537,
  'token': 16,
  'token_str': ',',
  'sequence': 'I go, school.'},
 {'score': 0.016717061400413513,
  'token': 289,
  'token_str': ' to',
  'sequence': 'I go to school.'},
 {'score': 0.009881892241537571,
  'token': 291,
  'token_str': ' and',
  'sequence': 'I go and school.'}]