In [2]:
!pip install datasets
!pip install transformers
!pip install accelerate>=0.21.0

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
from transformers import Seq2SeqTrainingArguments
import torch
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, PreTrainedTokenizerFast, TrainingArguments, Trainer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
import numpy as np

In [47]:
dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-en-vi')

In [44]:
dataset['train'][0]

{'translation': {'en': 'This is a sample text for training BPE.',
  'vi': 'Đây là một văn bản mẫu để huấn luyện BPE.'}}

#Mô hình sử dụng bpe

In [51]:
tokenizer_bpe = Tokenizer(models.BPE())
tokenizer_bpe.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(vocab_size=30522, min_frequency=2, special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"])

en_sentences = [ex['translation']['en'] for ex in dataset['train']]
vi_sentences = [ex['translation']['vi'] for ex in dataset['train']]

def batch_iterator(dataset, batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i:i + batch_size]

tokenizer_bpe.train_from_iterator(batch_iterator(en_sentences + vi_sentences), trainer)

tokenizer_bpe.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B:1 </s>",
    special_tokens=[
        ("<s>", 1),
        ("</s>", 2),
    ],
)
tokenizer_bpe.decoder = decoders.BPEDecoder()

def encode_with_bpe(batch, max_length=128):
    input_texts = batch['en']
    print("First input text:", input_texts[0])
    target_texts = batch['vi']
    print("First target text:", target_texts[0])
    inputs = [tokenizer_bpe.encode(text).ids for text in input_texts]
    outputs = [tokenizer_bpe.encode(text).ids for text in target_texts]

    inputs = [input[:max_length] + [0] * (max_length - len(input)) for input in inputs]
    outputs = [output[:max_length] + [0] * (max_length - len(output)) for output in outputs]
    print("Encoded first input:", inputs[0])
    print("Encoded first target:", outputs[0])
    batch['input_ids'] = inputs
    batch['attention_mask'] = [[1] * len(input) + [0] * (max_length - len(input)) for input in inputs]
    batch['labels'] = outputs

    return batch

dataset_bpe = datasets.Dataset.from_list(dataset['train']['translation']).map(lambda batch: encode_with_bpe(batch, max_length=128), batched=True)
dataset_bpe_eval = datasets.Dataset.from_list(dataset['validation']['translation']).map(lambda batch: encode_with_bpe(batch, max_length=128), batched=True)

Map:   0%|          | 0/133318 [00:00<?, ? examples/s]

First input text: Rachel Pike : The science behind a climate headline
First target text: Khoa học đằng sau một tiêu đề về khí hậu
Encoded first input: [1, 10366, 27639, 28, 437, 2075, 3011, 62, 3552, 13976, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Encoded first target: [1, 6754, 521, 3663, 734, 291, 1370, 707, 404, 1343, 2345, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
First i

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

First input text: When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;
First target text: Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;
Encoded first input: [1, 1539, 41, 393, 992, 14, 41, 1387, 496, 1820, 393, 253, 1943, 260, 253, 2613, 14, 276, 41, 4365, 573, 7131, 62, 2709, 1275, 9, 308, 29, 9212, 2198, 4689, 9360, 16, 9, 308, 29, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Encoded first target: [1, 1218, 289, 725, 1015, 14, 445, 493, 473, 3158, 14510, 16241, 271, 1147, 797, 841, 617, 536, 426, 626, 280, 289, 950, 2527, 1195, 9, 308, 29, 549, 300, 1671, 292, 475, 467

In [9]:
# model_bpe = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model_bpe = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Model_bpe/checkpoint-6500")
training_args_bpe = TrainingArguments(
    output_dir='/content/drive/MyDrive/Model_bpe',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

def data_collator(features):
    batch = {}
    batch['input_ids'] = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
    batch['attention_mask'] = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
    batch['labels'] = torch.tensor([f['labels'] for f in features], dtype=torch.long)
    return batch

trainer_bpe = Trainer(
    model=model_bpe,
    args=training_args_bpe,
    train_dataset=dataset_bpe,
    eval_dataset=dataset_bpe_eval,
    data_collator=data_collator
)

In [None]:
# Huấn luyện
trainer_bpe.train()

Step,Training Loss
500,2.2976
1000,1.5199
1500,1.4581
2000,1.4206
2500,1.3693
3000,1.3511
3500,1.3103
4000,1.2783
4500,1.261
5000,1.2412


In [10]:
results_bpe = trainer_bpe.evaluate()
print("Evaluation results with BPE:", results_bpe)

Evaluation results with BPE: {'eval_loss': 1.2344528436660767, 'eval_runtime': 464.6366, 'eval_samples_per_second': 2.731, 'eval_steps_per_second': 0.342}


#Mô hình encoder-decoder theo word level

In [70]:
tokenizer_no_bpe = AutoTokenizer.from_pretrained("bert-base-uncased")

def encode_no_bpe(batch):
    inputs = tokenizer_no_bpe(batch['en'], truncation=True, padding='max_length', max_length=128)
    outputs = tokenizer_no_bpe(batch['vi'], truncation=True, padding='max_length', max_length=128)
    batch['input_ids'] = inputs['input_ids']
    batch['attention_mask'] = inputs['attention_mask']
    batch['labels'] = outputs['input_ids']
    return batch

dataset_no_bpe = datasets.Dataset.from_list(dataset['train']['translation']).map(encode_no_bpe, batched=True)
dataset_no_bpe_eval = datasets.Dataset.from_list(dataset['validation']['translation']).map(encode_no_bpe, batched=True)

Map:   0%|          | 0/133318 [00:00<?, ? examples/s]

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

#Mô hình không sử dụng BPE

In [12]:
tokenizer_no_bpe = AutoTokenizer.from_pretrained("bert-base-uncased")

def encode_no_bpe(batch):
    inputs = tokenizer_no_bpe(batch['en'], truncation=True, padding='max_length', max_length=128)
    outputs = tokenizer_no_bpe(batch['vi'], truncation=True, padding='max_length', max_length=128)
    batch['input_ids'] = inputs['input_ids']
    batch['attention_mask'] = inputs['attention_mask']
    batch['labels'] = outputs['input_ids']
    return batch

dataset_no_bpe = datasets.Dataset.from_list(dataset['train']['translation']).map(encode_no_bpe, batched=True)
dataset_no_bpe_eval = datasets.Dataset.from_list(dataset['validation']['translation']).map(encode_no_bpe, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/133318 [00:00<?, ? examples/s]

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [38]:
print(dataset_no_bpe[0])
print(dataset_no_bpe_eval[0])
{'en': 'Rachel Pike : The science behind a climate headline', 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu', 'input_ids': [101, 5586, 12694, 1024, 1996, 2671, 2369, 1037, 4785, 17653, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [101, 1047, 6806, 2050, 21929, 1102, 5654, 7842, 2226, 9587, 2102, 5495, 2226, 1102, 2063, 2310, 1047, 4048, 5292, 2226, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'en': 'When I was little , I thought my country was the best on the planet , and I grew up singing a song called &quot; Nothing To Envy . &quot;', 'vi': 'Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên là đất nước tốt nhất trên thế giới và tôi thường hát bài &quot; Chúng ta chẳng có gì phải ghen tị . &quot;', 'input_ids': [101, 2043, 1045, 2001, 2210, 1010, 1045, 2245, 2026, 2406, 2001, 1996, 2190, 2006, 1996, 4774, 1010, 1998, 1045, 3473, 2039, 4823, 1037, 2299, 2170, 1004, 22035, 2102, 1025, 2498, 2000, 21103, 1012, 1004, 22035, 2102, 1025, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [101, 1047, 4048, 2000, 2072, 9530, 18699, 2080, 1010, 2000, 2072, 12835, 4048, 8369, 8670, 6593, 7373, 2226, 5495, 2078, 2474, 1102, 4017, 16371, 10085, 2000, 2102, 18699, 4017, 29461, 2078, 1996, 21025, 10448, 12436, 2000, 2072, 16215, 19098, 3070, 6045, 21790, 1004, 22035, 2102, 1025, 15972, 11937, 11132, 2522, 21025, 6887, 4886, 1043, 10222, 14841, 1012, 1004, 22035, 2102, 1025, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

{'en': 'Rachel Pike : The science behind a climate headline', 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu', 'input_ids': [101, 5586, 12694, 1024, 1996, 2671, 2369, 1037, 4785, 17653, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [101, 1047, 6806, 2050, 21929, 1102, 5654,

In [87]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
from transformers import Seq2SeqTrainer
# model_no_bpe = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model_no_bpe = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Model_no_bpe/checkpoint-6500")
training_args_no_bpe = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/Model_no_bpe',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer_no_bpe =  Seq2SeqTrainer(
    model=model_no_bpe,
    args=training_args_no_bpe,
    train_dataset=dataset_no_bpe,
    eval_dataset=dataset_no_bpe_eval
)

In [1]:
trainer_no_bpe.train()

NameError: name 'trainer_no_bpe' is not defined

In [89]:
results_no_bpe = trainer_no_bpe.evaluate()
print("Evaluation results without BPE:", results_no_bpe)

Step,Training Loss,Validation Loss
2,No log,1.268862


Evaluation results without BPE: {'eval_loss': 1.2688624858856201}
