# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install datasets evaluate transformers
!pip install rouge_score
!pip install accelerate

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_metric
import torch
import numpy as np

In [4]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")
model.to('cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

# Preprocessing data

In [18]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], max_length=300, truncation=True, padding=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["labels"], max_length=300, truncation=True, padding=True
        )
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [22]:
train_set = pd.read_csv("/content/drive/MyDrive/CT208/Data/train_set.csv")
val_set = pd.read_csv("/content/drive/MyDrive/CT208/Data/val_set.csv")
test_set = pd.read_csv("/content/drive/MyDrive/CT208/Data/test_set.csv")

In [23]:
def tokenize(data):
    input_lines = data.iloc[:, 0].to_numpy()
    label_lines = data.iloc[:, 1].to_numpy()
    dict_obj = {'inputs': input_lines, 'labels': label_lines}
    dataset = Dataset.from_dict(dict_obj)
    tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)

    return tokenized_datasets

In [24]:
tokenized_train_set = tokenize(train_set)
tokenized_val_set = tokenize(val_set)
tokenized_test_set = tokenize(test_set)

Map (num_proc=10):   0%|          | 0/23272 [00:00<?, ? examples/s]



Map (num_proc=10):   0%|          | 0/5609 [00:00<?, ? examples/s]



Map (num_proc=10):   0%|          | 0/4965 [00:00<?, ? examples/s]



In [None]:
print(len(tokenized_train_set))
print(len(tokenized_val_set))
print(len(tokenized_test_set))

# Draft

## Training

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments("/content/drive/MyDrive/CT208/1st",
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=30,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_total_limit=3,
                                      #eval_steps=1,
                                      #evaluation_strategy="steps",
                                      # evaluation_strategy="no",
                                      fp16=True,
                                      )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    data_collator=data_collator,
)

In [None]:
trainer.train()

## Evaluate

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/CT208/1st/checkpoint-22500")
model.to('cuda')

In [None]:
metrics = load_metric('rouge')

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_val_set, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
    outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        max_length=max_target_length,
        attention_mask=batch['attention_mask'].to('cuda'),
    )
    with tokenizer.as_target_tokenizer():
        outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

        labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
        actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()

In [None]:
[{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]

## Generate text

In [None]:
sentence = "Các giá trị 'rouge-1,' 'rouge-2,' 'rouge-L,' và 'rouge-Lsum' là các phương pháp đánh giá chất lượng của các hệ thống tạo ra các văn bản tóm tắt. Đây là các độ đo phổ biến trong lĩnh vực xử lý ngôn ngữ tự nhiên và tóm tắt máy học."
text =  sentence + " </s>"
encoding = tokenizer(text, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=300,
    early_stopping=True
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

# 1st

## Training

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(output_dir="tmp/",
                                      do_train=True,
                                      do_eval=True,
                                      num_train_epochs=30,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_total_limit=1,
                                      evaluation_strategy="epoch",
                                      fp16=True,
                                      )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    data_collator=data_collator,
    eval_dataset=tokenized_val_set
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0039,0.08892
2,0.004,0.093458
3,0.0038,0.09127
4,0.0031,0.094107
5,0.0023,0.095836
6,0.0021,0.097693
7,0.0024,0.094863
8,0.0018,0.094528
9,0.0017,0.094873
10,0.0018,0.096007


KeyboardInterrupt: 

## Evaluate

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/tmp/checkpoint-15000")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
metrics = load_metric('rouge')

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
    outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        max_length=max_target_length,
        attention_mask=batch['attention_mask'].to('cuda'),
    )
    with tokenizer.as_target_tokenizer():
        outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

        labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
        actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()

  metrics = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

  0%|          | 0/32 [00:00<?, ?it/s]



{'rouge1': AggregateScore(low=Score(precision=0.9697815817270669, recall=0.9649854988936158, fmeasure=0.9671411094961017), mid=Score(precision=0.9727936207335566, recall=0.968095967381642, fmeasure=0.9700114273291586), high=Score(precision=0.9756581854340024, recall=0.9711486167475941, fmeasure=0.9728402048995053)),
 'rouge2': AggregateScore(low=Score(precision=0.9475699001744455, recall=0.9428342865191611, fmeasure=0.9449278929941518), mid=Score(precision=0.9525957043810401, recall=0.9480341669488486, fmeasure=0.9499068165659356), high=Score(precision=0.9567887374436481, recall=0.9527548819803037, fmeasure=0.9542870401165016)),
 'rougeL': AggregateScore(low=Score(precision=0.9667994457184388, recall=0.9619037732888849, fmeasure=0.9640355081020906), mid=Score(precision=0.9701182577607471, recall=0.9655068392826804, fmeasure=0.9674139893836349), high=Score(precision=0.9732413702777974, recall=0.9687075037777059, fmeasure=0.9705334693640532)),
 'rougeLsum': AggregateScore(low=Score(preci

In [None]:
[{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]

[{'rouge1': 0.9700114273291586},
 {'rouge2': 0.9499068165659356},
 {'rougeL': 0.9674139893836349},
 {'rougeLsum': 0.9674806461226331}]

## Generate text

In [None]:
sentence = "Ngay 5 2 ban to chuc cuoc thi Thu thach Vesuvius da trao giai thuong tri gia 700 000 USD cho 3 nha nghien cuu ve thanh tuu ung dung tri tue nhan tao AI de giai ma thong tin trong cuon giay 2 000 nam tuoi bi chay sem trong vu phun trao nui lua Vesuvius chon vui thanh pho La Ma co dai Pompeii"
text =  sentence + " </s>"
encoding = tokenizer(text, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=300,
    early_stopping=True
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

Ngày 5 2 ban tổ chức cuộc thi Thu hoạch Vesuvius đã trao giải thưởng trị giá 700 000 USD cho 3 nhà nghiên cứu về thành tựu ứng dụng trí tuệ nhân tạo AI để giải mã thông tin trong cuốn sách 2 000 năm tuổi bị cháy sém trong vụ cháy rừng núi Vesuvius chọn vui thành phố La Mã cổ đại Pompeii 


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

# 2nd

## Training

In [25]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(output_dir="tmp/",
                                      do_train=True,
                                      do_eval=True,
                                      num_train_epochs=20,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=8,
                                      per_device_eval_batch_size=8,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_total_limit=1,
                                      evaluation_strategy="epoch",
                                      fp16=True,
                                      )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    data_collator=data_collator,
    eval_dataset=tokenized_val_set
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2842,0.140286
2,0.0892,0.04988
3,0.0502,0.03145
4,0.0359,0.024392
5,0.0252,0.020294
6,0.0202,0.018651
7,0.0165,0.017259
8,0.0129,0.016916
9,0.0105,0.016079
10,0.0089,0.015936


## Evaluate

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/tmp/checkpoint-15000")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
metrics = load_metric('rouge')

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
    outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        max_length=max_target_length,
        attention_mask=batch['attention_mask'].to('cuda'),
    )
    with tokenizer.as_target_tokenizer():
        outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

        labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
        actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()

  metrics = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

  0%|          | 0/32 [00:00<?, ?it/s]



{'rouge1': AggregateScore(low=Score(precision=0.9697815817270669, recall=0.9649854988936158, fmeasure=0.9671411094961017), mid=Score(precision=0.9727936207335566, recall=0.968095967381642, fmeasure=0.9700114273291586), high=Score(precision=0.9756581854340024, recall=0.9711486167475941, fmeasure=0.9728402048995053)),
 'rouge2': AggregateScore(low=Score(precision=0.9475699001744455, recall=0.9428342865191611, fmeasure=0.9449278929941518), mid=Score(precision=0.9525957043810401, recall=0.9480341669488486, fmeasure=0.9499068165659356), high=Score(precision=0.9567887374436481, recall=0.9527548819803037, fmeasure=0.9542870401165016)),
 'rougeL': AggregateScore(low=Score(precision=0.9667994457184388, recall=0.9619037732888849, fmeasure=0.9640355081020906), mid=Score(precision=0.9701182577607471, recall=0.9655068392826804, fmeasure=0.9674139893836349), high=Score(precision=0.9732413702777974, recall=0.9687075037777059, fmeasure=0.9705334693640532)),
 'rougeLsum': AggregateScore(low=Score(preci

In [None]:
[{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]

[{'rouge1': 0.9700114273291586},
 {'rouge2': 0.9499068165659356},
 {'rougeL': 0.9674139893836349},
 {'rougeLsum': 0.9674806461226331}]

## Generate text

In [None]:
sentence = "Ngay 5 2 ban to chuc cuoc thi Thu thach Vesuvius da trao giai thuong tri gia 700 000 USD cho 3 nha nghien cuu ve thanh tuu ung dung tri tue nhan tao AI de giai ma thong tin trong cuon giay 2 000 nam tuoi bi chay sem trong vu phun trao nui lua Vesuvius chon vui thanh pho La Ma co dai Pompeii"
text =  sentence + " </s>"
encoding = tokenizer(text, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=300,
    early_stopping=True
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

Ngày 5 2 ban tổ chức cuộc thi Thu hoạch Vesuvius đã trao giải thưởng trị giá 700 000 USD cho 3 nhà nghiên cứu về thành tựu ứng dụng trí tuệ nhân tạo AI để giải mã thông tin trong cuốn sách 2 000 năm tuổi bị cháy sém trong vụ cháy rừng núi Vesuvius chọn vui thành phố La Mã cổ đại Pompeii 


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968