# Setup

In [1]:
%%capture
# Install third-party packages (no version pins, so the resolved versions float
# with the run date); `%%capture` hides pip's console output.
# NOTE(review): `wandb` and `unidecode` are imported further down but are not
# installed here -- presumably preinstalled in the runtime image; confirm.
!pip install datasets evaluate transformers
!pip install rouge_score
!pip install accelerate
!pip install sacrebleu

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_metric
import torch
import numpy as np

In [None]:
# Tokenizer comes from the hub; the starting weights come from a local dataset
# directory.
# NOTE(review): presumably the local weights share the hub tokenizer's
# vocabulary -- confirm.
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")

# Load the weights and move them to the GPU in one step.
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/vit5-modell").to('cuda')
model

# Preprocessing data

In [5]:
def preprocess_function(examples, max_input_length=300, max_target_length=300):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"inputs"`` entry (source texts) and a
            ``"labels"`` entry (target texts).
        max_input_length: Token limit for the source side; longer sequences
            are truncated. Defaults to 300, the value previously hard-coded.
        max_target_length: Token limit for the target side; longer sequences
            are truncated. Defaults to 300, the value previously hard-coded.

    Returns:
        The tokenizer output for the sources, with an added ``"labels"`` entry
        holding the token ids of the targets.
    """
    model_inputs = tokenizer(
        examples["inputs"], max_length=max_input_length, truncation=True
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["labels"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
# Read the three sentence-pair splits, casting every column to string.
train_set, val_set, test_set = (
    pd.read_csv(csv_path).astype("str")
    for csv_path in (
        "/kaggle/input/sport-vmese-sentences/train_set2.csv",
        "/kaggle/input/sport-vmese-sentences/val_set2.csv",
        "/kaggle/input/sport-vmese-sentences/test_set2.csv",
    )
)

In [7]:
def tokenize(data, num_proc=10):
    """Turn a two-column DataFrame of sentence pairs into a tokenized ``Dataset``.

    Args:
        data: DataFrame whose first column is used as the source texts and
            whose second column is used as the target texts. Columns are
            selected by position, so their names are ignored.
        num_proc: Number of worker processes passed to ``Dataset.map``.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The dataset produced by mapping ``preprocess_function`` over the pairs
        in batches, with the raw ``inputs`` text column removed.
    """
    input_lines = data.iloc[:, 0].to_numpy()
    label_lines = data.iloc[:, 1].to_numpy()
    dataset = Dataset.from_dict({'inputs': input_lines, 'labels': label_lines})

    # Only `inputs` is dropped explicitly; `preprocess_function` returns its own
    # `labels` entry (token ids) for the mapped dataset.
    return dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=['inputs'],
        num_proc=num_proc,
    )

In [None]:
# Tokenize the training split (source text -> input_ids, target text -> labels).
tokenized_train_set = tokenize(train_set)

In [None]:
# Tokenize the validation split (source text -> input_ids, target text -> labels).
tokenized_val_set = tokenize(val_set)

In [None]:
# Tokenize the test split (source text -> input_ids, target text -> labels).
tokenized_test_set = tokenize(test_set)

In [10]:
# Sanity check: number of examples in the train, validation and test splits.
for tokenized_split in (tokenized_train_set, tokenized_val_set, tokenized_test_set):
    print(len(tokenized_split))

153218

6542

24881


# 5th run (outputs under /kaggle/working/5th)

## Training

In [11]:
# Batch collation for the seq2seq model, returning PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir="/kaggle/working/5th",
    logging_dir='/kaggle/working/5th',
    # --- run mode ---
    do_train=True,
    do_eval=True,
    prediction_loss_only=True,
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=5e-4,
    warmup_ratio=0.01,
    weight_decay=0.05,
    fp16=True,
    # --- batching ---
    per_device_train_batch_size=9,
    per_device_eval_batch_size=4,
    group_by_length=True,
    # --- evaluate and checkpoint on the same 500-step cadence ---
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; adjust if the installed version rejects this keyword.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_val_set,
)

In [12]:
import os
from getpass import getpass

import wandb

# Credentials must not live in the notebook. The API key is read from the
# WANDB_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and revoke it in the W&B account settings.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Run fine-tuning with the trainer configured above; the value returned by
# `train()` is displayed as the cell output.
trainer.train()

In [None]:
# Recursively archive the run's output directory into 5th.zip in the current
# working directory.
!zip -r 5th.zip /kaggle/working/5th

## Evaluate

In [9]:
# Replace the in-memory model with the weights stored in the `checkpoint-34000`
# directory under the training output folder, and place them on the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/5th/checkpoint-34000").to('cuda')
model

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [11]:
# Score the model on the tokenized test split with SacreBLEU.
# NOTE(review): `datasets.load_metric` is deprecated in favour of
# `evaluate.load("sacrebleu")`; the `evaluate` package is already pip-installed
# by this notebook, so switch over once it is imported.
metrics = load_metric('sacrebleu')

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=64)

# Send each batch to whichever device the model is on rather than assuming 'cuda'.
device = model.device

predictions = []
references = []
for batch in tqdm(dataloader):
    generated = model.generate(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        max_length=max_target_length,
    )
    # Decoding needs no target-tokenizer context, so the deprecated
    # `as_target_tokenizer()` wrapper is dropped and the batch is decoded at once.
    outputs = tokenizer.batch_decode(
        generated, clean_up_tokenization_spaces=False, skip_special_tokens=True
    )

    # Label positions equal to -100 are not real token ids; map them to the pad
    # id so they decode cleanly and are dropped with the other special tokens.
    labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels, clean_up_tokenization_spaces=False, skip_special_tokens=True
    )
    # The metric takes a list of references per prediction, hence the nesting.
    actuals = [[reference] for reference in decoded_labels]

    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)

print(f"Bleu score: {metrics.compute()['score']}")

  0%|          | 0/389 [00:00<?, ?it/s]



Bleu score: 96.87101691308675


## Generate text

In [12]:
from unidecode import unidecode

# Read a sentence from the user, transliterate it to plain ASCII (removing the
# diacritics) and echo the stripped form that the model will receive.
sentence = unidecode(input())
print(sentence)

encoding = tokenizer(sentence, return_tensors="pt")
input_ids = encoding["input_ids"].to("cuda")
attention_masks = encoding["attention_mask"].to("cuda")

outputs = model.generate(
    max_length=1024,
    input_ids=input_ids,
    attention_mask=attention_masks,
)

# Decode every generated sequence and print one per line.
restored_lines = [
    tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for output in outputs
]
print("\n".join(restored_lines))

 Thích thú đứng theo dõi cả buổi, ông Adam Frank, một kỹ thuật viên về hưu, chia sẻ: "Con trai tôi tập một chút jujitsu. Tôi không biết nhiều về võ thuật, nhưng xem họ biểu diễn rất đẹp. Tôi nghĩ võ thuật là nền tảng tốt cho trẻ em".


Thich thu dung theo doi ca buoi, ong Adam Frank, mot ky thuat vien ve huu, chia se: "Con trai toi tap mot chut jujitsu. Toi khong biet nhieu ve vo thuat, nhung xem ho bieu dien rat dep. Toi nghi vo thuat la nen tang tot cho tre em".
Thích thú đứng theo dõi cả buổi, ông Adam Frank, một kỹ thuật viên về hưu, chia sẻ: "Con trai tôi tập một chút jujitsu. Tôi không biết nhiều về võ thuật, nhưng xem họ biểu diễn rất đẹp. Tôi nghĩ võ thuật là nền tảng tốt cho trẻ em".
