# Setup

In [1]:
%%capture
!pip install datasets evaluate transformers
!pip install rouge_score
!pip install accelerate
!pip install sacrebleu

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_metric
import torch
import numpy as np

2024-03-18 13:22:29.930772: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-18 13:22:29.930909: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-18 13:22:30.156618: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/vit5-modell")
# model.to('cuda')

# Preprocessing data

In [5]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], max_length=300, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["labels"], max_length=300, truncation=True
        )
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [6]:
train_set = pd.read_csv("/kaggle/input/sport-vmese-sentences/train_set2.csv").astype("str")
val_set = pd.read_csv("/kaggle/input/sport-vmese-sentences/val_set2.csv").astype("str")
test_set = pd.read_csv("/kaggle/input/sport-vmese-sentences/test_set2.csv").astype("str")

In [7]:
def tokenize(data):
    input_lines = data.iloc[:, 0].to_numpy()
    label_lines = data.iloc[:, 1].to_numpy()
    dict_obj = {'inputs': input_lines, 'labels': label_lines}
    dataset = Dataset.from_dict(dict_obj)
    tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)

    return tokenized_datasets

In [None]:
tokenized_train_set = tokenize(train_set)

In [None]:
tokenized_val_set = tokenize(val_set)

In [None]:
tokenized_test_set = tokenize(test_set)

In [12]:
print(len(tokenized_train_set))
print(len(tokenized_val_set))
print(len(tokenized_test_set))

153218

6542

24881


# 5th

## Training

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/5th",
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    learning_rate=5e-4,
    warmup_ratio=0.01,
    weight_decay=0.05,
    prediction_loss_only=True,
    per_device_train_batch_size=9,
    per_device_eval_batch_size=4,
    logging_dir='/kaggle/working/5th',
    group_by_length=True,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    data_collator=data_collator,
    eval_dataset=tokenized_val_set
)

In [14]:
import wandb
wandb.login(key="acc398b6c572515cdbf0044bc49909a609a7ba19")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhuynhduykhoi619[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,0.1736,0.333745
1000,0.2858,0.249776
1500,0.2448,0.188853
2000,0.224,0.167762
2500,0.2075,0.149103
3000,0.186,0.121141
3500,0.1723,0.120915
4000,0.162,0.106918
4500,0.1506,0.10139
5000,0.147,0.108727


TrainOutput(global_step=34052, training_loss=0.07988905804453846, metrics={'train_runtime': 25402.709, 'train_samples_per_second': 24.126, 'train_steps_per_second': 1.34, 'total_flos': 3.242245951807488e+16, 'train_loss': 0.07988905804453846, 'epoch': 4.0})

In [17]:
!zip -r 5th.zip /kaggle/working/5th

updating: kaggle/working/5th/ (stored 0%)

updating: kaggle/working/5th/checkpoint-34000/ (stored 0%)

updating: kaggle/working/5th/checkpoint-34000/model.safetensors (deflated 7%)

updating: kaggle/working/5th/checkpoint-34000/trainer_state.json (deflated 84%)

updating: kaggle/working/5th/checkpoint-34000/optimizer.pt (deflated 12%)

updating: kaggle/working/5th/checkpoint-34000/rng_state.pth (deflated 25%)

updating: kaggle/working/5th/checkpoint-34000/config.json (deflated 48%)

updating: kaggle/working/5th/checkpoint-34000/scheduler.pt (deflated 55%)

updating: kaggle/working/5th/checkpoint-34000/generation_config.json (deflated 29%)

updating: kaggle/working/5th/checkpoint-34000/training_args.bin (deflated 51%)

updating: kaggle/working/5th/events.out.tfevents.1710401103.3ea00fe8b7ab.34.0 (deflated 69%)


## Evaluate

In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/5th/checkpoint-34000")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [10]:
metrics = load_metric('sacrebleu')

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=64)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
    outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        max_length=max_target_length,
        attention_mask=batch['attention_mask'].to('cuda'),
    )
    with tokenizer.as_target_tokenizer():
        outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

        labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
        actuals = [[tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True)] for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)
    
print(f"Bleu score: {metrics.compute()['score']}")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

  0%|          | 0/389 [00:00<?, ?it/s]

Bleu score: 96.87101691308675


## Generate text

In [20]:
from unidecode import unidecode

sentence = unidecode(input())
print(sentence)
encoding = tokenizer(sentence, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=1024,
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

 Sinner khởi đầu ván 1 với điểm số đầu tiên. Tuy nhiên, Medvedev đã nhanh chóng rút ngắn cách biệt và vươn lên dẫn trước 3-1. Ở những loạt bóng bền, tay vợt hạng 3 thế giới vẫn chứng tỏ sự lì lợm của mình. Trong khi đó, Sinner cũng không chịu khuất phục khi bám đuổi tỉ số từ 2-4 thành 3-4.


Sinner khoi dau van 1 voi diem so dau tien. Tuy nhien, Medvedev da nhanh chong rut ngan cach biet va vuon len dan truoc 3-1. O nhung loat bong ben, tay vot hang 3 the gioi van chung to su li lom cua minh. Trong khi do, Sinner cung khong chiu khuat phuc khi bam duoi ti so tu 2-4 thanh 3-4.
Sinner khởi đầu ván 1 với điểm số đầu tiên. Tuy nhiên, Medvedev đã nhanh chóng rút ngắn cách biệt và vươn lên dẫn trước 3-1. Ở những loạt bóng bền, tay vợt hạng 3 thế giới vẫn chứng tỏ sự lì lợm của mình. Trong khi đó, Sinner cũng không chịu khuất phục khi bám đuổi tỉ số từ 2-4 thành 3-4.
