# Setup

In [1]:
%%capture
!pip install datasets evaluate transformers
!pip install rouge_score
!pip install accelerate
!pip install sacrebleu

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer,Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_metric
import torch
import numpy as np

In [None]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/vit5-modell")
model.to('cuda')

# Preprocessing data

In [6]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], max_length=300, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["labels"], max_length=300, truncation=True
        )
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [7]:
train_set = pd.read_csv("/kaggle/input/temp-data/train_set2.csv").astype("str")
val_set = pd.read_csv("/kaggle/input/temp-data/val_set2.csv").astype("str")
test_set = pd.read_csv("/kaggle/input/temp-data/test_set2.csv").astype("str")

In [8]:
def tokenize(data):
    input_lines = data.iloc[:, 0].to_numpy()
    label_lines = data.iloc[:, 1].to_numpy()
    dict_obj = {'inputs': input_lines, 'labels': label_lines}
    dataset = Dataset.from_dict(dict_obj)
    tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)

    return tokenized_datasets

In [None]:
tokenized_train_set = tokenize(train_set)

In [None]:
tokenized_val_set = tokenize(val_set)

In [None]:
tokenized_test_set = tokenize(test_set)

In [12]:
print(len(tokenized_train_set))
print(len(tokenized_val_set))
print(len(tokenized_test_set))

153218
6542
24881


# 3th

## Training

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/3th",
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    learning_rate=1e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    prediction_loss_only=True,
    per_device_train_batch_size=9,
    per_device_eval_batch_size=4,
    logging_dir='/kaggle/working/3th',
    group_by_length=True,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    data_collator=data_collator,
    eval_dataset=tokenized_val_set
)

In [12]:
import wandb
wandb.login(key="acc398b6c572515cdbf0044bc49909a609a7ba19")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhuynhduykhoi619[0m. Use [1m`wandb login --relogin`[0m to force relogin





Step,Training Loss,Validation Loss
500,0.2741,0.038768
1000,0.0414,0.025893
1500,0.0321,0.023959
2000,0.0291,0.022091
2500,0.0289,0.021429
3000,0.0262,0.021141
3500,0.0247,0.019187
4000,0.0242,0.01971
4500,0.0226,0.019149
5000,0.0241,0.019564









































































































TrainOutput(global_step=25539, training_loss=0.02352539929802544, metrics={'train_runtime': 18310.823, 'train_samples_per_second': 25.103, 'train_steps_per_second': 1.395, 'total_flos': 2.431569332754432e+16, 'train_loss': 0.02352539929802544, 'epoch': 3.0})

In [14]:
!zip -r 3th.zip /kaggle/working/3th

  adding: kaggle/working/3rd/ (stored 0%)

  adding: kaggle/working/3rd/checkpoint-25500/ (stored 0%)

  adding: kaggle/working/3rd/checkpoint-25500/rng_state.pth (deflated 25%)

  adding: kaggle/working/3rd/checkpoint-25500/generation_config.json (deflated 29%)

  adding: kaggle/working/3rd/checkpoint-25500/scheduler.pt (deflated 55%)

  adding: kaggle/working/3rd/checkpoint-25500/config.json (deflated 48%)

  adding: kaggle/working/3rd/checkpoint-25500/optimizer.pt (deflated 9%)

  adding: kaggle/working/3rd/checkpoint-25500/trainer_state.json (deflated 83%)

  adding: kaggle/working/3rd/checkpoint-25500/training_args.bin (deflated 51%)

  adding: kaggle/working/3rd/checkpoint-25500/model.safetensors (deflated 7%)

  adding: kaggle/working/3rd/events.out.tfevents.1708473449.744285104d44.34.0 (deflated 68%)


## Evaluate

In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/3th/checkpoint-25500")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
metrics = load_metric('sacrebleu')

max_target_length = 300
dataloader = torch.utils.data.DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=64)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
    outputs = model.generate(
        input_ids=batch['input_ids'].to('cuda'),
        max_length=max_target_length,
        attention_mask=batch['attention_mask'].to('cuda'),
    )
    with tokenizer.as_target_tokenizer():
        outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

        labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
        actuals = [[tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True)] for out in labels]
    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)
    
print(f"Bleu score: {metrics.compute()['score']}")

  0%|          | 0/389 [00:00<?, ?it/s]

Bleu score: 96.78402904234343


## Generate text

In [19]:
from unidecode import unidecode

sentence = unidecode(input())
print(sentence)
encoding = tokenizer(sentence, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=1024,
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

 Nho so huu dan sao nay nen Hoop Dreams la ung vien hang dau cho chuc vo dich. Michael Soy - cau thu goc Viet cua Cantho Catfish tai VBA 2023 - cho biet: "Co nhieu doi bong manh den tu cac nuoc. Tuy nhien, Hoop Dreams co loi the khi so huu dan cau thu da quen thuoc nhau khi cung choi bong ro o Viet Nam. Kentrell Barkley la mot tay "sat thu" du moi lan dau thu suc voi bong ro 3x3"


Nho so huu dan sao nay nen Hoop Dreams la ung vien hang dau cho chuc vo dich. Michael Soy - cau thu goc Viet cua Cantho Catfish tai VBA 2023 - cho biet: "Co nhieu doi bong manh den tu cac nuoc. Tuy nhien, Hoop Dreams co loi the khi so huu dan cau thu da quen thuoc nhau khi cung choi bong ro o Viet Nam. Kentrell Barkley la mot tay "sat thu" du moi lan dau thu suc voi bong ro 3x3"
Nhờ sở hữu dàn sao này nên Hoop Dreams là ứng viên hàng đầu cho chức vô địch. Michael Soy - cầu thủ gốc Việt của Cantho Catfish tại VBA 2023 - cho biết: "Có nhiều đội bóng mạnh đến từ các nước. Tuy nhiên, Hoop Dreams có lợi thế khi sở hữu dàn cầu thủ đã quen thuộc nhau khi cùng chơi bóng rổ ở Việt Nam. Kentrell Barkley là một tay "sát thủ" dù mới lần đầu thử sức với bóng rổ 3x3"
