# Setup

In [None]:
%%capture
!pip install datasets evaluate transformers
!pip install rouge_score
!pip install accelerate

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_metric
import torch
import numpy as np

In [None]:
# Load the pretrained ViT5 tokenizer and seq2seq model from the same
# checkpoint name, then move the model onto the GPU.
PRETRAINED_NAME = "VietAI/vit5-base"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_NAME)
model.to('cuda')

# Preprocessing data

In [None]:
def preprocess_function(examples, max_length=300):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"inputs"`` list (source sentences)
            and a ``"labels"`` list (target sentences), in the form
            ``Dataset.map(..., batched=True)`` passes to its callback.
        max_length: Maximum number of tokens kept for both the source and the
            target; longer sequences are truncated.

    Returns:
        The tokenizer's encoding of the source sentences, with the token ids
        of the target sentences stored under ``"labels"``.
    """
    # Deliberately no padding here. Padding inside a batched ``map`` would
    # fill the labels with the pad token id, which the loss does not ignore,
    # and would make every example in a map-batch the same length (defeating
    # ``group_by_length``). Padding is left to the data collator, which pads
    # per training batch and uses the ignore index for labels.
    #
    # ``text_target`` tokenizes the targets in the same call and places their
    # ids under "labels"; it replaces the deprecated ``as_target_tokenizer()``.
    model_inputs = tokenizer(
        examples["inputs"],
        text_target=examples["labels"],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

In [None]:
def _read_split(csv_path):
    """Read one CSV split and cast every column to ``str``."""
    frame = pd.read_csv(csv_path)
    return frame.astype("str")


# Train / validation / test splits of the sentence-pair dataset.
train_set = _read_split("/kaggle/input/nonpunc-and-punc-vnese-sentences/train_set.csv")
val_set = _read_split("/kaggle/input/nonpunc-and-punc-vnese-sentences/val_set.csv")
test_set = _read_split("/kaggle/input/nonpunc-and-punc-vnese-sentences/test_set.csv")

In [None]:
def tokenize(data):
    """Turn a two-column DataFrame into a tokenized ``Dataset``.

    The first column of ``data`` is used as the ``'inputs'`` text and the
    second column as the ``'labels'`` text. The pairs are wrapped in a
    ``Dataset`` and passed through ``preprocess_function`` in batches; the raw
    ``'inputs'`` column is dropped from the result.
    """
    columns = {
        'inputs': data.iloc[:, 0].to_numpy(),
        'labels': data.iloc[:, 1].to_numpy(),
    }
    raw_dataset = Dataset.from_dict(columns)
    return raw_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=['inputs'],
        num_proc=10,
    )

In [None]:
# Tokenize the training split (runs preprocess_function over train_set).
tokenized_train_set = tokenize(train_set)

In [None]:
# Tokenize the validation split (runs preprocess_function over val_set).
tokenized_val_set = tokenize(val_set)

In [None]:
# Tokenize the test split (runs preprocess_function over test_set).
tokenized_test_set = tokenize(test_set)

In [None]:
# Print the number of examples in each tokenized split: train, val, test.
for split_dataset in (tokenized_train_set, tokenized_val_set, tokenized_test_set):
    print(len(split_dataset))

# 8th

## Training

In [None]:
# Checkpoints and logs of this run are written to the same directory.
RUN_DIR = "/kaggle/working/8th"

# Collator that assembles tokenized examples into PyTorch tensor batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments(
    # Output locations
    output_dir=RUN_DIR,
    logging_dir=RUN_DIR,
    # What to run
    do_train=True,
    do_eval=True,
    prediction_loss_only=True,
    # Optimisation
    num_train_epochs=2,
    learning_rate=1e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    fp16=True,
    # Batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    group_by_length=True,
    # Checkpointing and evaluation both happen every 500 steps;
    # only one checkpoint is kept on disk.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_val_set,
    data_collator=data_collator,
)

In [None]:
import os

import wandb

# SECURITY: the API key must not be committed in the notebook. It is read from
# the WANDB_API_KEY environment variable (e.g. populated from the platform's
# secret store). When the variable is unset, key=None lets wandb fall back to
# its own credential lookup / login prompt.
# NOTE(review): the key that was previously hard-coded in this cell should be
# treated as leaked and revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
!zip -r 7th.zip /kaggle/working/5th

## Evaluate

In [None]:
import os
import re

# Checkpoint to evaluate. The hard-coded path is used when it exists (e.g. an
# earlier run's outputs were restored into the working directory).
checkpoint_path = "./5th/checkpoint-16000"
if not os.path.isdir(checkpoint_path):
    # On a fresh kernel that directory is absent, so fall back to the newest
    # "checkpoint-<step>" directory written by this notebook's training run.
    run_dir = training_args.output_dir
    found = []
    if os.path.isdir(run_dir):
        for entry in os.listdir(run_dir):
            match = re.fullmatch(r"checkpoint-(\d+)", entry)
            entry_path = os.path.join(run_dir, entry)
            if match and os.path.isdir(entry_path):
                found.append((int(match.group(1)), entry_path))
    if not found:
        raise FileNotFoundError(
            f"No checkpoint found at {checkpoint_path!r} or under {run_dir!r}"
        )
    # Highest step number == most recent checkpoint.
    checkpoint_path = max(found)[1]

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
model.to('cuda')

In [None]:
# Score the model on the test split with ROUGE: generate a prediction for every
# test example, decode predictions and references back to text, then compute
# the metric once over the full lists.
metrics = load_metric('rouge')

max_target_length = 300
dataloader = DataLoader(tokenized_test_set, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for batch in tqdm(dataloader):
    # Move inputs to whatever device the model lives on.
    generated = model.generate(
        input_ids=batch['input_ids'].to(model.device),
        attention_mask=batch['attention_mask'].to(model.device),
        max_length=max_target_length,
    )
    # Label positions equal to -100 (the loss ignore index used for label
    # padding) are not valid token ids; replace them with the pad token id so
    # the sequence can be decoded and the padding skipped as a special token.
    label_ids = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)

    # Decoding is done directly; the deprecated as_target_tokenizer() context
    # is not needed to decode with this T5-style tokenizer.
    predictions.extend(
        tokenizer.batch_decode(generated, clean_up_tokenization_spaces=False, skip_special_tokens=True)
    )
    references.extend(
        tokenizer.batch_decode(label_ids, clean_up_tokenization_spaces=False, skip_special_tokens=True)
    )

# Feed the examples to the metric exactly once. Calling add_batch() per batch
# and then passing the same predictions to compute() would register every
# example twice.
rouge_scores = metrics.compute(predictions=predictions, references=references)
[{k: v.mid.fmeasure} for k, v in rouge_scores.items()]

## Generate text

In [None]:
# Interactive demo: read one sentence from the user, run it through the model
# and print the generated text.
sentence = input()

# The sentence is tokenized exactly as the training inputs were (see
# preprocess_function): no literal " </s>" is appended. The tokenizer adds the
# end-of-sequence token itself, so appending it by hand can yield a doubled
# EOS that the model never saw during fine-tuning.
encoding = tokenizer(sentence, return_tensors="pt")
input_ids = encoding["input_ids"].to(model.device)
attention_masks = encoding["attention_mask"].to(model.device)

outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_masks,
    max_length=300,
    early_stopping=True,
)
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)