In [None]:
import pandas as pd
import evaluate

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from google.colab import drive
from transformers import EncoderDecoderModel, BertTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Make Google Drive reachable from the Colab VM; the final cell saves the model under it.
drive.mount('/content/drive')

# Build the encoder-decoder model with the same BERT checkpoint on both sides,
# and load the matching tokenizer.
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="bert-base-uncased",
    decoder_pretrained_model_name_or_path="bert-base-uncased",
)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Reuse the tokenizer's CLS and SEP tokens as the BOS and EOS markers.
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

# All model-config overrides gathered in one place and applied in a single pass.
# NOTE(review): min_length=56 forces every generated answer to be at least 56
# tokens long, while target answers are truncated to 128 — confirm this suits the data.
_config_overrides = {
    # Special token ids the decoder needs for training and generation.
    "decoder_start_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    # Mirror the decoder's vocabulary size on the top-level config.
    "vocab_size": bert2bert.config.decoder.vocab_size,
    # Generation (beam search) settings.
    "max_length": 142,
    "min_length": 56,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for _option, _value in _config_overrides.items():
    setattr(bert2bert.config, _option, _value)

In [None]:
# Load the question/context/answer sheet and normalise the column names.
# Rows missing any of the three fields are dropped and the remaining values
# are coerced to str: an empty or numeric Excel cell would otherwise put
# NaN / non-text values into `input` or `answer`, which the tokenizer rejects.
_text_columns = ["question", "context", "answer"]
data = (
    pd.read_excel("qca.xlsx")
    .rename(columns={"Question": "question", "Context": "context", "Answer": "answer"})
    .dropna(subset=_text_columns)
    .astype({column: str for column in _text_columns})
    .reset_index(drop=True)
)
# Single source string per example: "question: ... context: ...".
data["input"] = "question: " + data["question"] + " context: " + data["context"]

# 80/20 train/validation split with a fixed seed for reproducibility.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra column in the datasets.
train_dataset = Dataset.from_pandas(train_data[["input", "answer"]], preserve_index=False)
val_dataset = Dataset.from_pandas(val_data[["input", "answer"]], preserve_index=False)


In [None]:

batch_size = 4


def tokenize_function(batch):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        batch: Mapping with an "input" list (source strings) and an "answer"
            list (target strings), as passed by ``Dataset.map(batched=True)``.

    Returns:
        The same mapping with ``input_ids`` and ``attention_mask`` (sources,
        padded/truncated to 512 tokens) and ``labels`` (targets, padded/truncated
        to 128 tokens, with padding positions set to -100) added.

    Only ``labels`` is provided for the decoder side. With recent transformers
    releases the encoder-decoder model derives its decoder inputs by shifting
    the labels right; passing the unshifted target ids as ``decoder_input_ids``
    as well would show the decoder the very token it is asked to predict.
    """
    inputs = tokenizer(batch["input"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(batch["answer"], max_length=128, truncation=True, padding="max_length")

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    # -100 marks padding positions so they are ignored by the loss.
    batch["labels"] = [
        [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in ids]
        for ids in targets.input_ids
    ]

    return batch


train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=batch_size)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=batch_size)

# Expose only the tensors the model consumes.
_model_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=_model_columns)
val_dataset.set_format(type="torch", columns=_model_columns)


In [None]:

# ROUGE scorer, loaded once here and reused by compute_metrics on every evaluation pass.
rouge = evaluate.load(path="rouge")

def compute_metrics(pred):
    """Score generated answers against the reference answers with ROUGE-2.

    Args:
        pred: Prediction object supplied by the trainer. ``pred.predictions``
            holds the generated token ids and ``pred.label_ids`` the reference
            token ids, with -100 at positions that were masked out of the loss.

    Returns:
        A dict of metrics rounded to 4 decimals. ``rouge2_fmeasure`` is always
        present; ``rouge2_precision`` and ``rouge2_recall`` are included when
        the loaded ROUGE implementation returns an aggregate score exposing
        ``.mid`` (older metric API). Newer ``evaluate`` releases return the
        F-measure as a plain float, which is handled as well.
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Defensive: keep only the first element if predictions arrive as a tuple.
        pred_ids = pred_ids[0]

    # Work on copies so the arrays owned by the trainer are left untouched.
    pred_ids = pred_ids.copy()
    labels_ids = pred.label_ids.copy()

    # -100 is not a real token id; map it to the pad id so decoding skips it.
    # Predictions are treated the same way in case they were padded with -100.
    pred_ids[pred_ids == -100] = pad_id
    labels_ids[labels_ids == -100] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge2 = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"]

    mid = getattr(rouge2, "mid", None)
    if mid is not None:
        # Aggregate score object: precision / recall / F-measure are all available.
        return {
            "rouge2_precision": round(mid.precision, 4),
            "rouge2_recall": round(mid.recall, 4),
            "rouge2_fmeasure": round(mid.fmeasure, 4),
        }

    # Plain float: only the F-measure is reported by the metric.
    return {"rouge2_fmeasure": round(float(rouge2), 4)}

In [None]:
# Short fine-tuning run: 16 optimisation steps in total, one checkpoint at the end.
# NOTE(review): `eval_steps` is set but no step-based evaluation strategy is
# configured in this cell — confirm whether periodic evaluation is actually expected.
training_args = Seq2SeqTrainingArguments(
    # Checkpoint location and retention.
    output_dir="/content/bertModel",
    overwrite_output_dir=True,
    save_steps=16,
    save_total_limit=3,
    # Batch sizes (shared with the tokenization cell).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Schedule.
    max_steps=16,
    warmup_steps=1,
    # Logging and evaluation.
    logging_steps=2,
    eval_steps=4,
    do_train=True,
    do_eval=True,
    # Evaluate by generating sequences so compute_metrics receives token ids.
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side on Google Drive.
model_save_path = "/content/drive/MyDrive/BERT2BERT_FineTuned_Model2"
# The trained model object is `bert2bert` (the instance handed to the trainer);
# no variable named `model` is defined anywhere in this notebook.
bert2bert.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)