In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -U sentence-transformers
!pip install rouge_score
!pip install evaluate

In [1]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          AutoModelForSeq2SeqLM,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer)
from evaluate import load
import numpy as np

2024-11-25 13:39:36.725478: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
model = SentenceTransformer('sentence-transformers/paraphrase-distilroberta-base-v1')

## Prepare dataset

In [3]:
book_data = load_dataset("ubaada/booksum-complete-cleaned", "books")

train = book_data['train']
test = book_data['test']
validation = book_data['validation']

del book_data

In [4]:
def extract_summaries(dataset):
    prefix = "summarize: "
    documents = []
    summaries = []
    for sample in zip(dataset["text"], dataset["summary"]):
        for summary in sample[1]:
            text = summary['text']
            documents.append(prefix + sample[0])
            summaries.append(text)
    return {"text": documents, "summary": summaries}

In [18]:
train_ds = Dataset.from_dict(extract_summaries(train))
val_ds = Dataset.from_dict(extract_summaries(validation))
test_ds = Dataset.from_dict(extract_summaries(test))

del train, test, val

NameError: name 'val' is not defined

In [6]:
t5 = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(t5)

In [7]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["text"], max_length=1024, truncation=True)
    print(type(model_inputs))

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_validation = val_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

del train_ds, val_ds, test_ds

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


Map:   0%|          | 0/46 [00:00<?, ? examples/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5)

In [10]:
rouge = load("rouge")

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(t5)

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.evaluate()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'eval_loss': 5.7012248039245605,
 'eval_model_preparation_time': 0.0012,
 'eval_rouge1': 0.1057,
 'eval_rouge2': 0.017,
 'eval_rougeL': 0.0843,
 'eval_rougeLsum': 0.0839,
 'eval_gen_len': 19.0,
 'eval_runtime': 1.5605,
 'eval_samples_per_second': 28.838,
 'eval_steps_per_second': 3.204}

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.178144,0.0987,0.0123,0.0801,0.08,19.0
2,No log,3.915363,0.0973,0.0165,0.0781,0.0779,19.0
3,No log,3.838743,0.0965,0.0161,0.079,0.0788,19.0
4,No log,3.818701,0.1026,0.017,0.0829,0.0828,19.0




TrainOutput(global_step=128, training_loss=4.4499921798706055, metrics={'train_runtime': 44.7344, 'train_samples_per_second': 28.077, 'train_steps_per_second': 2.861, 'total_flos': 339978605297664.0, 'train_loss': 4.4499921798706055, 'epoch': 4.0})

In [23]:
trainer.evaluate()

{'eval_loss': 3.8187012672424316,
 'eval_rouge1': 0.1026,
 'eval_rouge2': 0.017,
 'eval_rougeL': 0.0829,
 'eval_rougeLsum': 0.0828,
 'eval_gen_len': 19.0,
 'eval_runtime': 1.346,
 'eval_samples_per_second': 33.432,
 'eval_steps_per_second': 3.715,
 'epoch': 4.0}

## Evaluate

In [25]:
text = tokenized_test['text'][0]

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5_model/checkpoint-128")
inputs = tokenizer(text, return_tensors="pt", truncation=True).input_ids

In [30]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5_model/checkpoint-128")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [31]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Fyodor Pavlovitch Karamazov was the third son of a landowner known in our district in his own day. he was married twice, and had three sons, the eldest, Dmitri, by his first wife, and two, Ivan and Alexey, by his second. he was married twice, and had three sons, the eldest, Dmitri, by his first wife, and two'