In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# We start with a simple example: summarizing text with the T5-small model.

We are using the T5-small model for text summarization.

We use pre-defined models and tokenizers.

In [None]:
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
model_name = "t5-small"

In [None]:
# Initialize the tokenizer that processes text and generates inputs for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pre-trained sequence-to-sequence model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Sample text to summarize.
# T5 needs the task to be named at the beginning of the input text.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

# Encode the prompt as PyTorch tensors and keep only the token ids.
inputs = tokenizer(text, return_tensors="pt")["input_ids"]

# Deterministic decoding (no sampling), capped at 100 newly generated tokens.
outputs = model.generate(inputs, do_sample=False, max_new_tokens=100)

# Turn the first generated sequence back into text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

# Training

We use the BillSum dataset for this part. More details about the dataset can be found at https://huggingface.co/datasets/billsum

The smaller google/t5-efficient-tiny model can be used for training when processing power is limited (the code below loads t5-small). More details about the model can be found at https://huggingface.co/google/t5-efficient-tiny

In [None]:
# Checkpoint identifier for the fine-tuning section; passed to from_pretrained() below.
model_name = "t5-small"

In [None]:
# Initialize the tokenizer that processes text and generates inputs for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pre-trained sequence-to-sequence model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Load the "ca_test" split of billsum and hold out 20% of it as a test split.
dataset = load_dataset("billsum", split="ca_test").train_test_split(test_size=0.2)

In [None]:
# Show the source document of the first training example.
dataset["train"][0]['text']

In [None]:
# Show the reference summary of the first training example.
dataset["train"][0]['summary']

The preprocessing function you want to create needs to:


*   Prefix the input sequence so the model knows this is a summarization task. Some models are capable of multiple NLP tasks and require prompts to specify the task.
*   Use the *text_target* keyword argument when tokenizing labels.
*   Truncate sequences to the *max_length* supported by the model.



In [None]:
# Score the current model on the held-out split with ROUGE using the evaluate package.
# The evaluator is bound to a descriptive name so the built-in eval() is not shadowed.
summarization_evaluator = evaluate.SummarizationEvaluator(task="summarization")
summarization_evaluator.compute(
    model_or_pipeline=model,
    data=dataset["test"],
    metric=evaluate.load("rouge"),
    label_column="summary",
    tokenizer=tokenizer,
)

In [None]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: Batch with a "text" column (source documents) and a
            "summary" column (target summaries).

    Returns:
        The tokenizer output for the prefixed documents, with the token ids
        of the tokenized summaries stored under "labels".
    """
    # Prepend the task prefix so the model knows this is a summarization task.
    prompts = [prefix + document for document in examples["text"]]
    batch = tokenizer(prompts, truncation=True, max_length=1024)

    # Summaries are tokenized as targets and attached as the training labels.
    targets = tokenizer(text_target=examples["summary"], truncation=True, max_length=128)
    batch["labels"] = targets["input_ids"]
    return batch

The *map* function can be used to apply the preprocessing function to the entire dataset. Use the *batched=True* argument to speed up the process by handling multiple examples per call.

In [None]:
# Apply preprocess_function to every split; batched=True passes a batch of examples per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Collator that batches the tokenized examples for seq2seq training.
# `model` must be the model instance (not the checkpoint name string): the
# collator uses it to prepare decoder_input_ids from the labels, and a plain
# string silently disables that step.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Load the ROUGE metric used by compute_metrics.
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays supplied
            by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        dict: The ROUGE scores plus ``gen_len`` (mean number of non-padding
        tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so it must be
    # replaced before decoding -- in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pre-trained checkpoint again; this instance is the one handed to the trainer below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



1.   Define your training hyperparameters in Seq2SeqTrainingArguments. The only required parameter is output_dir which specifies where to save your model. You’ll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the Trainer will evaluate the ROUGE metric and save the training checkpoint.
2.   Pass the training arguments to Seq2SeqTrainer along with the model, dataset, tokenizer, data collator, and compute_metrics function.
3. Call train() to finetune the model


In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output / checkpointing.
    output_dir="finetuned_model",
    save_total_limit=3,
    push_to_hub=False,
    # Optimisation.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation: run once per epoch, producing predictions with generate().
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Start fine-tuning.
trainer.train()

## You can run the evaluation directly using the evaluate package.

You can use SummarizationEvaluator to run the evaluation on the data.

Documentation is available at https://huggingface.co/docs/evaluate/v0.4.0/en/package_reference/evaluator_classes#evaluate.SummarizationEvaluator

# Exercise

Apply the T5 model used above to a medical summary generation task. The data has been provided as CSV files. Documentation for loading CSV files as a dataset can be found at https://huggingface.co/docs/datasets/loading.

For the exercise, first try running inference with the pre-trained models, since fine-tuning will take a significant amount of time.

Write the code to compute BLEU score along with Rouge score for evaluation of the model.

You are expected to submit the codes for the TS's in a zip file. You also need to present your work on the two exercises in a short 5min presentation during the last TP.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split



model_name = "t5-small"

# Tokenizer that processes text and generates inputs for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pre-trained sequence-to-sequence model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training CSV: take its "train" split and hold out 20% of the rows as a test split.
dataset = load_dataset("csv", data_files="TaskA-TrainingSet.csv")["train"].train_test_split(test_size=0.2)

# Validation CSV, loaded separately.
dataset_val = load_dataset("csv", data_files="TaskA-ValidationSet.csv")




In [None]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of dialogues together with their reference section texts.

    Args:
        examples: Batch with a "dialogue" column (source text) and a
            "section_text" column (target summaries).

    Returns:
        The tokenizer output for the prefixed dialogues, with the token ids
        of the tokenized section texts stored under "labels".
    """
    # Prepend the task prefix so the model knows this is a summarization task.
    prompts = [prefix + dialogue for dialogue in examples["dialogue"]]
    batch = tokenizer(prompts, truncation=True, max_length=1024)

    # Section texts are tokenized as targets and attached as the training labels.
    targets = tokenizer(text_target=examples["section_text"], truncation=True, max_length=128)
    batch["labels"] = targets["input_ids"]
    return batch

In [None]:
# Apply preprocess_function to every split; batched=True passes a batch of examples per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Collator that batches the tokenized examples for seq2seq training.
# `model` must be the model instance (not the checkpoint name string): the
# collator uses it to prepare decoder_input_ids from the labels, and a plain
# string silently disables that step.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Load the ROUGE and BLEU metrics used by compute_metrics.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE, BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays supplied
            by the trainer. ``predictions`` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        dict: The ROUGE scores plus ``bleu`` and ``gen_len`` (mean number of
        non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so it must be
    # replaced before decoding -- in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result_bleu = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    # Only the corpus-level BLEU score is reported alongside the ROUGE scores.
    result["bleu"] = result_bleu["bleu"]

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output / checkpointing.
    output_dir="finetuned_model",
    save_total_limit=3,
    push_to_hub=False,
    # Optimisation.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation: run once per epoch, producing predictions with generate().
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Start fine-tuning.
trainer.train()

In [None]:
# Summarize one held-out dialogue with the fine-tuned model.
# The dialogue is bound to a descriptive name so the built-in input() is not shadowed.
dialogue = dataset["test"][0]["dialogue"]

# Build the prompt the same way preprocess_function does during fine-tuning:
# task prefix first, truncated to the same maximum input length.
inputs = tokenizer(
    prefix + dialogue,
    max_length=1024,
    truncation=True,
    return_tensors="pt",
).input_ids

# Follow the model's device instead of hard-coding 'cuda', so this also runs on CPU-only machines.
inputs = inputs.to(model.device)
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
print(outputs)
tokenizer.decode(outputs[0], skip_special_tokens=True)