<a href="https://colab.research.google.com/github/Shakib-IO/hf_nlp/blob/main/Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/learn/nlp-course/chapter7/5?fw=pt#summarization

In [4]:
!pip install transformers
!pip install datasets
!pip install evaluate

FiscalNote/billsum Dataset

In [6]:
from datasets import load_dataset
# Load the BillSum dataset by its Hub identifier; the result is keyed by split.
ds = load_dataset("FiscalNote/billsum")
# Bare expression so the notebook displays the splits and their columns.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [8]:
def show_samples(dataset, num_samples=2, seed=42):
    """Print the title and text of a few shuffled training examples.

    Args:
        dataset: Mapping with a "train" entry that supports
            ``shuffle(seed=...)`` and ``select(indices)``.
        num_samples: How many examples to print.
        seed: Seed passed to ``shuffle`` so the same examples are shown each run.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    chosen = shuffled_train.select(range(num_samples))
    for record in chosen:
        # Title first, then the full text, each preceded by a blank line.
        for heading, column in (("Title", "title"), ("Text", "text")):
            print(f"\n'>> {heading}: {record[column]}'")

# Preview training examples using the function's default sample count and seed.
show_samples(ds)


'>> Title: A bill to prohibit unauthorized third-party charges on wireline telephone bills, and for other purposes.'

'>> Text: SECTION 1. SHORT TITLE.

    This Act may be cited as the ``Fair Telephone Billing Act of 
2012''.

SEC. 2. FINDINGS.

    Congress makes the following findings:
            (1) For years, telephone users have complained that their 
        wireline telephone bills included unauthorized third-party 
        charges.
            (2) This problem, commonly referred to as ``cramming'', 
        first appeared in the 1990s, after wireline telephone companies 
        opened their billing platforms to an array of third-party 
        vendors offering a variety of services.
            (3) Since the 1990s, the Federal Communications Commission, 
        the Federal Trade Commission, and State attorneys general have 
        brought multiple enforcement actions against dozens of 
        individuals and companies for engaging in cramming.
            (4) An investig

Preprocess Dataset

In [9]:
from transformers import AutoTokenizer
# Checkpoint name; reused later to load the model and to name the output directory.
model_checkpoint = "t5-small"
# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [10]:
# Let's test out the T5 tokenizer (the checkpoint loaded above is "t5-small", not mT5) on a small example:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [27, 1858, 1183, 8, 26049, 5880, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Map the ids back to subword tokens; the output below ends with the `</s>` token.
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁loved', '▁reading', '▁the', '▁Hunger', '▁Games', '!', '</s>']

In [12]:
max_input_length = 512
max_target_length = 30
# T5 checkpoints are trained with task prefixes. Training with the same
# "summarize: " prefix keeps fine-tuning consistent with inference through the
# summarization pipeline, which is expected to prepend the prefix stored in the
# checkpoint config (NOTE(review): confirm against the t5-small config).
task_prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize bill texts as model inputs and bill titles as labels.

    Args:
        examples: Mapping with "text" and "title" entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single string per entry is also accepted.

    Returns:
        The tokenizer output for the prefixed texts, with an extra "labels"
        entry holding the ``input_ids`` of the tokenized titles.
    """
    texts = examples["text"]
    if isinstance(texts, str):
        # Un-batched call: prefix the single string instead of iterating its characters.
        prefixed_texts = task_prefix + texts
    else:
        prefixed_texts = [task_prefix + text for text in texts]

    # Inputs longer than max_input_length tokens are truncated.
    model_inputs = tokenizer(
        prefixed_texts,
        max_length=max_input_length,
        truncation=True
    )
    # Tokenize the targets (titles) with their own, much shorter, length limit.
    labels = tokenizer(
        examples["title"],
        max_length=max_target_length,
        truncation=True
    )
    # The trainer reads the target token ids from the "labels" key.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [13]:
# Apply preprocess_function to every split; batched=True hands it batches of examples.
tokenized_datasets = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

```
'text' was tokenized into 'input_ids' (with a matching 'attention_mask').
'title' was tokenized into 'labels'.

```

In [14]:
# Display the result: each split now also has input_ids, attention_mask and labels columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1237
    })
})

In [20]:
# Show the title of the first training example
tokenized_datasets['train']['title'][0]

'A bill to limit the civil liability of business entities providing use of facilities to nonprofit organizations.'

In [23]:
# The title above, as subword tokens (recovered from the stored label ids):
tokenizer.convert_ids_to_tokens(tokenized_datasets['train']['labels'][0])

['▁A',
 '▁bill',
 '▁to',
 '▁limit',
 '▁the',
 '▁civil',
 '▁liability',
 '▁of',
 '▁business',
 '▁entities',
 '▁providing',
 '▁use',
 '▁of',
 '▁facilities',
 '▁to',
 '▁nonprofit',
 '▁organizations',
 '.',
 '</s>']

In [24]:
# Those subword tokens are stored as integer ids in the "labels" column:
tokenized_datasets['train']['labels'][0]

[71,
 2876,
 12,
 2006,
 8,
 3095,
 6283,
 13,
 268,
 12311,
 1260,
 169,
 13,
 2465,
 12,
 11069,
 2371,
 5,
 1]

Metrics

In [None]:
!pip install rouge_score

In [None]:
import evaluate
# Load the ROUGE metric; compute_metrics (defined further down) calls rouge_score.compute().
rouge_score = evaluate.load("rouge")

Fine-tuning T5

In [33]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained sequence-to-sequence model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [34]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch: log once per (training examples // batch size) steps
# (assumes a single device and no gradient accumulation — TODO confirm)
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Keep only the last path component of the checkpoint name
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-billsum",
    # NOTE(review): newer transformers releases may name this argument `eval_strategy` — confirm
    # against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Generate text during evaluation so compute_metrics can decode the predictions
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
)



The next thing we need to do is provide the trainer with a compute_metrics() function so that we can evaluate our model during training. For summarization this is a bit more involved than simply calling rouge_score.compute() on the model’s predictions, since we need to decode the outputs and labels into text before we can compute the ROUGE scores.

In [35]:
import numpy as np

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            Entries equal to ``-100`` in ``labels`` are replaced by the
            tokenizer's pad id before decoding.

    Returns:
        dict mapping every key returned by ``rouge_score.compute`` to its
        score multiplied by 100 and rounded to 4 decimals.
    """
    # Local import keeps the cell self-contained.
    import re

    def one_sentence_per_line(text):
        # ROUGE expects a newline after each sentence. The notebook never
        # imports NLTK's sent_tokenize, so split with a simple rule instead:
        # break after '.', '!' or '?' when followed by whitespace.
        # (Abbreviations such as "U.S. Code" are split too; this can only
        # affect sentence-level ROUGE variants.)
        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
        return "\n".join(sentence for sentence in sentences if sentence)

    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [one_sentence_per_line(pred) for pred in decoded_preds]
    decoded_labels = [one_sentence_per_line(label) for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    percentages = {}
    for key, value in result.items():
        # The ROUGE metric loaded through `evaluate` is expected to return
        # plain floats; aggregate objects exposing `.mid.fmeasure` (older
        # metric API) are still accepted.
        score = value.mid.fmeasure if hasattr(value, "mid") else value
        percentages[key] = round(float(score) * 100, 4)
    return percentages

In [36]:
from transformers import DataCollatorForSeq2Seq
# Collator that assembles batches for the trainer; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Models can only work with numbers, so any raw text columns in the dataset should be removed before the data is fed to the model.

First, we need to remove the columns with strings because the collator won’t know how to pad these elements:

**`DataCollatorForSeq2Seq` will then batch our inputs.**

In [37]:
# Show the original dataset for comparison: it holds only the raw string columns
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [38]:
# Show the tokenized dataset: the raw string columns plus the tokenizer outputs
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1237
    })
})

In [39]:
# Drop the columns whose values are strings (the original dataset's columns) and show the final tokenized dataset.
# NOTE: this reassigns tokenized_datasets, so re-running the cell asks to remove columns that are
# already gone — re-run the tokenization cell first.
tokenized_datasets = tokenized_datasets.remove_columns(
    ds["train"].column_names
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1237
    })
})

See the output above. Before this step the dataset had these columns: `['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels']`.

After dropping the string columns, only the `['input_ids', 'attention_mask', 'labels']` columns remain.

In [41]:
from transformers import Seq2SeqTrainer

# Wire together the model, training arguments, datasets, collator and metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # the "test" split serves as the evaluation set
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # turns evaluation predictions into ROUGE scores
)

In [None]:
# Fine-tune the model with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss




In [None]:
# Evaluate on the eval_dataset given to the trainer (the "test" split).
trainer.evaluate()

Using your fine-tuned model

In [None]:
from transformers import pipeline

# The training run is expected to leave only intermediate `checkpoint-*`
# sub-folders inside the output directory, so write the final model (and the
# tokenizer given to the trainer) to its top level before loading from it.
trainer.save_model(args.output_dir)

# Reuse the configured output directory instead of a hard-coded absolute path
# that only exists on Colab.
model_path = args.output_dir
summarizer = pipeline("summarization", model=model_path)

In [None]:
def print_summary(idx):
    """Print the text, reference title and generated summary of one ca_test example.

    Args:
        idx: Index of the example inside ``ds["ca_test"]``.
    """
    example = ds["ca_test"][idx]
    text = example["text"]
    title = example["title"]
    # The summarization pipeline returns a list of dicts keyed by "summary_text".
    # truncation=True cuts over-long bills to the tokenizer's maximum length,
    # mirroring the truncation applied during preprocessing.
    summary = summarizer(text, truncation=True)[0]["summary_text"]
    print(f"'>>> Text: {text}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")

In [None]:
# Summarize the example at index 45 of the ca_test split.
print_summary(45)