# Installing the HuggingFace Libraries 

In [1]:
!pip install -q transformers[torch] datasets

# Loading the CNN_DAILYMAIL Dataset 

In [2]:
from datasets import load_dataset

cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0", split='validation')

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 257M/257M [00:01<00:00, 194MB/s]  
Downloading data: 100%|██████████| 257M/257M [00:01<00:00, 240MB/s]  
Downloading data: 100%|██████████| 259M/259M [00:01<00:00, 202MB/s]  
Downloading data: 100%|██████████| 34.7M/34.7M [00:00<00:00, 84.9MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:00<00:00, 61.8MB/s]


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Looking at the number of rows and columns of the dataset

In [3]:
cnn_dailymail

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

Splitting the dataset into training and testing set

In [4]:
cnn_dailymail = cnn_dailymail.train_test_split(test_size=0.2)

In [5]:
cnn_dailymail

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 10694
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2674
    })
})

Checking if the dataset is loaded correctly

In [6]:
example = cnn_dailymail["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "article"
The value corresponding to the key-"article"
 "Asking a young child to talk about Nelson Mandela and the concept of human rights, equality or the right to individual self-realisation is quite a leap from asking them about their favourite food. But that's exactly what Amnesty International has done, in a heart-warming video called Human Rights Heroes Of The Future, which poses such questions to children ranging in age from five to nine. One girl is asked: What did Nelson Mandela do? She says: 'He stood up for what he believed - that white people shouldn't just have all the fun and black people should have fun as well.' Another is asked: Who was Nelson Mandela? She replies: 'Nelson Mandela was a man who thought that black people should just have the amount of right just as the white people should. 'But then he was put into jail because of what he said - and then he finally went out because people were protesting for him to come out. And then there was a so

# Preprocessing and Tokenization

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
tokenized_text = tokenizer(example['article'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

Token indices sequence length is longer than the specified maximum sequence length for this model (1194 > 512). Running this sequence through the model will result in indexing errors


input_ids
[8366, 53, 3, 9, 1021, 861, 12, 1350, 81, 14448, 26353, 9, 11, 8, 2077, 13, 936, 2166, 6, 18963, 42, 8, 269, 12, 928, 1044, 18, 6644, 2121, 19, 882, 3, 9, 14527, 45, 3558, 135, 81, 70, 3960, 542, 5, 299, 24, 31, 7, 1776, 125, 736, 29, 222, 63, 1331, 65, 612, 6, 16, 3, 9, 842, 18, 14522, 53, 671, 718, 3892, 10226, 26284, 1129, 37, 9130, 6, 84, 15968, 224, 746, 12, 502, 3, 6836, 16, 1246, 45, 874, 12, 4169, 5, 555, 3202, 19, 1380, 10, 363, 410, 14448, 26353, 9, 103, 58, 451, 845, 10, 3, 31, 3845, 8190, 95, 21, 125, 3, 88, 6141, 3, 18, 24, 872, 151, 6994, 31, 17, 131, 43, 66, 8, 694, 11, 1001, 151, 225, 43, 694, 38, 168, 5, 31, 2351, 19, 1380, 10, 2645, 47, 14448, 26353, 9, 58, 451, 26719, 10, 3, 31, 567, 3573, 106, 26353, 9, 47, 3, 9, 388, 113, 816, 24, 1001, 151, 225, 131, 43, 8, 866, 13, 269, 131, 38, 8, 872, 151, 225, 5, 3, 31, 11836, 258, 3, 88, 47, 474, 139, 11796, 250, 13, 125, 3, 88, 243, 3, 18, 11, 258, 3, 88, 2031, 877, 91, 250, 151, 130, 4973, 53, 21, 376, 12, 369, 91

In [9]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [10]:
tokenized_cnn_dailymail = cnn_dailymail.map(preprocess_function, batched=True)

Map:   0%|          | 0/10694 [00:00<?, ? examples/s]

Map:   0%|          | 0/2674 [00:00<?, ? examples/s]

In [11]:
tokenized_cnn_dailymail['test'][0]['article']

'(CNN)Kanye West is known for rankling people on stage.  Now it\'s his turn to get a virtual smackdown. One music fan in the UK wants him out of the Glastonbury Festival so bad, he\'s started a petition to make it happen. In his petition, Neil Lonsdale describes the rapper as an "egotistical, maniacal, disgrace." He calls on organizers to replace the  "Yeezus" singer  at the Glastonbury Festival this summer. West is booked to headline the festival, which runs between June 24-28. His performance is Saturday. "Kanye West is an insult to music fans all over the world,"  Lonsdale says in the change.org petition.  "We spend hundreds of pounds to attend glasto, and by doing so, expect a certain level of entertainment." The petition urges West to  "pass his headline slot on to someone deserving." Lonsdale is not the only music fan upset by his scheduled performance.  By early Friday, the petition had nearly 80,000 signatures. West  and the festival\'s organizers have not responded to the peti

In [12]:
tokenized_cnn_dailymail['test'][0]['highlights']

"One music fan in the UK wants him out of the Glastonbury Festival .\nHe's started a petition calling for organizers to replace him ."

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

2024-04-23 19:36:56.697021: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 19:36:56.697120: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 19:36:56.822122: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Finetuning the model using Rouge Evaluation Metric

In [14]:
! pip install -q evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}


In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Config

In [18]:
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
print(config)

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix"

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_t5_small_cnn_dailymail_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dailymail["train"],
    eval_dataset=tokenized_cnn_dailymail["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.9842,1.734299,0.241,0.1162,0.1995,0.1995,19.0
2,1.8973,1.717317,0.2406,0.1161,0.1993,0.1992,18.9989
3,1.8665,1.710837,0.2416,0.1168,0.2004,0.2002,18.9981
4,1.8561,1.709529,0.241,0.1164,0.1998,0.1996,18.9989




TrainOutput(global_step=2676, training_loss=1.891330257124965, metrics={'train_runtime': 3470.9407, 'train_samples_per_second': 12.324, 'train_steps_per_second': 0.771, 'total_flos': 1.1578357888843776e+16, 'train_loss': 1.891330257124965, 'epoch': 4.0})

In [23]:
trainer.save_model("fine_tuned_t5_small_cnn_dailymail_model")

# Inference Using Finetuned Model

In [24]:
text = cnn_dailymail['test'][100]['article']
text = "summarize: " + text
text

"summarize: Michel Platini has raised the spectre of the Heysel disaster and warned of a growing issue of hooligans and political fanatics taking control in football stadiums in Europe. The UEFA president, who was on the pitch for Juventus at the European Cup final in Brussels 30 years ago when 39 people - mainly from the Italian club - died after rioting Liverpool fans charged Juve supporters causing a wall to collapse. Platini has called for tougher stadium bans and a European-wide sports police force. Speaking at the UEFA Congress in Vienna, Platini said: 'Europe is seeing a rise in nationalism and extremism the like of which we have not witnessed for a very long time. Michel Platini wants to see a European sports police force introduced to prevent football hooliganism . 'This insidious trend can also be observed in our stadiums, as football is a reflection of society. Given its popularity, our sport is a barometer for the ills of our continent. And that barometer is pointing to som

In [25]:
from transformers import pipeline

summarizer = pipeline("summarization", model="fine_tuned_t5_small_cnn_dailymail_model")
pred = summarizer(text)
pred

Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'Michel Platini was on the pitch for Juventus at the European Cup final in Brussels . 39 people were killed when Liverpool fans breached the neutral area . He has called for tougher stadium bans and a European-wide sports police force . The Italian is being re-elected unopposed for a third term .'}]

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

Token indices sequence length is longer than the specified maximum sequence length for this model (906 > 512). Running this sequence through the model will result in indexing errors


tensor([[21603,    10,  9411,   276, 14098,    23,    65,  3279,     8,     3,
          5628,    60,    13,     8,  9459,     7,    15,    40,  6912,    11,
         15240,    13,     3,     9,  1710,   962,    13,  3534,  4172,  2565,
             7,    11,  1827,  1819,  6049,     7,   838,   610,    16,  3370,
         14939,     7,    16,  1740,     5,    37,     3,  5078,  4795,  2753,
             6,   113,    47,    30,     8,  6242,    21, 31568,    44,     8,
          1611,  3802,   804,    16, 20501,   604,   203,   977,   116,  6352,
           151,     3,    18,     3,  4894,    45,     8,  4338,  1886,     3,
            18,  3977,   227,     3, 12884,    53, 15131,  2675,  4977,  3736,
           162, 11172,     3,  5885,     3,     9,  1481,    12, 11612,     5,
           276, 14098,    23,    65,   718,    21,  3429,    49, 14939,  4514,
             7,    11,     3,     9,  1611,    18,  6728,  2100,  2095,  2054,
             5, 14734,    44,     8,     3,  5078,  

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Michel Platini has called for tougher stadium bans and a European sports police force. He has warned of a growing issue of hooliganism in football stadiums. Platini was on the pitch for Juventus at the European Cup final in Brussels.'

In [29]:
pred[0]['summary_text']

'Michel Platini was on the pitch for Juventus at the European Cup final in Brussels . 39 people were killed when Liverpool fans breached the neutral area . He has called for tougher stadium bans and a European-wide sports police force . The Italian is being re-elected unopposed for a third term .'

In [30]:
preds = [pred[0]['summary_text']]

In [31]:
labels = [cnn_dailymail['test'][100]['highlights']]

In [32]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.35294117647058826,
 'rouge2': 0.10000000000000002,
 'rougeL': 0.19607843137254902,
 'rougeLsum': 0.2549019607843137}