# Installing the HuggingFace Libraries 

In [1]:
!pip install -q transformers[torch] datasets

# Loading the CNN_DAILYMAIL Dataset 

In [2]:
from datasets import load_dataset

cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0", split='validation')

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 257M/257M [00:00<00:00, 257MB/s]  
Downloading data: 100%|██████████| 257M/257M [00:00<00:00, 326MB/s]  
Downloading data: 100%|██████████| 259M/259M [00:01<00:00, 240MB/s]  
Downloading data: 100%|██████████| 34.7M/34.7M [00:00<00:00, 108MB/s] 
Downloading data: 100%|██████████| 30.0M/30.0M [00:00<00:00, 91.6MB/s]


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Looking at the number of rows and columns of the dataset

In [3]:
cnn_dailymail

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

Splitting the dataset into training and testing set

In [4]:
cnn_dailymail = cnn_dailymail.train_test_split(test_size=0.2)

In [5]:
cnn_dailymail

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 10694
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2674
    })
})

Checking if the dataset is loaded correctly

In [6]:
example = cnn_dailymail["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "article"
The value corresponding to the key-"article"
 "Hristo Stoichkov has lambasted Manchester United manager Louis van Gaal by insisting the Dutchman is 'garbage'. Stoichkov played under Van Gaal during his second spell at the Nou Camp before leaving the Spanish outfit for CSKA Sofia in 1998. Bulgarian legend Stoichkov was far from impressed with Van Gaal and has blamed the Manchester United boss for his premature departure from the Catalan giants. Former Barcelona forward Hristo Stoichkov, pictured in 1997, has blasted Man United boss Louis van Gaal . Stoichkov (left) played under Van Gaal (right) during his second spell at Barcelona in the Nineties . Stoichkov, speaking to Sport Sunday, said: 'I have no respect for him, he’s garbage. 'One day, when I was injured and I was with my wife at the Nou Camp. He went up to her and asked "how was possible that she married someone like me?" 'It was the fault of Van Gaal that I moved on to CSKA Sofia.' In contrast, St

# Preprocessing and Tokenization

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
tokenized_text = tokenizer(example['article'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

input_ids
[454, 17149, 8272, 362, 9789, 65, 17871, 9, 6265, 9145, 907, 2743, 5181, 4049, 2776, 138, 57, 10419, 53, 8, 10098, 348, 19, 3, 31, 1478, 7893, 15, 31, 5, 8272, 362, 9789, 1944, 365, 4480, 2776, 138, 383, 112, 511, 10783, 44, 8, 14455, 4594, 274, 3140, 8, 5093, 6468, 21, 3, 4778, 12048, 25860, 16, 6260, 5, 15536, 29, 9503, 8272, 362, 9789, 47, 623, 45, 8686, 28, 4480, 2776, 138, 11, 65, 9100, 26, 8, 9145, 907, 7930, 21, 112, 27130, 12028, 45, 8, 3431, 9, 1618, 6079, 7, 5, 18263, 11869, 1039, 454, 17149, 8272, 362, 9789, 6, 3, 22665, 16, 6622, 6, 65, 3, 115, 19054, 1140, 907, 7930, 5181, 4049, 2776, 138, 3, 5, 8272, 362, 9789, 41, 17068, 61, 1944, 365, 4480, 2776, 138, 41, 3535, 61, 383, 112, 511, 10783, 44, 11869, 16, 8, 19636, 3010, 3, 5, 8272, 362, 9789, 6, 4461, 12, 3349, 1771, 6, 243, 10, 3, 31, 196, 43, 150, 1445, 21, 376, 6, 3, 88, 22, 7, 12937, 5, 3, 31, 10723, 239, 6, 116, 27, 47, 7532, 11, 27, 47, 28, 82, 2512, 44, 8, 14455, 4594, 5, 216, 877, 95, 12, 160, 11, 1380, 9

In [9]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [10]:
tokenized_cnn_dailymail = cnn_dailymail.map(preprocess_function, batched=True)

Map:   0%|          | 0/10694 [00:00<?, ? examples/s]

Map:   0%|          | 0/2674 [00:00<?, ? examples/s]

In [11]:
tokenized_cnn_dailymail['test'][0]['article']

"Boxing fans worldwide have picked an emerald green, diamond-encrusted belt to be strapped around the waist of either Floyd Mayweather or Manny Pacquiao at the conclusion of their $300million fight of the century in Las Vegas on May 2. The WBC held a  public vote to choose between an onyx belt of unusual design or their more traditional green world champion belt, with the latter winning by a margin of just six per cent. The cost of the belt is likely to be upwards of $1m. Fans chose this emerald belt as the one which  will adorn the winner of the fight of the century on May 2 . Floyd Mayweather or Manny Pacquiao will end up wearing the $1million emerald belt at the end of their fight - this image does not show the version that includes pictures of the two fighters . Costing upwards of $1million, this WBC world champions belt features images of former WBC president Mauricio Sulaiman, Mayweather, Pacquiao and legendary heavyweight Muhammad Ali along the strap. The treasured memento comes

In [12]:
tokenized_cnn_dailymail['test'][0]['highlights']

'Boxing fans worldwide were asked to pick which of two commemorative belts should be worn by either Floyd Mayweather or Manny Pacquiao .\nThe traditional green world champions belt, encrusted with diamonds, won out ahead of the unusual onyx belt design in WBC vote .\nThe belt is expected to cost upwards of $1m .\nCLICK HERE for all the latest Floyd Mayweather vs Manny Pacquiao news .'

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

2024-04-29 15:37:56.324323: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 15:37:56.324417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 15:37:56.449544: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Finetuning the model using Rouge Evaluation Metric

In [14]:
! pip install -q evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}


In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Config, T5Model, T5ForConditionalGeneration

In [18]:
model_name = "T5-small"
# Define the configuration with your desired number of attention heads
config = T5Config.from_pretrained(model_name)
config.num_heads = 6  # Change the number of attention heads to 6
config.d_kv = config.d_model // config.num_heads

# Now, use this modified configuration when initializing your model for fine-tuning
# Initialize a new T5 model with the modified configuration
model = T5ForConditionalGeneration(config=config)

# Load the pretrained weights into the newly initialized model
model.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [19]:
print(config)

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 85,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 6,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix"

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_t5_small_cnn_dailymail_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dailymail["train"],
    eval_dataset=tokenized_cnn_dailymail["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,8.3042,7.809098,0.0743,0.0003,0.0633,0.0634,19.0
2,7.8785,7.589591,0.0758,0.0027,0.0641,0.0642,19.0
3,7.624,7.476039,0.0659,0.0016,0.0569,0.0569,19.0
4,7.5611,7.43697,0.0744,0.0034,0.0631,0.0631,19.0




TrainOutput(global_step=2676, training_loss=7.801563895871286, metrics={'train_runtime': 2723.8156, 'train_samples_per_second': 15.704, 'train_steps_per_second': 0.982, 'total_flos': 1.1559017575612416e+16, 'train_loss': 7.801563895871286, 'epoch': 4.0})

In [23]:
trainer.save_model("fine_tuned_t5_small_cnn_dailymail_model")

# Inference Using Finetuned Model

In [24]:
text = cnn_dailymail['test'][100]['article']
text = "summarize: " + text
text

"summarize: A mound of spaghetti sits in a bowl with a heaping of tomatoes. Exactly nine puckered capers and a measly two slivers of anchovy perch rather inelegantly atop the pile. On the side tiny serving of mustard, some slices of gherkin and a wodge of waxed cheese. To wash it down is a cup of pink-strained tea. As last meals go, it's not the most glamorous of repasts. This is what Doomsday Prepper Kellene Bishop, a resident in Utah, US, will eat on the day the world ends. Texan Wayne Martin's last meal is a bowl of Campbell’s chicken and spaghetti soup and a side serving of gourmet cat food. He washes it down with a glass of his homemade white wine which he will use as barter in lieu of money once the apocalypse begins . Wilma Bryant from Missouri will feast on thyme-roast chicken covered in gravy and served with beans. Ms Bryant lives with her daughter and the pair are both diabetic and dependent on insulin. Her meal is juxtaposed against syringes and medical paraphernalia . This 

In [25]:
from transformers import pipeline

summarizer = pipeline("summarization", model="fine_tuned_t5_small_cnn_dailymail_model")
pred = summarizer(text)
pred

Token indices sequence length is longer than the specified maximum sequence length for this model (2043 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': "A-year-old has been he was a he is a-old . He was 's's he had been . The he has been in he said he ' and he's and ."}]

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

Token indices sequence length is longer than the specified maximum sequence length for this model (2041 > 512). Running this sequence through the model will result in indexing errors


tensor([[21603,    10,    71,  ...,     5,    31,     1]])

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'The a-year-old was a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a'

In [29]:
pred[0]['summary_text']

"A-year-old has been he was a he is a-old . He was 's's he had been . The he has been in he said he ' and he's and ."

In [30]:
preds = [pred[0]['summary_text']]

In [31]:
labels = [cnn_dailymail['test'][100]['highlights']]

In [32]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.07142857142857142,
 'rouge2': 0.0,
 'rougeL': 0.02380952380952381,
 'rougeLsum': 0.04761904761904762}