# Installing the HuggingFace Libraries 

In [1]:
!pip install -q transformers[torch] datasets

# Loading the CNN_DAILYMAIL Dataset 

In [2]:
from datasets import load_dataset

cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0", split='validation')

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 257M/257M [00:01<00:00, 221MB/s]  
Downloading data: 100%|██████████| 257M/257M [00:01<00:00, 239MB/s]  
Downloading data: 100%|██████████| 259M/259M [00:01<00:00, 239MB/s]  
Downloading data: 100%|██████████| 34.7M/34.7M [00:00<00:00, 148MB/s] 
Downloading data: 100%|██████████| 30.0M/30.0M [00:00<00:00, 138MB/s] 


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Looking at the number of rows and columns of the dataset

In [3]:
cnn_dailymail

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

Splitting the dataset into training and testing set

In [4]:
cnn_dailymail = cnn_dailymail.train_test_split(test_size=0.2)

In [5]:
cnn_dailymail

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 10694
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2674
    })
})

Checking if the dataset is loaded correctly

In [6]:
example = cnn_dailymail["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "article"
The value corresponding to the key-"article"
 "A former university professor who taught a class on the hit CBS show Survivor and entered the reality competition this year was was voted off the program in the fourth episode. Max Dawson, who has done extensive research on the show and spent two years readying himself to compete, was voted off on Wednesday because other contestants thought he was 'annoying'. Dawson taught a class called 'The Tribe Has Spoken: Surviving TV's New Reality' at Northwestern University in 2012 and 2013. Scroll down for video . Max Dawson, a former Northwestern University professor who taught a class called 'The Tribe Has Spoken: Surviving TV's New Reality' for two years was voted off the hit reality show Survivor after just four episodes . Contestants apparently thought Dawson was 'annoying'. He was seen over-strategizing and spewing knowledge of past seasons throughout his time on the show . But his constant over-strategizing, i

# Preprocessing and Tokenization

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
tokenized_text = tokenizer(example['article'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

Token indices sequence length is longer than the specified maximum sequence length for this model (704 > 512). Running this sequence through the model will result in indexing errors


input_ids
[71, 1798, 3819, 5812, 113, 4436, 3, 9, 853, 30, 8, 1560, 19856, 504, 3, 31400, 11, 5136, 8, 2669, 2259, 48, 215, 47, 47, 3, 11060, 326, 8, 478, 16, 8, 4509, 5640, 5, 5370, 31676, 6, 113, 65, 612, 3616, 585, 30, 8, 504, 11, 1869, 192, 203, 1065, 53, 2448, 12, 5978, 6, 47, 3, 11060, 326, 30, 2875, 250, 119, 4233, 2366, 816, 3, 88, 47, 3, 31, 24889, 8149, 31, 5, 31676, 4436, 3, 9, 853, 718, 3, 31, 634, 2702, 346, 4498, 8927, 2217, 10, 3705, 7003, 53, 1424, 31, 7, 368, 23963, 31, 44, 30198, 636, 16, 1673, 11, 6386, 25731, 323, 21, 671, 3, 5, 5370, 31676, 6, 3, 9, 1798, 30198, 636, 5812, 113, 4436, 3, 9, 853, 718, 3, 31, 634, 2702, 346, 4498, 8927, 2217, 10, 3705, 7003, 53, 1424, 31, 7, 368, 23963, 31, 21, 192, 203, 47, 3, 11060, 326, 8, 1560, 2669, 504, 3, 31400, 227, 131, 662, 13562, 3, 5, 21537, 2366, 8743, 816, 31676, 47, 3, 31, 24889, 8149, 31, 5, 216, 47, 894, 147, 18, 7, 17, 2206, 122, 2610, 11, 15142, 3108, 1103, 13, 657, 9385, 1019, 112, 97, 30, 8, 504, 3, 5, 299, 112, 3

In [9]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [10]:
tokenized_cnn_dailymail = cnn_dailymail.map(preprocess_function, batched=True)

Map:   0%|          | 0/10694 [00:00<?, ? examples/s]

Map:   0%|          | 0/2674 [00:00<?, ? examples/s]

In [11]:
tokenized_cnn_dailymail['test'][0]['article']

"Couple: Becky Tait, 20, stepped in front of a train on the same stretch of tracks where her boyfriend Phil James, 19, did the same two months earlier . A heartbroken girlfriend killed herself at the same spot where her boyfriend committed suicide weeks earlier after struggling to come to terms with his death, an inquest heard today. Becky Tait, 20, stepped in front of a speeding train on the same tracks where her boyfriend Phil James, 19, had stood two months earlier. The care assistant from Stoke-on-Trent had been distraught after his death and was struggling to cope with her loss, Cannock Coroner’s Court in Staffordshire was told. Miss Tait went to the tracks where she died last November - where her ‘soul mate’ Mr James had died in September. Witnesses said she looked on ‘purposefully’ at the passenger train, which braked but struck her - causing multiple injuries. She was pronounced dead at the scene. Miss Tait had already tried to kill herself a month after Mr James’s death by tak

In [12]:
tokenized_cnn_dailymail['test'][0]['highlights']

'Becky Tait, of Stoke, killed herself at same spot in Staffordshire .\nShe committed suicide two months after death of boyfriend Phil James .\nCare assistant struggled to cope with loss and was hit by  train .'

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

2024-04-29 20:29:26.571196: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 20:29:26.571296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 20:29:26.710896: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Finetuning the model using Rouge Evaluation Metric

In [14]:
! pip install -q evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}


In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Config, T5Model, T5ForConditionalGeneration

In [18]:
model_name = "T5-small"
# Define the configuration with your desired number of attention heads
config = T5Config.from_pretrained(model_name)
config.num_heads = 2  # Change the number of attention heads to 2
config.d_kv = config.d_model // config.num_heads

# Now, use this modified configuration when initializing your model for fine-tuning
# Initialize a new T5 model with the modified configuration
model = T5ForConditionalGeneration(config=config)

# Load the pretrained weights into the newly initialized model
model.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [19]:
print(config)

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 256,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 2,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_t5_small_cnn_dailymail_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dailymail["train"],
    eval_dataset=tokenized_cnn_dailymail["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,8.1894,7.732235,0.0761,0.0001,0.0686,0.0685,19.0
2,7.8064,7.527384,0.0674,0.0005,0.0583,0.0582,19.0
3,7.559,7.413552,0.0691,0.0006,0.0584,0.0584,19.0
4,7.4982,7.375935,0.0672,0.0005,0.0576,0.0576,19.0




TrainOutput(global_step=2676, training_loss=7.7254573896148635, metrics={'train_runtime': 2274.9035, 'train_samples_per_second': 18.803, 'train_steps_per_second': 1.176, 'total_flos': 1.1577810177490944e+16, 'train_loss': 7.7254573896148635, 'epoch': 4.0})

In [23]:
trainer.save_model("fine_tuned_t5_small_cnn_dailymail_model")

# Inference Using Finetuned Model

In [24]:
text = cnn_dailymail['test'][100]['article']
text = "summarize: " + text
text

"summarize: UK defence cuts risk leaving America to tackle the twin threat of Russia and ISIS on its own, it has been claimed. Retired British army chief General Richard Dannatt suggested there was ‘no-one else’ for the US to turn to in Europe. It comes days after a UK think tank predicted that up to 30,000 British service personnel could go leaving the armed forces with a combined strength of just 115,000\xa0by the end of the decade. Retired British army chief General Richard Dannatt (pictured) suggested there was ‘no-one else’ for the US to turn to in Europe . Now there are fears the United States could be left without a credible partner as it stands up to a number of global security threats. Lord Dannatt, who served as Chief of the General Staff between 2006 and 2009, told the Washington Post: ‘If the UK can’t do it, who else is the US going to turn to in Europe? There’s no one else.’ He added: ‘The concern is that we’re going to fall from being a significant player to a bit-part pl

In [25]:
from transformers import pipeline

summarizer = pipeline("summarization", model="fine_tuned_t5_small_cnn_dailymail_model")
pred = summarizer(text)
pred

Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': "The a 's a has been in the . It is he has been 't's in ' ' and '"}]

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


tensor([[21603,    10,  1270, 13613,  8620,  1020,  3140,  1371,    12,  8000,
             8,  7390,  5888,    13,  4623,    11,    27, 14408,    30,   165,
           293,     6,    34,    65,   118,  7760,     5,   419, 11809,    26,
          2390,  9102,  5752,  2146,  4117,  7680,   144,    17,  5259,   132,
            47,   458,    29,    32,    18,   782,  1307,    22,    21,     8,
           837,    12,   919,    12,    16,  1740,     5,    94,   639,   477,
           227,     3,     9,  1270,   317,  5040, 15439,    24,    95,    12,
             3, 17093,  2390,   313,  4231,   228,   281,  3140,     8,     3,
          8715,  3859,    28,     3,     9,  3334,  2793,    13,   131,   850,
          5898,    57,     8,   414,    13,     8,  5112,     5,   419, 11809,
            26,  2390,  9102,  5752,  2146,  4117,  7680,   144,    17,    41,
         22665,    61,  5259,   132,    47,   458,    29,    32,    18,   782,
          1307,    22,    21,     8,   837,    12,  

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"The a a a a a a a a a a a a a a a a a a a a a's's's's's's's's's's's's's's's's '"

In [29]:
pred[0]['summary_text']

"The a 's a has been in the . It is he has been 't's in ' ' and '"

In [30]:
preds = [pred[0]['summary_text']]

In [31]:
labels = [cnn_dailymail['test'][100]['highlights']]

In [32]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.14925373134328357,
 'rouge2': 0.030769230769230767,
 'rougeL': 0.08955223880597014,
 'rougeLsum': 0.14925373134328357}