# Installing the HuggingFace Libraries 

In [1]:
!pip install -q transformers[torch] datasets

# Loading the CNN_DAILYMAIL Dataset 

In [2]:
from datasets import load_dataset

cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0", split='validation')

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 257M/257M [00:03<00:00, 75.8MB/s] 
Downloading data: 100%|██████████| 257M/257M [00:03<00:00, 71.4MB/s] 
Downloading data: 100%|██████████| 259M/259M [00:03<00:00, 75.1MB/s] 
Downloading data: 100%|██████████| 34.7M/34.7M [00:00<00:00, 54.3MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:00<00:00, 49.2MB/s]


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Looking at the number of rows and columns of the dataset

In [3]:
cnn_dailymail

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 13368
})

Splitting the dataset into training and testing set

In [4]:
cnn_dailymail = cnn_dailymail.train_test_split(test_size=0.2)

In [5]:
cnn_dailymail

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 10694
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2674
    })
})

Checking if the dataset is loaded correctly

In [6]:
example = cnn_dailymail["train"][0]
for key in example:
    print("A key of the example: \"{}\"".format(key))
    print("The value corresponding to the key-\"{}\"\n \"{}\"".format(key, example[key]))

A key of the example: "article"
The value corresponding to the key-"article"
 "West Ham striker Carlton Cole has accepted a Football Association charge over a Twitter exchange with a Tottenham supporter. Cole had until 6pm on Thursday night to respond to the charge of making a comment which 'was abusive and/or insulting and/or improper and/or brings the game into disrepute'. A FA independent commission will now meet to decide his sanction, with no limit to the possible punishment. West Ham United striker Carlton Cole has admitted an FA charge for a tweet that the FA deemed abusive . Cole tweeted back to a Tottenham fan who had insulted him on Twitter, telling the  supporter: 'F off you c***' The charge related to Cole's involvement in a Twitter altercation with a Spurs fan following West Ham's 2-2 Barclays Premier League draw at White Hart Lane on February 22. The 31-year-old, who has 122,000 followers on the social networking site, was responding to a message from Spurs supporter Stua

# Preprocessing and Tokenization

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
tokenized_text = tokenizer(example['article'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

input_ids
[1244, 5845, 6585, 52, 3, 30339, 16311, 65, 4307, 3, 9, 10929, 2125, 1567, 147, 3, 9, 3046, 2509, 28, 3, 9, 31857, 380, 49, 5, 16311, 141, 552, 431, 2028, 30, 2721, 706, 12, 3531, 12, 8, 1567, 13, 492, 3, 9, 1670, 84, 3, 31, 9491, 27031, 11, 87, 127, 21548, 53, 11, 87, 127, 22187, 11, 87, 127, 3200, 8, 467, 139, 1028, 28285, 31, 5, 71, 8536, 2547, 5473, 56, 230, 942, 12, 2204, 112, 26419, 6, 28, 150, 2006, 12, 8, 487, 19372, 5, 1244, 5845, 907, 6585, 52, 3, 30339, 16311, 65, 10246, 46, 8536, 1567, 21, 3, 9, 10657, 24, 8, 8536, 3, 10863, 27031, 3, 5, 16311, 27975, 223, 12, 3, 9, 31857, 1819, 113, 141, 21548, 15, 26, 376, 30, 3046, 6, 5188, 8, 380, 49, 10, 3, 31, 371, 326, 25, 3, 75, 10647, 31, 37, 1567, 1341, 12, 16311, 31, 7, 9683, 16, 3, 9, 3046, 8310, 75, 257, 28, 3, 9, 17740, 7, 1819, 826, 1244, 5845, 31, 7, 3, 22451, 1386, 75, 20244, 6552, 3815, 3314, 44, 1945, 10498, 11834, 30, 2083, 1630, 5, 37, 2664, 18, 1201, 18, 1490, 6, 113, 65, 586, 8630, 10076, 30, 8, 569, 7607, 3

In [9]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [10]:
tokenized_cnn_dailymail = cnn_dailymail.map(preprocess_function, batched=True)

Map:   0%|          | 0/10694 [00:00<?, ? examples/s]

Map:   0%|          | 0/2674 [00:00<?, ? examples/s]

In [11]:
tokenized_cnn_dailymail['test'][0]['article']

"The two cross-dressing men who were shot outside the National Security Agency's headquarters on Monday had stolen the SUV they were driving from a hotel they had been partying at, it is claimed. Kevin Fleming, 20, and his friend were allegedly picked up in Baltimore, Maryland, by another man on Sunday night and driven to a hotel in Howard County, where they were said to have 'partied'. But the following morning, the unidentified driver woke up alone and discovered both the men and his Ford Escape SUV had gone, sources told ABC. He reported his vehicle stolen to county police. Shortly after, Fleming and his friend allegedly drove the SUV onto the grounds of the NSA and refused police commands to halt. They were subsequently shot by a guard outside the facility. Scroll down for videos . An NSA officer opened fire, killing one of the suspects and wounding a second during a shootout. A white sheet appears to cover a body outside the the SUV . The 44-year-old officer is pictured here being

In [12]:
tokenized_cnn_dailymail['test'][0]['highlights']

"National Security Agency police guard shot Kevin Fleming, 20, and his friend, who tried to ram through secure entrance on Fort Meade Monday .\nFleming was flown to hospital with serious injuries; other man killed .\nNow, it has been claimed \xa0pair stole SUV from hotel they had partied at .\nThey allegedly partied with the driver, but stole his vehicle as he slept .\nAerial footage of scene shows that the SUV crashed into a police cruiser .\nOfficer, 44, was also injured in incident\xa0at NSA's secretive headquarters .\nIt is unknown whether men's attire was to do with the alleged 'partying'\nUS officials say incident is 'local criminal matter' and not act of terrorism ."

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

2024-04-29 17:08:27.825585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 17:08:27.825687: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 17:08:27.943707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Finetuning the model using Rouge Evaluation Metric

In [14]:
! pip install -q evaluate rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}


In [17]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Config, T5Model, T5ForConditionalGeneration

In [18]:
model_name = "T5-small"
# Define the configuration with your desired number of attention heads
config = T5Config.from_pretrained(model_name)
config.num_heads = 4  # Change the number of attention heads to 4
config.d_kv = config.d_model // config.num_heads

# Now, use this modified configuration when initializing your model for fine-tuning
# Initialize a new T5 model with the modified configuration
model = T5ForConditionalGeneration(config=config)

# Load the pretrained weights into the newly initialized model
model.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [19]:
print(config)

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 128,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 4,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_t5_small_cnn_dailymail_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [21]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn_dailymail["train"],
    eval_dataset=tokenized_cnn_dailymail["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,8.2375,7.777788,0.06,0.0001,0.0527,0.0527,18.9993
2,7.8314,7.548822,0.0591,0.0001,0.0519,0.0519,19.0
3,7.5825,7.447306,0.0622,0.0007,0.0541,0.0541,19.0
4,7.5191,7.409779,0.0631,0.001,0.0548,0.0548,19.0




TrainOutput(global_step=2676, training_loss=7.753702890712584, metrics={'train_runtime': 2547.19, 'train_samples_per_second': 16.793, 'train_steps_per_second': 1.051, 'total_flos': 1.1577873618960384e+16, 'train_loss': 7.753702890712584, 'epoch': 4.0})

In [23]:
trainer.save_model("fine_tuned_t5_small_cnn_dailymail_model")

# Inference Using Finetuned Model

In [24]:
text = cnn_dailymail['test'][100]['article']
text = "summarize: " + text
text

"summarize: It's usually cats getting their backs scratched. But one paws-on feline from Bonita, California, decided to repay the favor with his own 'ultimate meow-ssage.' Banks the cat was filmed as he caressed a woman's back and shoulders. 'How does it feel?' a man asks as the therapy session takes place. 'Fantastic!' the woman exclaims with a grin on her face. To reach all the right spots, Banks hopped up on to a kitchen counter, with his customer stood in front. He then proceeded to paw-rub away. The animal was previously listed for adoption by the San Diego Department of Animal Services. However, he now appears to have gone to a good home. Banks' massage client explained that his relaxation techniques were a definite bonus. 'For sure, who doesn't like an in-house masseuse that works for catnip?' she said. Caught on camera: Banks the cat was filmed as he caressed a woman's back and shoulders . Clever kitty: To reach all the right spots, the feline hopped up on to a kitchen counter,

In [25]:
from transformers import pipeline

summarizer = pipeline("summarization", model="fine_tuned_t5_small_cnn_dailymail_model")
pred = summarizer(text)
pred

[{'summary_text': "The he was 's's a 't's in as . He was he had been he's and he '"}]

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

tensor([[21603,    10,    94,    31,     7,  1086, 10003,   652,    70,   223,
             7,  8629,    15,    26,     5,   299,    80,     3, 19589,     7,
            18,   106,  3110,   630,    45,  4523,   155,     9,     6,  1826,
             6,  1500,    12, 26344,     8,  4971,    28,   112,   293,     3,
            31,    83,  2998,   342,   140,  2381,    18,     7,     7,   545,
             5,    31,  1925,     7,     8,  1712,    47,     3, 25403,    38,
             3,    88,   124,     7,  3843,     3,     9,  2335,    31,     7,
           223,    11, 15424,     5,     3,    31,  7825,   405,    34,   473,
            58,    31,     3,     9,   388,   987,     7,    38,     8,  3918,
          2363,  1217,   286,     5,     3,    31,   371,   288, 10057,    55,
            31,     8,  2335,  1215, 15085,     7,    28,     3,     9,     3,
         18363,    30,   160,   522,     5,   304,  1535,    66,     8,   269,
          6883,     6,  1925,     7,     3, 29074,  

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("fine_tuned_t5_small_cnn_dailymail_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"The's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's's '"

In [29]:
pred[0]['summary_text']

"The he was 's's a 't's in as . He was he had been he's and he '"

In [30]:
preds = [pred[0]['summary_text']]

In [31]:
labels = [cnn_dailymail['test'][100]['highlights']]

In [32]:
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

{'rouge1': 0.16666666666666666,
 'rouge2': 0.0,
 'rougeL': 0.125,
 'rougeLsum': 0.16666666666666666}