In [1]:
!nvidia-smi

Fri Jul  7 15:10:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

!pip install --upgrade accelerate -q
!pip uninstall -y transformers accelerate -q
!pip install transformers accelerate -q

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from transformers import pipeline

from datasets import load_dataset, load_from_disk, load_metric

import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

from tqdm import tqdm
import torch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# torch.cuda.is_available() function to check if a GPU is available for use else CPU will be used

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

##.to(device)

The device variable contains the string "cuda" or "cpu", which determines whether the model will be placed on a GPU or CPU for execution.

By using .to(device), the model is moved to the specified device, ensuring that computations are performed on the appropriate hardware.

In [5]:
# Pre-trained model checkpoint for the PEGASUS model

model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [6]:
dataset_samsum = load_dataset('samsum')
dataset_samsum



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
# Lets take only 1000 records of the train dataset keeping computation into consideration

dataset_samsum['train'] = dataset_samsum['train'].shuffle(seed=42).select(range(1000))
dataset_samsum




DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [8]:
dataset_samsum['train'].data[:2]

pyarrow.Table
id: string
dialogue: string
summary: string
----
id: [["13818513","13728867"]]
dialogue: [["Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)","Olivia: Who are you voting for in this election? 
Oliver: Liberals as always.
Olivia: Me too!!
Oliver: Great"]]
summary: [["Amanda baked cookies and will bring Jerry some tomorrow.","Olivia and Olivier are voting for liberals in this election. "]]

In [9]:
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")

print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")

print(dataset_samsum["test"][1]["summary"])

Split lengths: [1000, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [10]:
# Lets see the train dataset

columns_names = dataset_samsum['train'].column_names

import pandas as pd
train_df = pd.DataFrame.from_records(dataset_samsum["train"].data).T
train_df.columns = columns_names
train_df.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [11]:
dialogue_length = train_df["dialogue"].astype(str).apply(len)
dialogue_length.max()

5492

In [12]:
summary_length = train_df["summary"].astype(str).apply(len)
summary_length.max()

300

## Attention mask:

The attention mask is a binary mask with the same length as the input sequence. It has a value of 1 for tokens that are part of the original text and 0 for padding tokens. By providing this mask to the model, we indicate which tokens are valid and should be attended to during processing.

When the model attends to the input sequence, it uses the attention mask to guide its attention mechanism. It learns to assign higher weights to the valid tokens, allowing it to concentrate on the relevant parts of the input while effectively ignoring the padding tokens. This helps in reducing unnecessary computation and improving the efficiency of the model.

By attending only to valid tokens, the model can focus on the meaningful information in the sequence, making its predictions more accurate and relevant to the task at hand.

In [13]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True, padding=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True, padding=True)

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }



batched=True in the map() method allows the function to process the input dataset in batches instead of individual examples. This is used while working with large datasets or when processing inputs in parallel, improving performance and resource utilization.

In [14]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)
dataset_samsum_pt



Map:   0%|          | 0/819 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [15]:
pd.DataFrame.from_records(dataset_samsum_pt["train"].data).T.head()

Unnamed: 0,0,1,2,3,4,5
0,13681220,Lucy: omg did you see JK this morning?\r\nSue:...,Sue doesn't watch JK any more as it's disgusting.,"(12174, 151, 25479, 838, 368, 119, 236, 36844,...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(12776, 591, 131, 144, 1183, 36844, 189, 154, ..."
1,13716809,Wendy: What's up?\r\nSimon: Nothing much. I'm ...,This weekend Wendy is very lazy because she wo...,"(17472, 151, 463, 131, 116, 164, 152, 6331, 15...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(182, 1339, 17472, 117, 221, 9474, 262, 265, 9..."
2,13730745,"Petra: Hi Zack, I see you called. Sorry I can'...","Zack called Petra, but she didn't answer becau...","(41111, 151, 4451, 33962, 108, 125, 236, 119, ...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(33962, 568, 41111, 108, 155, 265, 595, 131, 1..."
3,13827944,Amelia: Want to go shopping tomorrow? :)\r\nAn...,Amelia wants to go shopping on Sunday with Ann...,"(27704, 151, 6168, 112, 275, 1553, 3469, 152, ...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(27704, 1728, 112, 275, 1553, 124, 1342, 122, ..."
4,13828293,Niki: Guess what\r\nJeanna: Hmmm? ;p\r\nNiki: ...,"To everyone's surprise, Angel has a boyfriend.","(47539, 151, 20874, 180, 7026, 2558, 151, 4124...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(413, 688, 131, 116, 2989, 108, 9227, 148, 114..."


Column 1: Contains the original text from the "dialogue" field of the input example.

Column 2: Contains the original text from the "summary" field of the input example.

Column 3: Contains a sequence of input IDs for the input text, encoded by the tokenizer.

Column 4: Contains a sequence of attention mask values for the input text, indicating which tokens to pay attention to during processing.

Column 5: Contains a sequence of input IDs for the target or summary text, encoded by the tokenizer.

In [16]:
pd.DataFrame.from_records(dataset_samsum_pt["test"].data).T.head()

Unnamed: 0,0,1,2,3,4,5
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...,"(12636, 151, 10532, 108, 171, 119, 133, 17379,...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(12636, 397, 17379, 131, 116, 344, 155, 12195,..."
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...,"(6303, 151, 60662, 147, 7374, 151, 485, 131, 1...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(6303, 111, 7374, 127, 313, 112, 1183, 114, 12..."
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...,"(43880, 151, 35774, 108, 137, 119, 225, 213, 1...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(43880, 137, 131, 144, 1854, 162, 18002, 112, ..."
3,13729438,"Will: hey babe, what do you want for dinner to...",Emma will be home soon and she will let Will k...,"(2254, 151, 14381, 38381, 108, 180, 171, 119, ...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(9859, 138, 129, 238, 783, 111, 265, 138, 538,..."
4,13828600,"Ollie: Hi , are you in Warsaw\r\nJane: yes, ju...",Jane is in Warsaw. Ollie and Jane has a party....,"(45245, 151, 4451, 110, 108, 127, 119, 115, 23...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(7130, 117, 115, 23445, 107, 45245, 111, 7130,..."


In [17]:
pd.DataFrame.from_records(dataset_samsum_pt["validation"].data).T.head()

Unnamed: 0,0,1,2,3,4,5
0,13817023,"A: Hi Tom, are you busy tomorrow’s afternoon?\...",A will go to the animal shelter tomorrow to ge...,"(202, 151, 4451, 3227, 108, 127, 119, 2117, 34...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(202, 138, 275, 112, 109, 2517, 6256, 3469, 11..."
1,13716628,Emma: I’ve just fallen in love with this adven...,Emma and Rob love the advent calendar. Lauren ...,"(9859, 151, 125, 123, 261, 188, 6852, 115, 298...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(9859, 111, 7374, 298, 109, 15519, 3672, 107, ..."
2,13829420,Jackie: Madison is pregnant\r\nJackie: but she...,Madison is pregnant but she doesn't want to ta...,"(17811, 151, 8367, 117, 5725, 17811, 151, 155,...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(8367, 117, 5725, 155, 265, 591, 131, 144, 245..."
3,13819648,Marla: <file_photo>\r\nMarla: look what I foun...,Marla found a pair of boxers under her bed.,"(76333, 151, 110, 105, 12014, 940, 18580, 2314...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(76333, 374, 114, 2188, 113, 48239, 365, 215, ..."
4,13728448,Robert: Hey give me the address of this music ...,Robert wants Fred to send him the address of t...,"(3102, 151, 10532, 361, 213, 109, 845, 113, 13...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(3102, 1728, 9930, 112, 1053, 342, 109, 845, 1..."


**DataCollatorForSeq2Seq**

DataCollatorForSeq2Seq class helps in preparing the data for fine-tuning a pre-trained model for seq2seq tasks by handling batch creation, padding, and attention mask generation.

However, the actual tokenization and encoding of the data is performed by tokenizer object separately before using the DataCollatorForSeq2Seq class.

In [18]:
# Training

from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [19]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(output_dir='pegasus-samsum',
                                 num_train_epochs=1,
                                 warmup_steps=500,
                                 per_device_train_batch_size=1,
                                 per_device_eval_batch_size=1,
                                 weight_decay=0.01,
                                 logging_steps=10,
                                 evaluation_strategy='steps',
                                 eval_steps=500,
                                 save_steps=1e6,
                                 gradient_accumulation_steps=16
                                  )

In [20]:

trainer = Trainer(model = model_pegasus,
                  args = trainer_args,
                  tokenizer = tokenizer,
                  data_collator = seq2seq_data_collator,
                  train_dataset = dataset_samsum_pt["train"],
                  eval_dataset = dataset_samsum_pt["validation"])

In [21]:
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=62, training_loss=9.020810065730926, metrics={'train_runtime': 466.8333, 'train_samples_per_second': 2.142, 'train_steps_per_second': 0.133, 'total_flos': 1959418055884800.0, 'train_loss': 9.020810065730926, 'epoch': 0.99})

In [22]:
list_of_elements = [ "A", "B", "C", "D", "E", "F"]
batch_size = 2

for i in range(0, len(list_of_elements), batch_size):
  print(list_of_elements[i : i + batch_size])

['A', 'B']
['C', 'D']
['E', 'F']


In [23]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""

    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="dialogue",
                               column_summary="summary"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches),
                                            total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                              for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score


Each element represents a specific ROUGE metric that will be used for evaluation.

Rouge-1 (ROUGE-1): Measures the overlap of unigram (individual words) between the generated summary and the reference summary.

Rouge-2 (ROUGE-2): Measures the overlap of bigram (pairs of consecutive words) between the generated summary and the reference summary.

Rouge-L (ROUGE-L): Computes the longest common subsequence (LCS) between the generated summary and the reference summary, considering word-level matches.

Rouge-Lsum (ROUGE-Lsum): Similar to Rouge-L, but uses sentence-level matches to compute the LCS.

By passing the generated summary and the reference summary to the rouge_metric object, we can obtain evaluation scores for each of the specified ROUGE metrics.



generated_summary = "This is a generated summary."
reference_summary = "This is a reference summary."

results = rouge_metric.compute(predictions=[generated_summary],     
                               references=[reference_summary])

for rouge_name in rouge_names:
    print(f"{rouge_name}: {results[rouge_name]}")


In [24]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [25]:
score = calculate_metric_on_test_ds(dataset_samsum['test'][0:10],
                                    rouge_metric,
                                    trainer.model,
                                    tokenizer,
                                    batch_size=2,
                                    column_text='dialogue',
                                    column_summary='summary')


rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 5/5 [00:15<00:00,  3.06s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.019367,0.0,0.019121,0.019181


In [26]:
## Save model
model_pegasus.save_pretrained("pegasus-samsum-model")

In [27]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [28]:
#Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [29]:
input_text = """Attention mechanism is a technique used in Natural Language Processing (NLP) and other sequence-based tasks to enhance the performance of models by allowing them to focus on relevant parts of the input sequence while generating an output. It helps the model to pay attention to specific words or positions in the input sequence when making predictions or generating the output sequence
The basic idea behind attention mechanism is to create a weighted representation of the input sequence, where the weights indicate the importance or relevance of each word or position. These weights are learned during the training process, and they reflect the alignment between the words in the input sequence and the words being generated in the output sequence."""


In [30]:
# Tokenize the input text
input_tokens = tokenizer(input_text, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
input_tokens = input_tokens.to(device)

# Generate summary
summary_tokens = model_pegasus.generate(input_ids=input_tokens["input_ids"],
                                        attention_mask=input_tokens["attention_mask"],
                                        length_penalty=0.8,
                                        num_beams=8,
                                        max_length=128)

# Decode the generated summary
summary = tokenizer.decode(summary_tokens[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Print the generated summary
print("Generated Summary:")
print(summary)


Generated Summary:
The basic idea behind attention mechanism is to create a weighted representation of the input sequence, where the weights indicate the importance or relevance of each word or position.<n>These weights are learned during the training process, and they reflect the alignment between the words in the input sequence and the words being generated in the output sequence.


The basic idea behind attention mechanism is to create a weighted representation of the input sequence.<n>These weights are learned during the training process, and they reflect the alignment between the words in the input sequence and the words being generated in the output sequence.