In [1]:
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm 
import torch 

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ml01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from transformers import AutoTokenizer, PegasusForConditionalGeneration
from transformers import PegasusConfig

config = PegasusConfig.from_pretrained("google/pegasus-large")
config.use_cache = False  # Important for activation checkpointing
config.checkpointing = True 

model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

ARTICLE_TO_SUMMARIZE = (
    "PG&E stated that it schedules the blackouts in repsonse to forcasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousands customers were "
    "scheduled to be affected by the shutoffs which were excepted to last through at least midday tommorow."
)
inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt")




Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [3]:
inputs

{'input_ids': tensor([[14887,   759,  1005,  3163,   120,   126,  8307,   109, 25690,   116,
           115, 12994,  5978,   326,   112,   118,  9684,   116,   118,   281,
          7213, 10754,  1514,  1047,   107,   139,  2560,   117,   112,  1329,
           109,   887,   113, 39471,   107, 16502,  6194,  1873,   527,   195,
          2798,   112,   129,  2790,   141,   109, 87338,   116,   162,   195,
          2854,   316,   112,   289,   224,   134,   583, 26568,   112,  1592,
           490, 11055,   107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:

# Genarate Summary
summary_ids = model.generate(inputs["input_ids"])
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


"California's largest electricity provider has cut power to tens of thousands of customers because of high winds."

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
model = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model)

model_pagasus = AutoModelForSeq2SeqLM.from_pretrained(model).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
dataset_samsum = load_from_disk('../data/samsum_dataset')
dataset_samsum 

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [8]:
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")

print(f"Features: {dataset_samsum['train'].column_names}")

print("\nDialogue:")
print(dataset_samsum["test"][1]['dialogue'])

print("\nSummary:")
print(dataset_samsum["test"][1]["summary"])

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


### Preparing Data For Training For Sequence To Sequence Model

In [11]:
"""
{
    'dialogue': "Hi! How are you?",
    'summary': "The speaker is asking how the other person is."
}


{
    'input_ids': [123, 456, 789, ...],  # Token IDs for the dialogue
    'attention_mask': [1, 1, 1, ...],  # Attention mask for the input
    'labels': [321, 654, 987, ...]  # Token IDs for the summary (target)
}
"""

def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_masks': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }


In [12]:
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched= True)

In [13]:
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_masks', 'labels'],
    num_rows: 14732
})

## Training

In [14]:
from  transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pagasus)

In [13]:
# !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate

In [15]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1,warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=50, eval_strategy='steps',
    eval_steps=1000, save_steps=10000, fp16=True,
    gradient_accumulation_steps=8, deepspeed = "deepspeed.json"

)

In [16]:
trainer = Trainer(model = model_pagasus, args= trainer_args, tokenizer= tokenizer, data_collator=seq2seq_data_collator
                  , train_dataset=dataset_samsum_pt["test"], eval_dataset=dataset_samsum_pt["validation"])

[2025-06-25 16:07:52,347] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  trainer = Trainer(model = model_pagasus, args= trainer_args, tokenizer= tokenizer, data_collator=seq2seq_data_collator
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-06-25 16:07:53,494] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [18]:
trainer.train()

RuntimeError: cannot load MPI library
/home/ml01/.virtualenvs/mlops_huggingface/lib/libmpi.so: cannot open shared object file: No such file or directory
/home/ml01/.virtualenvs/mlops_huggingface/lib/libmpi.so.12: cannot open shared object file: No such file or directory
/home/ml01/.virtualenvs/mlops_huggingface/lib/libmpi.so.40: cannot open shared object file: No such file or directory
libmpi.so: cannot open shared object file: No such file or directory
libmpi.so.12: cannot open shared object file: No such file or directory
libmpi.so.40: cannot open shared object file: No such file or directory