In [1]:
!nvidia-smi

Sun Jun  2 05:46:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
!pip install "transformers[sentencepiece]" datasets sacrebleu rouge_score py7zr -q

In [2]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: accelerate 0.30.1
Uninstalling accelerate-0.30.1:
  Successfully uninstalled accelerate-0.30.1
Collecting transformers
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-0.30.1 transformers-4.41.2


In [3]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, TrainingArguments, Trainer, pipeline
from datasets import load_dataset, concatenate_datasets, load_metric
import pandas as pd
from tqdm import tqdm
import nltk


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Download NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/surya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
#For Apple Silicon Users
# Set device for MPS (Apple Silicon) or CPU

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
#For Windows Users, isnce I am using Colab, I will be utilising this
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [7]:
# Function to load PEGASUS model and tokenizer
def load_pegasus_model(device):
    model_ckpt = "google/pegasus-cnn_dailymail"
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt, model_max_length = 1024)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
    return model, tokenizer

In [8]:
# Function to load BART model and tokenizer
def load_bart_model(device):
    model_ckpt = "facebook/bart-large-cnn"
    tokenizer = BartTokenizer.from_pretrained(model_ckpt, model_max_length = 1024)
    model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)
    return model, tokenizer

In [9]:
# Define dataset names, configurations, and trust_remote_code flags
datasets_info = [
    {"name": "cnn_dailymail", "config": "3.0.0", "trust_remote_code": False},
    {"name": "xsum", "config": None, "trust_remote_code": True},
    {"name": "multi_news", "config": None, "trust_remote_code": True},
    {"name": "gigaword", "config": None, "trust_remote_code": True},
    {"name": "big_patent", "config": "a", "trust_remote_code": True},
    {"name": "samsum", "config": None, "trust_remote_code": True},
]

In [10]:
# Load datasets
def load_datasets(dataset_info):
    datasets = []
    for dataset in dataset_info:
        name = dataset['name']
        config = dataset.get('config')
        trust_remote_code = dataset.get('trust_remote_code', False)
        ds = load_dataset(name, config, trust_remote_code=trust_remote_code) if config else load_dataset(name, trust_remote_code=trust_remote_code)
        datasets.append(ds)
    return datasets

In [11]:
# Standardize column names
def standardize_column_names(datasets, column_mappings):
    for i, dataset in enumerate(datasets):
        for split in dataset.keys():
            datasets[i][split] = dataset[split].rename_columns(column_mappings[i])
    return datasets

In [12]:
# Concatenate splits
def concatenate_splits(datasets):
    train_datasets = [dataset['train'].select(range(5000)) for dataset in datasets if 'train' in dataset]
    val_datasets = [dataset['validation'].select(range(1000)) for dataset in datasets if 'validation' in dataset]
    test_datasets = [dataset['test'].select(range(1000)) for dataset in datasets if 'test' in dataset]
    return concatenate_datasets(train_datasets), concatenate_datasets(val_datasets), concatenate_datasets(test_datasets)

In [13]:
# Define function to convert examples to features
def convert_examples_to_features(example_batch, tokenizer):
    input_encodings = tokenizer(example_batch['text'], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [14]:
# Preprocess data
def preprocess_data(dataset, tokenizer):
    return dataset.map(lambda x: convert_examples_to_features(x, tokenizer), batched=True)

In [15]:
# Load and prepare datasets
datasets = load_datasets(datasets_info)
column_mappings = [
    {'article': 'text', 'highlights': 'summary'},
    {'document': 'text', 'summary': 'summary'},
    {'document': 'text', 'summary': 'summary'},
    {'document': 'text', 'summary': 'summary'},
    {'description': 'text', 'abstract': 'summary'},
    {'dialogue': 'text', 'summary': 'summary'}
]
datasets = standardize_column_names(datasets, column_mappings)
train_dataset, val_dataset, test_dataset = concatenate_splits(datasets)

Downloading builder script: 100%|██████████| 3.83k/3.83k [00:00<00:00, 30.9MB/s]
Downloading readme: 100%|██████████| 10.6k/10.6k [00:00<00:00, 10.8MB/s]
Downloading data: 100%|██████████| 548M/548M [00:08<00:00, 61.5MB/s] 
Downloading data: 100%|██████████| 58.8M/58.8M [00:00<00:00, 66.1MB/s]
Downloading data: 100%|██████████| 66.9M/66.9M [00:01<00:00, 63.9MB/s]
Downloading data: 100%|██████████| 7.30M/7.30M [00:00<00:00, 45.4MB/s]
Downloading data: 100%|██████████| 69.0M/69.0M [00:01<00:00, 60.0MB/s]
Downloading data: 100%|██████████| 7.31M/7.31M [00:00<00:00, 50.5MB/s]
Generating train split: 100%|██████████| 44972/44972 [00:04<00:00, 9416.23 examples/s] 
Generating validation split: 100%|██████████| 5622/5622 [00:00<00:00, 11675.29 examples/s]
Generating test split: 100%|██████████| 5622/5622 [00:00<00:00, 12115.97 examples/s]
Downloading builder script: 100%|██████████| 4.43k/4.43k [00:00<00:00, 22.9MB/s]
Downloading readme: 100%|██████████| 8.03k/8.03k [00:00<00:00, 21.2MB/s]
Dow

OSError: [Errno 28] No space left on device

In [17]:
# Function to train and evaluate both models
def train_and_evaluate_models():
    # Train and evaluate PEGASUS
    pegasus_model, pegasus_tokenizer = load_pegasus_model(device)
    train_dataset_pegasus = preprocess_data(train_dataset, pegasus_tokenizer)
    val_dataset_pegasus = preprocess_data(val_dataset, pegasus_tokenizer)
    test_dataset_pegasus = preprocess_data(test_dataset, pegasus_tokenizer)

    seq2seq_data_collator = DataCollatorForSeq2Seq(pegasus_tokenizer, model=pegasus_model)
    trainer_args = TrainingArguments(
        output_dir='pegasus-summarizer', num_train_epochs=2, warmup_steps=100,
        per_device_train_batch_size=2, per_device_eval_batch_size=2,
        weight_decay=0.01, logging_steps=10,
        evaluation_strategy='steps', eval_steps=100, save_steps=500,
        gradient_accumulation_steps=8
    )
    trainer_pegasus = Trainer(model=pegasus_model, args=trainer_args,
                              tokenizer=pegasus_tokenizer, data_collator=seq2seq_data_collator,
                              train_dataset=train_dataset_pegasus,
                              eval_dataset=val_dataset_pegasus)
    trainer_pegasus.train()

    rouge_metric = load_metric('rouge')
    score_pegasus = calculate_metric_on_test_ds(test_dataset_pegasus, rouge_metric, pegasus_model, pegasus_tokenizer, batch_size=2)
    print(pd.DataFrame(dict((rn, score_pegasus[rn].mid.fmeasure) for rn in ["rouge1", "rouge2", "rougeL", "rougeLsum"]), index=['PEGASUS']))

    pegasus_model.save_pretrained("pegasus-summarizer-model")
    pegasus_tokenizer.save_pretrained("pegasus-tokenizer")

    # Train and evaluate BART
    bart_model, bart_tokenizer = load_bart_model(device)
    train_dataset_bart = preprocess_data(train_dataset, bart_tokenizer)
    val_dataset_bart = preprocess_data(val_dataset, bart_tokenizer)
    test_dataset_bart = preprocess_data(test_dataset, bart_tokenizer)

    seq2seq_data_collator = DataCollatorForSeq2Seq(bart_tokenizer, model=bart_model)
    trainer_args = TrainingArguments(
        output_dir='bart-summarizer', num_train_epochs=1, warmup_steps=100,
        per_device_train_batch_size=1, per_device_eval_batch_size=1,
        weight_decay=0.01, logging_steps=10,
        evaluation_strategy='steps', eval_steps=100, save_steps=500,
        gradient_accumulation_steps=8
    )
    trainer_bart = Trainer(model=bart_model, args=trainer_args,
                           tokenizer=bart_tokenizer, data_collator=seq2seq_data_collator,
                           train_dataset=train_dataset_bart,
                           eval_dataset=val_dataset_bart)
    trainer_bart.train()

    score_bart = calculate_metric_on_test_ds(test_dataset_bart, rouge_metric, bart_model, bart_tokenizer, batch_size=2)
    print(pd.DataFrame(dict((rn, score_bart[rn].mid.fmeasure) for rn in ["rouge1", "rouge2", "rougeL", "rougeLsum"]), index=['BART']))

    bart_model.save_pretrained("bart-summarizer-model")
    bart_tokenizer.save_pretrained("bart-tokenizer")

    return (pegasus_model, pegasus_tokenizer, score_pegasus), (bart_model, bart_tokenizer, score_bart)

In [18]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="text", column_summary="summary"):
    def generate_batch_sized_chunks(list_of_elements, batch_size):
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i: i + batch_size]

    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=0.8, num_beams=8, max_length=128)
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
    score = metric.compute()
    return score

In [19]:
# Train and evaluate both models
(pegasus_model, pegasus_tokenizer, pegasus_score), (bart_model, bart_tokenizer, bart_score) = train_and_evaluate_models()

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/600 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
100,2.357,2.339068
200,2.2838,2.158914
300,2.5269,2.104371
400,2.2837,2.073403
500,1.9405,2.057004
600,1.9355,2.048563
700,2.1981,2.043863


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 300/300 [09:34<00:00,  1.92s/it]
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


           rouge1    rouge2    rougeL  rougeLsum
PEGASUS  0.021147  0.000407  0.020681   0.020616


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]



Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
100,1.9198,2.186247
200,1.8173,2.17653
300,2.199,2.021796
400,1.8958,2.02677
500,1.651,1.987621
600,1.634,1.952994
700,1.6109,1.916393


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
100%|██████████| 300/300 [07:39<00:00,  1.53s/it]
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


        rouge1  rouge2    rougeL  rougeLsum
BART  0.017554  0.0003  0.017393   0.017321


In [20]:
# Prediction example using both models
def predict_with_models(text, pegasus_model, bart_model, pegasus_tokenizer, bart_tokenizer, gen_kwargs):
    pegasus_summary = pegasus_model.generate(pegasus_tokenizer(text, return_tensors="pt", truncation=True).input_ids.to(device), **gen_kwargs)
    bart_summary = bart_model.generate(bart_tokenizer(text, return_tensors="pt", truncation=True).input_ids.to(device), **gen_kwargs)

    pegasus_summary_text = pegasus_tokenizer.decode(pegasus_summary[0], skip_special_tokens=True)
    bart_summary_text = bart_tokenizer.decode(bart_summary[0], skip_special_tokens=True)

    return pegasus_summary_text, bart_summary_text

In [21]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
sample_text = test_dataset[0]["text"]
reference = test_dataset[0]["summary"]

pegasus_summary, bart_summary = predict_with_models(sample_text, pegasus_model, bart_model, pegasus_tokenizer, bart_tokenizer, gen_kwargs)

print("Text:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nPEGASUS Summary:")
print(pegasus_summary)
print("\nBART Summary:")
print(bart_summary)

Text:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremo