## STEPS TO FOLLOW:

1. Import the requirements
2. Change to GPU
3. Get the data
4. Apply / Map the tokenizer
5. Set the training arguments
6. Train the model
7. Evaluation using required metrics
8. Saving the model to use on test data
9. Prediction

In [None]:
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
!pip install --upgrade accelerate # Helps to use GPU / CUDA
!pip uninstall -y transformers accelerate
!pip install transformers accelerate
# In order to use the updated version

In [None]:
# Hugging Face helpers: inference pipeline, seeding, dataset and metric loading
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, load_metric
# matplotlib's plotting interface is `matplotlib.pyplot`; `matplotlib.plotly` does not exist
import matplotlib.pyplot as plt
import pandas as pd

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch # Importing pytorch

# Fetch NLTK's "punkt" sentence-tokenizer data (needed by sent_tokenize)
nltk.download("punkt")

In [None]:
# Select the compute device: the GPU when PyTorch can see CUDA, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
# Hub checkpoint used for both the tokenizer and the model
model_ckpt = "google/pegasus-cnn_dailymail"
# Download the tokenizer that belongs to the checkpoint.
# The class is `AutoTokenizer` (the name imported from transformers); `AutomTokenizer` is undefined.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Download the pretrained seq2seq model, then move it onto the selected device
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

In [None]:
# Load the SAMSum dataset; the rest of the notebook reads it as `dataset_samsum`
# and uses its "train", "validation" and "test" splits.
dataset_samsum = load_dataset("samsum")

In [None]:
dataset_samsum

In [None]:
dataset_samsum["train"]["dialogue"][1]

In [None]:
dataset_samsum["train"][1]["summary"]

In [None]:
# Report the number of examples in each split, the column names,
# and one dialogue/summary pair from the test split
split_lengths = [len(split_data) for split_data in dataset_samsum.values()]
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue: \n", dataset_samsum["test"][1]["dialogue"], sep="\n")
print("\nSummary: \n", dataset_samsum["test"][1]["summary"], sep="\n")

In [None]:
# Preprocess data: turn raw dialogue/summary text into token ids
def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        example_batch: Mapping with a "dialogue" entry (source texts) and a
            "summary" entry (target texts), as handed over by
            `Dataset.map(..., batched=True)`.

    Returns:
        dict with "input_ids" and "attention_mask" taken from the tokenized
        dialogues (truncated to 1024 tokens) and "labels" holding the token
        ids of the summaries (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024, truncation=True)

    # `text_target=` tokenizes the texts as targets; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch["summary"], max_length=128, truncation=True)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

In [None]:
# Tokenize the dataset; batched=True passes convert_examples_to_features a batch of examples per call
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

In [None]:
dataset_samsum_pt["train"]

In [None]:
dataset_samsum_pt["train"]["input_ids"][1]

In [None]:
dataset_samsum_pt["train"]["attention_mask"][1]

In [None]:
dataset_samsum_pt["train"]["labels"][1]

## TRAINING

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles individual tokenized examples into batches for the Trainer.
# Per the transformers documentation it dynamically pads the inputs and the labels of each batch,
# which is why no padding is applied at tokenization time.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for a single fine-tuning epoch on limited GPU memory
trainer_args = TrainingArguments(
    # Checkpoints/logs go to ./pegasus-samsum; one epoch with 500 warm-up steps
    output_dir="pegasus-samsum", num_train_epochs=1, warmup_steps=500,
    # One example per device per step to keep memory usage low
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    # Log every 10 steps
    weight_decay=0.01, logging_steps=10,
    # Evaluate every 500 steps; save_steps=1e6 is presumably chosen so that no
    # intermediate checkpoint is written during this short run.
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="steps", eval_steps=500, save_steps=1e6,
    # Accumulate gradients over 16 steps before each optimizer update
    gradient_accumulation_steps=16)

In [None]:
# Bundle the model, training arguments, tokenizer, collator and the tokenized
# train/validation splits into a Trainer
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

# If time or memory is too limited for the full training split, a smaller split (e.g. the test split)
# can be passed as train_dataset instead. Note that a split used for training can no longer
# serve as an unbiased evaluation set.

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell
trainer.train()

## EVALUATION
The loss (error) reported during training should be as close to 0 as possible; lower is better.

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of `list_of_elements`.

    Each slice holds at most `batch_size` items; the last one is shorter when
    the total length is not a multiple of `batch_size`.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in chunk_starts)

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                                batch_size = 16, device = device,
                                column_text = "article",
                                column_summary = "highlights"):
    """Generate summaries for `dataset` and score them with `metric`.

    Args:
        dataset: Object indexable by column name; `dataset[column_text]` and
            `dataset[column_summary]` must be equally long sequences of strings.
        metric: Metric object exposing `add_batch(predictions=, references=)`
            and `compute()` (ROUGE in this notebook).
        model: Seq2seq model exposing `generate`, already placed on `device`.
        tokenizer: Tokenizer matching `model`.
        batch_size: Number of texts summarized per `generate` call.
        device: Device the input tensors are moved to.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever `metric.compute()` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # After training the model can still be in training mode (dropout active),
    # which makes generation noisy. Switch to eval mode for scoring and restore
    # the previous mode afterwards so the caller's model state is unchanged.
    was_training = model.training
    model.eval()
    try:
        for article_batch, target_batch in tqdm(zip(article_batches, target_batches),
                                                total=len(article_batches)):
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # Beam search with 8 beams. `length_penalty` is an exponent applied to
            # the sequence length when beams are scored: values > 0 favour longer
            # outputs, values < 0 favour shorter ones.
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)

            # Decode the generated ids back to text.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                                 for s in summaries]
            # The PEGASUS checkpoint marks line breaks with a literal "<n>";
            # replace it with a space so it is not scored as a word.
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        if was_training:
            model.train()

    # Finally compute and return the scores (ROUGE for summarization).
    return metric.compute()

In [None]:
# ROUGE variants reported in the results table
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the separate
# `evaluate` package — confirm the installed `datasets` release still provides it.
rouge_metric = load_metric("rouge")

In [None]:
# Score the fine-tuned model on the first 10 test dialogues only, to keep runtime and
# memory low; drop the [0:10] slice to evaluate on the whole test split.
score = calculate_metric_on_test_ds(
    dataset_samsum["test"][0:10], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary",
)
# Keep the F-measure of the `mid` aggregate for every ROUGE variant and show it as a one-row table
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["pegasus"])

ROUGE scores range from 0 to 1; the closer a score is to 1, the better the generated summary matches the reference.

## SAVING THE MODEL

In [None]:
# Write the fine-tuned model to the local "pegasus-samsum-model" directory so it can be reloaded later
model_pegasus.save_pretrained("pegasus-samsum-model")

In [None]:
# Write the tokenizer files to the local "tokenizer" directory
tokenizer.save_pretrained("tokenizer")

## PREDICTION

In [None]:
# Load your tokenizer from the directory written by `tokenizer.save_pretrained("tokenizer")`.
# The relative path matches the save call and also works when the working directory
# is not Colab's /content.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
sample_text = dataset_samsum["test"][0]["dialogue"]

In [None]:
reference = dataset_samsum["test"][0]["summary"]

In [None]:
# Decoding settings: beam search with 8 beams, length penalty 0.8, outputs capped at 128 tokens
gen_kwargs = dict(length_penalty=0.8, num_beams=8, max_length=128)

# First test conversation together with its reference summary
sample_text, reference = (
    dataset_samsum["test"][0][field] for field in ("dialogue", "summary")
)

# Summarization pipeline built from the saved fine-tuned model directory (passed by name)
pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

print("Dialogue: ", sample_text, sep="\n")
print("\nReference Summary: ", reference, sep="\n")

print("\nModel Summary: ")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])