## Text summarization using Hugging Face


Make sure the runtime has a GPU attached; training is much faster on a GPU than on a CPU.

In [None]:
# Print the NVIDIA driver's status table to confirm a GPU is attached to this runtime.
!nvidia-smi

Install dependencies

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate
!pip install --upgrade accelerate

Import required tools

In [None]:
from matplotlib import pyplot as plt
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric

import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download NLTK's "punkt" resource.
# NOTE(review): presumably for the `sent_tokenize` import above, which is not
# called anywhere in the visible notebook — confirm this download is still needed.
nltk.download("punkt")

Check whether a GPU is available to PyTorch

In [None]:
# Pick the compute device once: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    dev = 'cuda'
else:
    dev = 'cpu'
dev  # last expression of the cell, so the notebook displays the chosen device

# **Pre-Processing Data**


1.   Tokenization



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [None]:
# Checkpoint identifier, reused for both the tokenizer here and the model weights below.
# Note that `model` is only the name (a str); the loaded network is stored in `model_p`.
model = "google/pegasus-cnn_dailymail"      # checkpoint used for summarization
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Load the pretrained seq2seq model for the checkpoint named in `model`
# and move it to the device selected in `dev`.
model_p = AutoModelForSeq2SeqLM.from_pretrained(model).to(dev)

Load the dataset to be summarized

In [None]:
!pip install fsspec==2023.6.0

In [None]:
# Load configuration '3.0.0' of the 'abisee/cnn_dailymail' dataset.
# The result is indexed by split name below ('train', 'validation', 'test').
ds = load_dataset('abisee/cnn_dailymail', '3.0.0')

In [None]:
# Display the loaded dataset object's summary.
ds

**visualizing the data**

In [None]:
# Show the reference summary ('highlights') of training example 10.
# NOTE(review): column-first indexing may load the entire 'highlights' column
# before picking element 10 (depends on the installed `datasets` version);
# the row-first form ds['train'][10]["highlights"] returns the same value.
ds['train']['highlights'][10]

In [None]:
# Same lookup, row-first: fetch example 10, then read its 'highlights' field.
ds['train'][10]["highlights"]

**Convert text to token IDs (model input features)**

In [None]:
def convert_to_features(example_batch):
    """Tokenize a batch of articles and their reference summaries for seq2seq training.

    Args:
        example_batch: Mapping with an 'article' and a 'highlights' entry. When
            called through ``Dataset.map(..., batched=True)`` each entry holds
            one value per example in the batch.

    Returns:
        dict with 'input_ids' and 'attention_mask' for the articles (truncated
        to at most 1024 tokens) and 'labels' holding the token ids of the
        highlights (truncated to at most 128 tokens). No padding is applied here.
    """
    input_encodings = tokenizer(example_batch['article'], max_length=1024, truncation=True)

    # `text_target=` tokenizes the texts as targets/labels. It is the supported
    # replacement for the deprecated `tokenizer.as_target_tokenizer()` context
    # manager that this function used previously.
    target_encodings = tokenizer(text_target=example_batch['highlights'],
                                 max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }


Let's map the function over every split of the dataset

In [None]:
# Apply convert_to_features to the dataset; batched=True hands the function a
# batch of examples per call instead of one example at a time.
ds_cn = ds.map(convert_to_features, batched=True)

In [None]:
# Display the summary of the tokenized training split.
ds_cn['train']

In [None]:
#ds_cn['train']['input_ids'][1]

In [None]:
#ds_cn['train']['attention_mask'][1]

In [None]:
# Show the label token ids of training example 1.
# Row-first indexing fetches only that example and then reads its 'labels'
# field, instead of loading the whole 'labels' column to read a single entry.
ds_cn['train'][1]['labels']

**Training**

In [None]:
!pip install --upgrade transformers

In [None]:
from transformers import DataCollatorForSeq2Seq # collator used to assemble tokenized examples into batches

# Collator handed to the Trainer below. convert_to_features applies no padding,
# so batching is where examples of different lengths have to be aligned
# (see the DataCollatorForSeq2Seq documentation for the exact padding rules).
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_p)

In [None]:
from transformers import TrainingArguments, Trainer
import transformers

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='pg_tst',  # directory the Trainer writes to; the model is saved under the same name later
    num_train_epochs=1, # the one field intended to be changed; keep the others as they are
    warmup_steps=500,
    per_device_train_batch_size=1,  # one example per device per forward/backward pass
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',  # evaluate on a step schedule ...
    eval_steps=500,  # ... every 500 steps
    save_steps=1e6,  # very large interval — presumably to avoid intermediate checkpoints; confirm
    gradient_accumulation_steps=16  # gradients from 16 batches (of size 1 per device) are accumulated per update
)

In [None]:
# Wire the model, hyper-parameters, tokenizer, collator and datasets into a Trainer.
# NOTE(review): train_dataset is the 'test' split, not 'train' — presumably to
# keep the run short. The prediction cell later summarizes ds['test'][0], which
# is therefore an example the model was fine-tuned on; confirm this is intended.
trainer = Trainer(model=model_p, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=ds_cn['test'], eval_dataset=ds_cn['validation'])

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut ``list_of_elements`` into consecutive slices of ``batch_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``batch_size``; an empty input produces no slices.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in chunk_starts)

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=dev,
                               column_text="article",
                               column_summary="highlights"):
    """Summarize every article in ``dataset`` and score the summaries with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be equally long sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate(...)``.
        tokenizer: Tokenizer used to encode the articles and decode generated ids.
        batch_size: Number of articles summarized per ``generate`` call.
        device: Device the encoded inputs are moved to before generation.
        column_text: Key of the source documents.
        column_summary: Key of the reference summaries.

    Returns:
        The result of ``metric.compute()`` after all batches have been added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # Beam search with 8 beams, capped at 128 generated tokens.
        # length_penalty is the exponent applied to sequence length when beams
        # are scored; 0.8 rewards length less than the default of 1.0.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids back to text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # Pegasus marks line breaks with the literal "<n>" token; turn it into a
        # space. (Replacing the empty string instead would insert a space between
        # every pair of characters and wreck the scores.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Compute the score once, after every batch has been added. Returning from
    # inside the loop would score only the first batch.
    return metric.compute()


In [None]:
# ROUGE variants read from the score object in the next cell.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm the installed datasets version still provides it.
rouge_metric = load_metric('rouge')


In [None]:
# Hand the first 10 validation examples to the scorer, two articles per generate call.
score = calculate_metric_on_test_ds(
    ds_cn['validation'][0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='article',
    column_summary='highlights',
)

# Keep the mid F-measure of each ROUGE variant.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

# One-row table labelled with the run name.
pd.DataFrame(rouge_dict, index=['pg_tst'])

In [None]:
# Save the fine-tuned model to the "pg_tst" directory
# (the same name used as the Trainer's output_dir above).
model_p.save_pretrained("pg_tst")

In [None]:
# Save the tokenizer files to the relative "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

In [None]:
# Reload the tokenizer from the relative "./tokenizer" directory written by
# tokenizer.save_pretrained("tokenizer"). The previous absolute
# "/content/tokenizer" path only resolved when the working directory was /content.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

In [None]:
# Prediction: summarize one test article with the saved model and compare it
# against the reference summary.

# Decoding settings; the same beam-search values the evaluation helper uses.
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

sample_txt = ds['test'][0]['article']

reference = ds['test'][0]['highlights']

# Load the model saved in "pg_tst" and place the pipeline on the device chosen
# in `dev`, so generation uses the GPU when one is available.
pipe = pipeline("summarization", model="pg_tst", tokenizer=tokenizer, device=dev)

# The input is a news article, so label it as such.
print("Article:")
print(sample_txt)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
# truncation=True clips articles longer than the tokenizer's maximum input
# length, as the training and evaluation code also truncates its inputs.
print(pipe(sample_txt, truncation=True, **gen_kwargs)[0]["summary_text"])