In [1]:
from transformers import pipeline

import matplotlib.pyplot as plt
from datasets import load_dataset,load_dataset,load_metric
import pandas as pd

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch


In [2]:
import nltk

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [4]:
# Checkpoint identifier shared by the tokenizer, the model and the pipeline below.
model_ckpt = "google/pegasus-cnn_dailymail"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [5]:
# Load the seq2seq weights for the checkpoint and place the model on the selected
# device. calculate_metric_on_test_ds moves its input tensors with `.to(device)`
# but never moves the model, so the model has to live on that same device already.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load every split of the "samsum" dataset.
dataset_samsum = load_dataset("samsum")

In [7]:
# Show the (rows, columns) shape of each split.
dataset_samsum.shape

{'train': (14732, 3), 'test': (819, 3), 'validation': (818, 3)}

In [8]:
# Show the column names available in each split.
dataset_samsum.column_names

{'train': ['id', 'dialogue', 'summary'],
 'test': ['id', 'dialogue', 'summary'],
 'validation': ['id', 'dialogue', 'summary']}

In [9]:
from pprint import pprint

# Keep the first test dialogue in `dialogue`; the summarization pipeline is run on it later.
dialogue = dataset_samsum['test'][0]['dialogue']

print('Dialogue:\n')
print(dialogue)
print()

# Reference summary that belongs to the same test example.
print('Summarization:\n')
print(dataset_samsum['test'][0]['summary'])

Dialogue:

Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summarization:

Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [10]:
# Summarization pipeline built from the same checkpoint name (the pipeline loads its own model).
pipe = pipeline('summarization', model=model_ckpt)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Summarize the sample dialogue with the pipeline's default generation settings.
pipe_out = pipe(dialogue)

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


In [12]:
# Turn each " .<n>" marker in the generated text into a period followed by a
# newline so every sentence is printed on its own line.
print(
    pipe_out[0]['summary_text'].replace(" .<n>", ".\n")
)

Amanda: Ask Larry Amanda: He called her last time we were at the park together.
Hannah: I'd rather you texted him.
Amanda: Just text him .


In [15]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds at most ``batch_size`` items; the final slice is shorter
    when the length is not a multiple of ``batch_size``.
    """
    total = len(list_of_elements)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield list_of_elements[start:stop]
        
        
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    """Summarize every row of ``dataset`` with ``model`` and score the result.

    The text and reference columns are cut into batches of ``batch_size``. Each
    batch is tokenized, summarized with beam search, decoded, and added to
    ``metric`` together with its reference summaries. The metric is computed
    once, after all batches have been accumulated.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable and are expected to
            have the same length (``zip`` stops at the shorter one).
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model providing ``generate``. It is not moved by this function,
            so it must already be on ``device``.
        tokenizer: Tokenizer used both to encode the inputs and to decode the
            generated ids.
        batch_size: Number of examples per generation batch.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` over the whole dataset.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # length_penalty is intended to keep the model from generating
        # sequences that are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids back to text.
        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
            for s in summaries
        ]

        # Replace the "<n>" marker the model emits between sentences with a
        # space. (Replacing the empty string instead would insert a space
        # between every character and wreck the score.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Compute the score only after every batch has been added; doing this inside
    # the loop would return after the first batch.
    score = metric.compute()
    return score

In [17]:
# Load the ROUGE metric used to compare generated and reference summaries.
rouge_metric = load_metric('rouge')

# Evaluate the model on the SAMSum test split, reading dialogues as inputs and
# the provided summaries as references.
score = calculate_metric_on_test_ds(
    dataset_samsum['test'],
    rouge_metric,
    model_pegasus,
    tokenizer,
    batch_size=8,
    column_text='dialogue',
    column_summary='summary',
)

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=f58463cd9b8ca0a00afa1575a6702e2fa203430a66e40d64d052d41ed9bf05e8
  Stored in directory: c:\users\vladt\appdata\local\pip\cache\wheels\1e\19\43\8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
