In [1]:
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
import evaluate
import torch
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# nltk.download("punkt")

In [2]:
import torch
device="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
model_code="google/pegasus-xsum"
model_pegasus=AutoModelForSeq2SeqLM.from_pretrained(model_code).to(device)

tokenizer=AutoTokenizer.from_pretrained(model_code)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

dataset = load_dataset("nyamuda/samsum")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 819
    })
})


In [5]:
train=dataset["train"]
validation=dataset["validation"]
test=dataset["test"]

In [6]:

def tokenize_function(example):
    # Tokenize dialogue (input)
    inputs = tokenizer(
        example["dialogue"], 
        max_length=512,  
        truncation=True
    )
    # Tokenize summary (target)
    targets = tokenizer(
        example["summary"], 
        max_length=512,  
        truncation=True
    )
    inputs["labels"] = targets["input_ids"]
    return {
        "input_ids":inputs["input_ids"],
        "attention_mask":inputs["attention_mask"],
        "labels":targets["input_ids"]
    }


In [7]:
train=train.map(tokenize_function,batched=True)
test=test.map(tokenize_function,batched=True)
validation=validation.map(tokenize_function,batched=True)


In [8]:
len(train)

14732

In [9]:
from transformers import DataCollatorForSeq2Seq
data_collator=DataCollatorForSeq2Seq(tokenizer,model=model_pegasus)

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments


training_args = TrainingArguments(
    fp16=True,
    output_dir='pegasus-samsum',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=8, 
    weight_decay=0.01             # strength of weight decay
    # logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model_pegasus,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train.select(range(10)),         # training dataset
    eval_dataset=test,           # evaluation dataset
    data_collator=data_collator
)

In [11]:
torch.cuda.empty_cache()  # clears reserved memory
torch.cuda.reset_peak_memory_stats()  # optional: reset memory stats
trainer.train()

Step,Training Loss




TrainOutput(global_step=2, training_loss=3.750455856323242, metrics={'train_runtime': 26.9456, 'train_samples_per_second': 0.371, 'train_steps_per_second': 0.074, 'total_flos': 4345483591680.0, 'train_loss': 3.750455856323242, 'epoch': 1.0})

In [None]:
def  generate_batch_sized_chunks(list_of_elements,batch_size):
    """split the batches into smaller batches that we can process simultaneosly
    Yield successive batched sized chunks from list_of_elements"""
    for i in range(0,len(list_of_elements),batch_size):
        yield list_of_elements[i:i+batch_size]                ## returns a iterator generator


def calculate_metric(dataset,metric,model,tokenizer,batch_size=16,device=device,column_text="dialogue",column_summary="summary"):
    X_batches=list(generate_batch_sized_chunks(dataset[column_text],batch_size))
    y_batches=list(generate_batch_sized_chunks(dataset[column_summary],batch_size))

    for X_batch,y_batch in tqdm(zip(X_batches,y_batches),total=len(X_batches)):
        inputs=tokenizer(X_batch,max_length=512,truncation=True,padding=True,
            return_tensors="pt")

        summaries=model_pegasus.generate(input_ids=inputs["input_ids"].to(device),attention_mask=inputs["attention_mask"].to(device),
                                         length_penalty=0.8,num_beams=8,max_length=128)


        decode_summaries=[tokenizer.decode(s,skip_special_tokens=True,clean_up_tokenization_spaces=True) for s in summaries]

        decode_summaries=[d.replace(""," ") for d in decode_summaries]

        metric.add_batch(predictions=decode_summaries,references=y_batch)

    score=metric.compute()
    return score


In [21]:
rouge_names=["rouge1", "rouge2", "rougeL", "rougeLsum"]
from evaluate import load
rouge_metric = load("rouge")


In [24]:
score=calculate_metric(test[0:10],rouge_metric,model_pegasus,tokenizer,batch_size=2,
                       device=device,column_text="dialogue",column_summary="summary")

for r in rouge_names:
    print(r, score[r])

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:21<00:00,  4.20s/it]


rouge1 0.02424744121203537
rouge2 0.0
rougeL 0.02441862079497966
rougeLsum 0.025470220298262725


In [25]:
model_pegasus.save_pretrained("pegasus-samsum-model")
tokenizer.save_pretrained("Tokenizer")

('Tokenizer\\tokenizer_config.json',
 'Tokenizer\\special_tokens_map.json',
 'Tokenizer\\spiece.model',
 'Tokenizer\\added_tokens.json',
 'Tokenizer\\tokenizer.json')

In [None]:
# Inference

gen_kwargs={"length_penalty":0.8,"num_beams":8,"max_length":128}

sample_text=""

pipe=pipeline("summaraization",model="pegasus-samsum-model",tokenizer=tokenizer)

ans=pipe(sample_text,**gen_kwargs)[0]["summay_text"]