In [1]:
from transformers import pipeline
import pandas as pd
from datasets import load_dataset, load_metric
import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import Dataset, DatasetDict,load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, PegasusTokenizerFast, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from transformers import BartTokenizer, BartModel
from transformers import BartForConditionalGeneration
import torch
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
def encode_input_data(batch, tokenizer=None, xlabel='article', ylabel='highlights'):
    """Tokenize one batch of (document, summary) pairs for seq2seq training.

    Args:
        batch: Mapping from column name to a list of strings (the shape that
            ``Dataset.map(..., batched=True)`` passes in).
        tokenizer: Tokenizer used for both sources and targets. When omitted,
            the notebook-level ``bart_base_tokenizer`` is used, which keeps
            existing ``dataset.map(encode_input_data, batched=True)`` calls
            working unchanged.
        xlabel: Name of the source-text column in ``batch``.
        ylabel: Name of the target-summary column in ``batch``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the sources
        (truncated to 1024 tokens) and ``labels`` holding the target token ids
        (truncated to 256 tokens).
    """
    if tokenizer is None:
        # Backward-compatible default: the original implementation always
        # used this global tokenizer.
        tokenizer = bart_base_tokenizer

    input_encodings = tokenizer(batch[xlabel], max_length=1024, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding labels.
    target_encodings = tokenizer(text_target=batch[ylabel], max_length=256, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

In [4]:
def get_rouge_scores(dataset, metric, model, tokenizer, xlable, ylable, batch_size, device):
    """Generate summaries for a dataset split in batches and score them.

    Args:
        dataset: Object supporting ``len()`` and column access by name
            (``dataset[column]`` returning a sliceable sequence of strings).
        metric: Object exposing ``add_batch(predictions=, references=)`` and
            ``compute()``.
        model: Object exposing ``generate(input_ids=, attention_mask=, ...)``.
            NOTE(review): the model is not moved here; it must already live on
            ``device`` or generation will fail with a device mismatch.
        tokenizer: Callable tokenizer that also provides ``decode``.
        xlable: Name of the source-text column.
        ylable: Name of the reference-summary column.
        batch_size: Number of examples per generation batch.
        device: Device the tokenized inputs are moved to before generation.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    # Fixed: the original called len(data), an undefined name.
    n = len(dataset)

    # Read each column once instead of re-reading it for every batch.
    source_texts = dataset[xlable]
    reference_texts = dataset[ylable]

    for start in range(0, n, batch_size):
        x_label_batch = list(source_texts[start : start + batch_size])
        y_label_batch = list(reference_texts[start : start + batch_size])

        inputs = tokenizer(x_label_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.6, num_beams=6, max_length=256)

        output_summaries = [
            tokenizer.decode(summary, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
            for summary in summaries
        ]

        # Fixed: replace("", " ") put a space between every character.
        # "<n>" is the newline marker Pegasus emits; it is a no-op for models
        # that never produce it.
        final_output_summaries = [text.replace("<n>", " ") for text in output_summaries]

        metric.add_batch(predictions=final_output_summaries, references=y_label_batch)

    return metric.compute()

In [13]:
def finetunellm(model,tokenizer,dataset, xlabel,ylabel,model_name,tokenizer_name,num_training,num_testing,num_validation,output_dir):
    """Fine-tune a seq2seq summarization model and save it with its tokenizer.

    Args:
        model: Seq2seq model handed to ``Trainer``.
        tokenizer: Tokenizer used to encode both sources and targets.
        dataset: Mapping with ``"train"``, ``"validation"`` and ``"test"`` splits.
        xlabel: Name of the source-text column.
        ylabel: Name of the target-summary column.
        model_name: Directory the fine-tuned model is saved to.
        tokenizer_name: Directory the tokenizer is saved to.
        num_training: Number of leading training rows to keep; -1 keeps all.
        num_testing: Number of leading test rows to keep; -1 keeps all.
        num_validation: Number of leading validation rows to keep; -1 keeps all.
        output_dir: ``TrainingArguments.output_dir`` for checkpoints and logs.

    Returns:
        The (optionally truncated) un-tokenized ``DatasetDict`` with the
        ``train``, ``validation`` and ``test`` splits.
    """
    def _take_first(split, limit):
        # Each split is limited independently. The original only checked
        # num_training, so a -1 on another split sliced [0:-1] and silently
        # dropped the last row.
        if limit is None or limit < 0:
            return split
        # select() keeps whatever columns the split has, so datasets without
        # an 'id' column no longer raise KeyError.
        return split.select(range(min(limit, len(split))))

    dataset = DatasetDict({
        "train": _take_first(dataset["train"], num_training),
        "validation": _take_first(dataset["validation"], num_validation),
        "test": _take_first(dataset["test"], num_testing),
    })
    print("1.Dataset creation completed")

    def _encode(batch):
        # Encode with the tokenizer and columns passed to this function. The
        # original mapped a helper hard-wired to the BART tokenizer and the
        # CNN/DailyMail column names.
        input_encodings = tokenizer(batch[xlabel], max_length=1024, truncation=True)
        target_encodings = tokenizer(text_target=batch[ylabel], max_length=256, truncation=True)
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids'],
        }

    encoded_dataset = dataset.map(_encode, batched = True)
    print("2.Dataset encoding completed")
    seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    training_args = TrainingArguments(
        output_dir=output_dir, num_train_epochs=30, warmup_steps=500,per_device_train_batch_size=16,
        per_device_eval_batch_size=16,weight_decay=0.01, logging_steps=10,evaluation_strategy='steps',
        eval_steps=500, save_steps=1000,gradient_accumulation_steps=16
    )
    # Release cached GPU memory left over from earlier cells before training.
    torch.cuda.empty_cache()
    print("3.Starting Training of the Data")
    trainer = Trainer(model=model, args=training_args,
              tokenizer=tokenizer, data_collator=seq2seq_data_collator,
              train_dataset=encoded_dataset["train"],
              eval_dataset=encoded_dataset["validation"])
    training_result = trainer.train()
    print("4.Training completed")
    print("Training_result = ",training_result)
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(tokenizer_name)
    return dataset

Model = BART Base (facebook/bart-base), Data Set = CNN/DailyMail (3.0.0)

In [14]:
# Experiment settings for the BART-base / CNN-DailyMail run.
xlable = 'article'
ylabel = 'highlights'
model_name = "bart-cnn-model"
tokenizer_name = "bart-cnn-tokenizer"
output_dir = "Final-bart-cnn"
num_training = 10000
num_testing = 1000
num_validation = 1000

# Base checkpoint, its tokenizer, and the raw dataset.
bart_base_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base').to(device)
bart_base_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
cnn_dataset = load_dataset("cnn_dailymail",'3.0.0')

# Fine-tune and keep the (possibly truncated) dataset for later evaluation cells.
dataset = finetunellm(
    model=bart_base_model,
    tokenizer=bart_base_tokenizer,
    dataset=cnn_dataset,
    xlabel=xlable,
    ylabel=ylabel,
    model_name=model_name,
    tokenizer_name=tokenizer_name,
    num_training=num_training,
    num_testing=num_testing,
    num_validation=num_validation,
    output_dir=output_dir,
)

1.Dataset creation completed


Map: 100%|██████████| 1/1 [00:00<00:00,  7.24 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00,  6.77 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00,  7.39 examples/s]
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


2.Dataset encoding completed
3.Starting Training of the Data


Step,Training Loss,Validation Loss


4.Training completed
Training_result =  TrainOutput(global_step=30, training_loss=0.18704514106114706, metrics={'train_runtime': 16.5965, 'train_samples_per_second': 1.808, 'train_steps_per_second': 1.808, 'total_flos': 10092805632000.0, 'train_loss': 0.18704514106114706, 'epoch': 30.0})


In [7]:
# Reload the fine-tuned checkpoint and tokenizer from the directories written
# during fine-tuning. from_pretrained loads on the CPU, so move the model to
# `device` explicitly; the evaluation cells send their inputs there.
pretrained_model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

input_text = dataset["train"][0][xlable]
# Fixed: the original read from `dataset_samsum_pt`, which is never defined.
output_summary = dataset["train"][0][ylabel]

# Round-trip through the tokenizer so the text handed to the pipeline is
# already truncated to the 1024-token limit.
encoded_input = tokenizer(input_text, truncation=True, max_length=1024)
decoded_summary = tokenizer.decode(encoded_input["input_ids"], skip_special_tokens=True)

gen_kwargs = {"length_penalty": 1, "num_beams":6, "max_length": 256}
# Reuse the model loaded above instead of loading the checkpoint a second time.
model_pipeline = pipeline("summarization", model=pretrained_model, tokenizer=tokenizer, device=device)

print("\nReference Summary:")
print(output_summary)
print("\nOutput Summary Predicted:")
print(model_pipeline(decoded_summary, **gen_kwargs))

In [8]:
rouge_metric = load_metric('rouge')
batch_size = 8
# get_rouge_scores moves the tokenized inputs to `device`, so the model must be
# there too; .to(device) is a no-op when it already is.
score = get_rouge_scores(dataset['test'], rouge_metric, pretrained_model.to(device), tokenizer, xlable, ylabel, batch_size, device)
# Display the result; the original computed it but never showed it.
score

Model = Pegasus (google/pegasus-cnn_dailymail), Data Set = GovReport (ccdv/govreport-summarization)

In [17]:
# Base checkpoint, its tokenizer, and the raw dataset for the Pegasus / GovReport run.
pegasus_cnn_dailymail_model = AutoModelForSeq2SeqLM.from_pretrained('google/pegasus-cnn_dailymail').to(device)
pegasus_cnn_dailymail_tokenizer = PegasusTokenizerFast.from_pretrained('google/pegasus-cnn_dailymail')
government_dataset = load_dataset('ccdv/govreport-summarization')
xlable = 'report'
ylabel = 'summary'
model_name = "pegasus-government-model"
# Fixed: this was "bart-cnn-tokenizer" (copy-paste), which overwrote the saved
# BART tokenizer directory with the Pegasus tokenizer.
tokenizer_name = "pegasus-government-tokenizer"
output_dir = "Final-pegasus-government"
# NOTE(review): one example per split looks like a smoke-test setting — raise
# these for a real run.
num_training = 1
num_testing = 1
num_validation = 1
dataset = finetunellm(pegasus_cnn_dailymail_model,pegasus_cnn_dailymail_tokenizer,government_dataset,xlable,ylabel,model_name,tokenizer_name,num_training,num_testing,num_validation,output_dir)

In [18]:
# Reload the fine-tuned checkpoint. from_pretrained loads on the CPU, so move
# the model to `device` to match the inputs the evaluation cell sends there.
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [19]:
rouge_metric = load_metric('rouge')
batch_size = 8
# get_rouge_scores moves the tokenized inputs to `device`, so the model must be
# there too; .to(device) is a no-op when it already is.
score = get_rouge_scores(dataset['test'], rouge_metric, pretrained_model.to(device), tokenizer, xlable, ylabel, batch_size, device)
# Display the result; the original computed it but never showed it.
score

Model = BART Large CNN (facebook/bart-large-cnn), Data Set = SAMSum

In [20]:
# Base checkpoint, its tokenizer, and the raw dataset for the BART-large-CNN / SAMSum run.
bart_largecnn_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn').to(device)
bart_largecnn_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
samsum_dataset = load_dataset('samsum')
# Fixed: SAMSum's source column is 'dialogue'; 'report' was copied from the
# GovReport cell and does not exist in this dataset.
xlable = 'dialogue'
ylabel = 'summary'
model_name = "bart-large-cnn-model"
tokenizer_name = "bart-large-cnn-tokenizer"
output_dir = "Final-bart-large-cnn-samsum"
# -1 means "use the whole split".
num_training = -1
num_testing = -1
num_validation = -1
dataset = finetunellm(bart_largecnn_model,bart_largecnn_tokenizer,samsum_dataset,xlable,ylabel,model_name,tokenizer_name,num_training,num_testing,num_validation,output_dir)

In [21]:
# Reload the fine-tuned checkpoint. from_pretrained loads on the CPU, so move
# the model to `device` to match the inputs the evaluation cell sends there.
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [22]:
rouge_metric = load_metric('rouge')
batch_size = 8
# get_rouge_scores moves the tokenized inputs to `device`, so the model must be
# there too; .to(device) is a no-op when it already is.
score = get_rouge_scores(dataset['test'], rouge_metric, pretrained_model.to(device), tokenizer, xlable, ylabel, batch_size, device)
# Display the result; the original computed it but never showed it.
score