In [12]:
# required libraries
!pip install transformers datasets --quiet
!pip install matplotlib rouge-score --quiet
!pip install sentencepiece --quiet
!pip install accelerate -U --quiet

In [13]:
from datasets import load_dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, DatasetDict, load_from_disk
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from rouge_score import rouge_scorer
import os
import gc
import torch
import json

In [None]:
# Mount Google Drive at /content/drive; the dataset cache, tokenized data and
# trained model used by the cells below all live under /content/drive/MyDrive/Pegasus
drive.mount('/content/drive')

In [None]:
# Check if CUDA is available and set the device accordingly
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(f'Using device: {device}')

In [None]:
# Load the starting checkpoint and its matching tokenizer by name
model_name = 'google/pegasus-cnn_dailymail'
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [16]:
# Function to tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence fine-tuning.

    Args:
        examples: batch mapping with an "article" column (source texts) and a
            "highlights" column (reference summaries).

    Returns:
        The tokenizer output for the articles (padded/truncated to 1024 tokens)
        with an added "labels" entry holding the token ids of the highlights
        (padded/truncated to 128 tokens).
    """
    model_inputs = tokenizer(examples["article"], max_length=1024, truncation=True, padding="max_length")
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager
    # and tokenizes the summaries in target mode.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True, padding="max_length")
    # NOTE(review): labels keep their pad token ids (they are not replaced with -100),
    # so padded positions presumably contribute to the training loss — confirm whether
    # that is intended. The ROUGE cell decodes this "labels" column, so masking here
    # would require updating that cell as well.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Reuse the tokenized splits cached on Drive when they exist; otherwise build them
# from the raw dataset (itself cached on Drive after the first download).
tokenized_dataset_path = '/content/drive/MyDrive/Pegasus/tokenized_datasets'  # Adjust the path as needed
if not os.path.exists(tokenized_dataset_path):
    dataset_path = '/content/drive/MyDrive/Pegasus/dataset'
    if not os.path.exists(dataset_path):
        # First run: download the raw dataset and keep a copy on Drive
        dataset = load_dataset("cnn_dailymail", "3.0.0")
        dataset.save_to_disk(dataset_path)
    else:
        dataset = load_from_disk(dataset_path)
    # Keep only the first N examples of each split
    for split_name, split_size in (('train', 10000), ('validation', 2000), ('test', 100)):
        dataset[split_name] = dataset[split_name].select(range(split_size))
    # Tokenize every split in batches and cache the result
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk(tokenized_dataset_path)
else:
    tokenized_datasets = DatasetDict.load_from_disk(tokenized_dataset_path)

In [None]:
# Fine-tuning hyperparameters, grouped by purpose
training_args = TrainingArguments(
    # output locations
    output_dir="model_results",
    logging_dir="logs",
    # schedule
    num_train_epochs=1,
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    # batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # optimizer
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
)

In [None]:
# Build the Trainer from the model, the hyperparameters and the train/validation splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run fine-tuning with the Trainer built above (settings come from `training_args`)
trainer.train()



In [None]:
# Drive directory the fine-tuned model and tokenizer are saved to and later reloaded from
# (the hyperparameters are written separately to training_args.json)
model_path = '/content/drive/MyDrive/Pegasus/model'

In [None]:
# Save the fine-tuned model to `model_path`
model.save_pretrained(model_path)

In [None]:
# Save the tokenizer into the same directory as the model so both can be reloaded from `model_path`
tokenizer.save_pretrained(model_path)

In [None]:
# Persist the hyperparameters as JSON so a later session can rebuild them
training_args_dict = training_args.to_dict()
with open('/content/drive/MyDrive/Pegasus/training_args.json', 'w') as args_file:
    args_file.write(json.dumps(training_args_dict, indent=4))

In [17]:
# Rebuild TrainingArguments from the JSON file written after training
with open('/content/drive/MyDrive/Pegasus/training_args.json', 'r') as args_file:
    loaded_args_dict = json.loads(args_file.read())
loaded_training_args = TrainingArguments(**loaded_args_dict)

In [18]:
# Reload the fine-tuned model and its tokenizer from the directory they were saved to
fine_tuned_model = PegasusForConditionalGeneration.from_pretrained(model_path)
tokenizer = PegasusTokenizer.from_pretrained(model_path)

In [19]:
# Tokenized test split, used below to generate summaries and compute ROUGE scores
test_dataset = tokenized_datasets['test']

In [None]:
# generate predicted summaries for each article in the test dataset
def generate_predictions(batch):
    """Generate a summary for every tokenized article in `batch`.

    Args:
        batch: batch mapping with 'input_ids' and 'attention_mask' lists, as
            produced by the tokenizer call in `tokenize_function`.

    Returns:
        The same batch with an added 'predicted_summary' list of decoded strings.
    """
    # Build the tensors on whichever device the fine-tuned model lives on.
    device = fine_tuned_model.device
    input_ids = torch.tensor(batch['input_ids']).to(device)
    # Articles are padded to a fixed length, so pass the mask explicitly instead of
    # leaving `generate` to infer which positions are padding.
    attention_mask = torch.tensor(batch['attention_mask']).to(device)
    outputs = fine_tuned_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=125,
        num_beams=5,
        early_stopping=True,
    )
    batch['predicted_summary'] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

# Generate in small chunks: `map` defaults to batch_size=1000, which would push the
# whole test split through beam search in a single `generate` call.
result = test_dataset.map(generate_predictions, batched=True, batch_size=8)

In [None]:
def compute_rouge_scores(outputs, references):
    """Compute bootstrap-aggregated ROUGE scores for predicted summaries.

    Uses the `rouge_score` package directly (already imported by this notebook)
    instead of the deprecated `datasets.load_metric('rouge')`.

    Args:
        outputs: list of predicted summary strings.
        references: list of reference summary strings, aligned with `outputs`.

    Returns:
        dict mapping 'rouge1', 'rouge2', 'rougeL' and 'rougeLsum' to an aggregate
        score exposing `.low`, `.mid` and `.high`, each of which carries
        `precision`, `recall` and `fmeasure`.

    Raises:
        ValueError: if `outputs` and `references` differ in length.
    """
    # Local import: the notebook's import cell only brings in `rouge_scorer`.
    from rouge_score import scoring

    if len(outputs) != len(references):
        raise ValueError(
            f"Got {len(outputs)} predictions but {len(references)} references"
        )

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()
    for reference, prediction in zip(references, outputs):
        # RougeScorer.score takes the reference (target) first, then the prediction
        aggregator.add_scores(scorer.score(reference, prediction))
    return aggregator.aggregate()

# Extract the summaries and the references
decoded_preds = list(result['predicted_summary'])
decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in test_dataset['labels']]

# Compute the scores
rouge_scores = compute_rouge_scores(decoded_preds, decoded_labels)

# Report the mid (central) bootstrap estimate of each ROUGE variant as a percentage
average_scores = {
    key: {
        'precision': score.mid.precision * 100,
        'recall': score.mid.recall * 100,
        'fmeasure': score.mid.fmeasure * 100,
    }
    for key, score in rouge_scores.items()
}
print(average_scores)

In [None]:
# Manual test: summarize a piece of text typed in by the user
text_to_summarize = input("")

# Tokenize the raw text the same way the training articles were tokenized (no task
# prefix, truncated to 1024 tokens) and move it to the fine-tuned model's device.
inputs = tokenizer.encode(text_to_summarize, return_tensors="pt", truncation=True, max_length=1024)
inputs = inputs.to(fine_tuned_model.device)

# Generate with the reloaded fine-tuned model, the same one evaluated with ROUGE
summary_ids = fine_tuned_model.generate(inputs, max_length=250, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f'summary:{summary}')