In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        input_text = f"Text: {text} Summary: {summary} END"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        labels = self.tokenizer(
            summary,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": labels["input_ids"].flatten()
        }


# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Read data from your txt file
with open("test.txt", "r", encoding="utf-8") as file:
    data = file.read().strip().split("END\n\n")

# Extract texts and summaries
train_texts = [sample.split("Text:")[1].split("Summary:")[0].strip() for sample in data]
train_summaries = [sample.split("Summary:")[1].strip() for sample in data]

# Create dataset
train_dataset = CustomDataset(train_texts, train_summaries, tokenizer, max_length=128)

# Prepare training arguments
training_args = TrainingArguments(
    output_dir="./models",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define custom training loop to calculate loss
def compute_loss(model, inputs):
    outputs = model(**inputs)
    return outputs.loss

# Instantiate Trainer with custom training loop
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_model")

# Save the tokenizer separately
tokenizer.save_pretrained("./fine_tuned_model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 3/3 [00:02<00:00,  1.49it/s]


{'train_runtime': 2.0172, 'train_samples_per_second': 2.974, 'train_steps_per_second': 1.487, 'train_loss': 2.7329867680867515, 'epoch': 3.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the saved model
model_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Given review text and summary
review_text = "Cricket is a team sport involving a bat and ball played between two teams of eleven players each. The objective is to score more runs (points) than the opposing team. A match is divided into innings during which one team bats, two batter at a time, and the other team bowls."
given_summary = "Cricket is a popular sport with a long history."

# Concatenate review text and given summary
input_text = f"{review_text} {given_summary}"

# Tokenize the concatenated text
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate summary
output = model.generate(input_ids, max_length=100, num_return_sequences=1, early_stopping=True)

# Decode and print the generated summary
generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)
print("Given Review Text:", review_text)
print("Given Summary:", given_summary)
print("Generated Summary:", generated_summary)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Given Review Text: Cricket is a team sport involving a bat and ball played between two teams of eleven players each. The objective is to score more runs (points) than the opposing team. A match is divided into innings during which one team bats, two batter at a time, and the other team bowls.
Given Summary: Cricket is a popular sport with a long history.
Generated Summary: Cricket is a team sport involving a bat and ball played between two teams of eleven players each. The objective is to score more runs (points) than the opposing team. A match is divided into innings during which one team bats, two batter at a time, and the other team bowls. Cricket is a popular sport with a long history. The first team to win the match was the Indian team, the second team was the English team, and the third team was the British team. The


In [13]:
# from rouge_score import rouge_scorer

from rouge import Rouge

# Initialize Rouge
rouge = Rouge()

# Compute ROUGE scores
scores = rouge.get_scores(generated_summary, input_text)

# Print ROUGE scores
print("ROUGE-1: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-1']['p'], scores[0]['rouge-1']['r'], scores[0]['rouge-1']['f']))
print("ROUGE-2: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-2']['p'], scores[0]['rouge-2']['r'], scores[0]['rouge-2']['f']))
print("ROUGE-L: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-l']['p'], scores[0]['rouge-l']['r'], scores[0]['rouge-l']['f']))


ROUGE-1: Precision: 0.8333333333333334, Recall: 1.0, F1-Score: 0.9090909041322315
ROUGE-2: Precision: 0.7125, Recall: 1.0, F1-Score: 0.8321167834620918
ROUGE-L: Precision: 0.8333333333333334, Recall: 1.0, F1-Score: 0.9090909041322315
