In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Read the dataset from the CSV file
df = pd.read_csv("/Users/sarvajeethuk/Downloads/IR/Assignment_4/small_data.csv")
df = df.dropna(subset=["Cleaned_Summary"])

# Extract the "reviews" and "summaries" columns
input_text = df["Cleaned_Text"].tolist()
input_summary = df["Cleaned_Summary"].tolist()

# Split dataset into training and testing sets
train_texts, test_texts, train_summaries, test_summaries = train_test_split(input_text, input_summary, test_size=0.25, random_state=42)

# Filter out empty summaries
test_summaries = [summary for summary in test_summaries if summary.strip()]

# Tokenize input summaries for testing set
test_summary_tokenized = tokenizer(test_summaries, return_tensors="pt", padding=True, truncation=True)

# Tokenize input texts and summaries for both training and testing sets
train_text_tokenized = tokenizer(train_texts, return_tensors="pt", padding=True, truncation=True)
train_summary_tokenized = tokenizer(train_summaries, return_tensors="pt", padding=True, truncation=True)

test_text_tokenized = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True)
# test_summary_tokenized = tokenizer(test_summaries, return_tensors="pt", padding=True, truncation=True)

# Prepare datasets for training and testing
class SummarizationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SummarizationDataset(train_text_tokenized)
test_dataset = SummarizationDataset(test_text_tokenized)

# Define custom training and validation loops
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    model.train()
    total_train_loss = 0
    for batch in train_dataset:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation loop
    model.eval()
    total_eval_loss = 0
    for batch in test_dataset:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

        total_eval_loss += loss.item()

    average_train_loss = total_train_loss / len(train_dataset)
    average_eval_loss = total_eval_loss / len(test_dataset)
    print(f"Epoch {epoch+1}, Average Training Loss: {average_train_loss}, Average Evaluation Loss: {average_eval_loss}")

# Save the fine-tuned model
model.save_pretrained("/Users/sarvajeethuk/Downloads/IR/Assignment_4/Model")
tokenizer.save_pretrained("/Users/sarvajeethuk/Downloads/IR/Assignment_4/Model")

  from .autonotebook import tqdm as notebook_tqdm
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Average Training Loss: 1.1218497688138245, Average Evaluation Loss: 1.089384514093399


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load fine-tuned GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("/Users/sarvajeethuk/Downloads/IR/Assignment_4/Model")
model = GPT2LMHeadModel.from_pretrained("/Users/sarvajeethuk/Downloads/IR/Assignment_4/Model")
# Given review text
review_text = "bought several vitality canned dog food product found good quality product look like stew processed meat smell better labrador finicky appreciates product better"

# Given summary
given_summary = "good quality dog food."

# Tokenize review text
input_ids = tokenizer.encode(review_text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
generated_summary_ids = model.generate(input_ids=input_ids.to(model.device), max_length=50, num_beams=4, early_stopping=True)

# Decode generated summary
generated_summary = tokenizer.decode(generated_summary_ids[0], skip_special_tokens=True)

print("Given Review Text:", review_text)
print("Given Summary:", given_summary)
print("Generated Summary:", generated_summary)

In [None]:
from rouge import Rouge


# Initialize Rouge
rouge = Rouge()

# Compute ROUGE scores
scores = rouge.get_scores(generated_summary, given_summary)

# Print ROUGE scores
print("ROUGE-1: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-1']['p'], scores[0]['rouge-1']['r'], scores[0]['rouge-1']['f']))
print("ROUGE-2: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-2']['p'], scores[0]['rouge-2']['r'], scores[0]['rouge-2']['f']))
print("ROUGE-L: Precision: {}, Recall: {}, F1-Score: {}".format(scores[0]['rouge-l']['p'], scores[0]['rouge-l']['r'], scores[0]['rouge-l']['f']))
