In [None]:
pip install torch transformers scikit-learn pandas matplotlib

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AdamW,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    get_linear_schedule_with_warmup,
)

# Set device: use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token, so use eos_token

# Load the dataset from CSV
df = pd.read_csv('reviews.csv')  # Ensure the CSV has 'Sentence' and 'Aspects' columns

# Drop rows with a missing sentence or aspect: pandas reads empty cells as
# float NaN, which would otherwise be formatted into prompts/targets as the
# literal text "nan".
df = df.dropna(subset=['Sentence', 'Aspects']).reset_index(drop=True)

# Extract columns as plain strings (numeric-looking cells are coerced to str)
sentences = df['Sentence'].astype(str).tolist()
aspects = df['Aspects'].astype(str).tolist()

# Custom Dataset for GPT-2
class GPT2AspectDataset(Dataset):
    """Fixed-length causal-LM examples of the form ``Review: <sentence> Aspect: <aspect><eos>``.

    Each item holds the prompt and the target in ONE token sequence, because a
    causal language model computes its loss by comparing ``labels`` with
    ``input_ids`` position by position. Prompt and padding positions are set to
    -100 in ``labels`` so that only the aspect tokens (and the closing EOS)
    contribute to the loss.

    Args:
        sentences: Sequence of review texts.
        aspects: Sequence of target aspect strings, parallel to ``sentences``.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list
            for a single text; must expose ``eos_token_id`` and
            ``pad_token_id``.
        max_length: Length every returned tensor is padded/truncated to.

    Raises:
        ValueError: If ``sentences`` and ``aspects`` differ in length, or if
            ``max_length`` is smaller than 1.
    """

    def __init__(self, sentences, aspects, tokenizer, max_length=128):
        if len(sentences) != len(aspects):
            raise ValueError(
                f"sentences ({len(sentences)}) and aspects ({len(aspects)}) "
                "must have the same length"
            )
        if max_length < 1:
            raise ValueError("max_length must be at least 1")
        self.sentences = sentences
        self.aspects = aspects
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (sentence, aspect) pairs."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``.

        All three are 1-D ``torch.long`` tensors of length ``max_length``, so
        the default DataLoader collation can stack them into a batch.
        """
        sentence = self.sentences[idx]
        aspect = self.aspects[idx]
        input_text = f"Review: {sentence} Aspect:"
        # Leading space so the target continues the prompt as a separate word.
        target_text = f" {aspect}"

        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id

        # Always keep the closing EOS, even when the aspect text is truncated,
        # so the model learns where to stop generating.
        target_ids = list(self.tokenizer(target_text)['input_ids'])
        target_ids = target_ids[:self.max_length - 1] + [eos_id]

        # Give the prompt whatever room is left. Keep its tail so the
        # "Aspect:" cue survives truncation of long reviews.
        prompt_budget = self.max_length - len(target_ids)
        prompt_ids = list(self.tokenizer(input_text)['input_ids'])
        prompt_ids = prompt_ids[-prompt_budget:] if prompt_budget > 0 else []

        used = len(prompt_ids) + len(target_ids)
        pad_len = self.max_length - used

        input_ids = prompt_ids + target_ids + [pad_id] * pad_len
        attention_mask = [1] * used + [0] * pad_len
        # -100 is the index ignored by the cross-entropy loss. Padding is
        # masked explicitly here because the pad id may equal the EOS id.
        labels = [-100] * len(prompt_ids) + target_ids + [-100] * pad_len

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Create Dataset and DataLoader
dataset = GPT2AspectDataset(sentences, aspects, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)  # Adjust batch size based on resources

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # Adjust model to tokenizer size
model.to(device)

# Define optimizer and scheduler.
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favor of torch.optim.AdamW. weight_decay=0.0 keeps the deprecated class's
# default (torch's own default is 0.01).
optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
# The scheduler decays the learning rate to zero over total_steps, so the
# factor below must match `epochs` in the training loop.
total_steps = len(dataloader) * 4  # Assume 4 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop: fine-tune for a fixed number of epochs and record the mean
# batch loss of each epoch in `train_losses`.
train_losses = []
epochs = 4
for epoch in range(epochs):
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Move the three tensors the model consumes onto the training device
        model_inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device),
        }

        # Clear gradients left over from the previous step
        model.zero_grad()
        loss = model(**model_inputs).loss
        running_loss += loss.item()

        # Backpropagate, then advance the optimizer and the LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = running_loss / len(dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}")

# Plot the learning curve: mean training loss per epoch (x is the 0-based epoch index)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses, label='Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Learning Curve')
ax.legend()
plt.show()

# Evaluation on the dataset
# NOTE: these are the same examples the model was trained on, so the scores
# below are optimistic; use a held-out split for a real estimate.
model.eval()
predictions = []
actuals = []
eval_batch_size = dataloader.batch_size or 1
max_new_tokens = 50  # budget for the generated aspect, NOT counting the prompt

# Batched generation with a decoder-only model needs left padding so that
# every prompt ends directly before the first generated token. Truncating on
# the left keeps the trailing "Aspect:" cue of long reviews.
saved_padding_side = tokenizer.padding_side
saved_truncation_side = tokenizer.truncation_side
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
try:
    with torch.no_grad():
        for start in range(0, len(sentences), eval_batch_size):
            stop = start + eval_batch_size
            # Rebuild the prompts without the answer; generating from the
            # training tensors would feed the target to the model.
            prompts = [f"Review: {sentence} Aspect:" for sentence in sentences[start:stop]]
            encoded = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=dataset.max_length,
            ).to(device)

            outputs = model.generate(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
            )

            # generate() returns prompt + continuation; keep only the
            # continuation so predictions are comparable to the aspect text.
            prompt_length = encoded['input_ids'].shape[1]
            decoded_preds = [
                tokenizer.decode(output[prompt_length:], skip_special_tokens=True).strip()
                for output in outputs
            ]
            decoded_labels = [str(aspect).strip() for aspect in aspects[start:stop]]

            predictions.extend(decoded_preds)
            actuals.extend(decoded_labels)
finally:
    # Restore the tokenizer settings so later cells see the original behavior
    tokenizer.padding_side = saved_padding_side
    tokenizer.truncation_side = saved_truncation_side

# Evaluation Metrics
# Each distinct string is treated as one class and a prediction counts only on
# an exact match. Adjust matching criteria based on your data format.
# zero_division=0: a true label that is never predicted has undefined
# precision; scoring it as 1 would let a model that gets everything wrong
# report a weighted precision of 1.0.
accuracy = accuracy_score(actuals, predictions)
precision = precision_score(actuals, predictions, average='weighted', zero_division=0)
recall = recall_score(actuals, predictions, average='weighted', zero_division=0)
f1 = f1_score(actuals, predictions, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Plotting a simplified confusion matrix (adjust labels if there are more classes)
# Fix the row/column order explicitly so the tick labels on the heatmap are
# guaranteed to line up with the matrix entries.
class_labels = sorted(set(actuals) | set(predictions))
conf_matrix = confusion_matrix(actuals, predictions, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
