In [24]:
# 📌 STEP 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# 📌 STEP 2: Install Dependencies
!pip install transformers scikit-learn --quiet


In [26]:
# 📌 STEP 3: Import Required Libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [31]:
# 📌 STEP 4: Load Your Labeled Dataset from Google Drive
data_path = '/content/drive/MyDrive/Code/Restaurent_data/augmented_data_restaurant_bert.csv'  # <- change this
df = pd.read_csv(data_path)

# Assume your CSV has columns: 'review', aspect, polarity
# If 'sentence' is named differently, change it here
df['text'] = df['aspect_category'] + " [ASP] " + df['aspect_term']

In [43]:
# 📌 STEP 5: Tokenization & Dataset Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        # Convert string labels to numerical labels
        self.labels = [0 if label == "Negative" else 1 if label == "Neutral" else 2 for label in labels]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Ensure the text is a string before tokenization
        text = str(self.texts[idx])  # Convert to string if not already
        encoded = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['polarity'], test_size=0.2, random_state=42)

# Ensure all elements in train_texts and val_texts are strings
train_texts = train_texts.astype(str).tolist()  # Convert to strings and then to list
val_texts = val_texts.astype(str).tolist()  # Convert to strings and then to list


train_dataset = ReviewDataset(train_texts, train_labels.tolist(), tokenizer)
val_dataset = ReviewDataset(val_texts, val_labels.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [44]:
# 📌 STEP 6: Define BERT Classifier
class BERTClassifier(nn.Module):
    def __init__(self, hidden_size=768, num_labels=3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(self.dropout(pooled_output))


In [None]:
# 📌 STEP 7: Model Training
# This step was missing and is where train_losses is populated
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 5  # Adjust as needed
train_losses = [] # Initialize train_losses

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")



In [35]:
# 📌 STEP 8: Evaluate Model
def evaluate_model():
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=1)
            preds.extend(predictions.cpu().numpy())
            true.extend(labels.cpu().numpy())

    print("\nClassification Report:")
    print(classification_report(true, preds, target_names=["Negative", "Neutral", "Positive"]))
    print("Confusion Matrix:")
    print(confusion_matrix(true, preds))


In [36]:
# 📌 STEP 9: Plot Loss
plt.plot(train_losses)
plt.title("Training Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.show()


NameError: name 'train_losses' is not defined

In [37]:
# 📌 STEP 10: Train and Evaluate
train_model(epochs=3)
evaluate_model()


NameError: name 'train_model' is not defined