In [None]:
# @title Step 0: Setup and Installations
!pip install transformers datasets seqeval evaluate accelerate -q -U # Added accelerate for Trainer

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [

In [None]:
# @title Step 0: Setup and Installations


import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset as TorchDataset, DataLoader
from datasets import Dataset as HfDataset, DatasetDict # Renamed Hugging Face Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support # For LSTM eval
from collections import Counter, defaultdict
import evaluate # Use the evaluate library for metrics
import logging
import json
import re
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# --- Download NLTK data ---
try:
    nltk.data.find('tokenizers/punkt')
except LookupError: # Use LookupError instead of DownloadError
    nltk.download('punkt', quiet=True)

In [None]:
# Suppress excessive warnings
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
logging.getLogger("datasets").setLevel(logging.ERROR) # Reduce datasets logging

In [None]:
# --- Configuration ---
XML_FILE = 'Restaurants_Train.xml' # Make sure this file is uploaded
TRANSFORMER_MODEL_NAME = 'roberta-base' # Name consistency
TEST_SIZE = 0.2
RANDOM_SEED = 42

# --- LSTM Hyperparameters ---
LSTM_EMBEDDING_DIM = 100
LSTM_HIDDEN_DIM = 128
LSTM_NUM_LAYERS = 1
LSTM_DROPOUT = 0.3
LSTM_LR = 0.001
LSTM_EPOCHS = 20 # Train LSTM for a bit longer
LSTM_BATCH_SIZE = 16
LSTM_MAX_SEQ_LEN = 100 # Max sequence length for LSTM input

# --- Full Polarity Labels ---
POLARITY_LIST = ['positive', 'negative', 'neutral', 'conflict'] # All classes
POLARITY_MAP = {label: i for i, label in enumerate(POLARITY_LIST)}
NUM_POLARITY_LABELS = len(POLARITY_LIST)
print(f"Full Polarity Labels: {POLARITY_LIST}")
print(f"Polarity Label Map: {POLARITY_MAP}")

Full Polarity Labels: ['positive', 'negative', 'neutral', 'conflict']
Polarity Label Map: {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}


In [None]:
# --- Check for GPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: Tesla T4


In [None]:
# @title Step 1: Data Loading and Parsing (All Polarities)

def parse_restaurant_xml_full(xml_file):
    """Parses the SemEval Restaurant XML format, keeping all polarities."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    data = []

    for sentence_elem in root.findall('.//sentence'):
        sentence_id = sentence_elem.get('id')
        text_elem = sentence_elem.find('text')
        text = text_elem.text if text_elem is not None else None

        if not text:
            print(f"Warning: Sentence {sentence_id} has no text.")
            continue

        aspect_terms = []
        aspect_term_elems = sentence_elem.find('aspectTerms')
        if aspect_term_elems is not None:
            for term_elem in aspect_term_elems.findall('aspectTerm'):
                term_text = term_elem.get('term')
                polarity = term_elem.get('polarity')
                try:
                    # Use get with default to handle missing attributes gracefully
                    from_str = term_elem.get('from')
                    to_str = term_elem.get('to')

                    if from_str is None or to_str is None or term_text is None or polarity is None:
                         print(f"Skipping term due to missing attributes in sentence {sentence_id}: {term_elem.attrib}")
                         continue

                    from_idx = int(from_str)
                    to_idx = int(to_str)
                    # Use POLARITY_MAP, default to -1 if polarity not found (shouldn't happen with this data)
                    pol_label = POLARITY_MAP.get(polarity, -1)

                    if pol_label != -1: # Ensure polarity is valid
                         aspect_terms.append({
                             'term': term_text,
                             'polarity': polarity,
                             'polarity_label': pol_label,
                             'from': from_idx,
                             'to': to_idx
                         })
                    else:
                         print(f"Warning: Unknown polarity '{polarity}' for term '{term_text}' in sentence {sentence_id}. Skipping.")

                except (ValueError, TypeError) as e:
                     print(f"Skipping term due to invalid from/to in sentence {sentence_id}: {term_elem.attrib} - Error: {e}")

        # We don't need aspect categories for this specific request
        # aspect_categories = [] # ... parsing logic if needed ...

        data.append({
            'id': sentence_id,
            'text': text,
            'aspect_terms': aspect_terms, # List of terms with full polarity info
            # 'aspect_categories': aspect_categories # Include if needed later
        })
    return data

In [None]:
# --- Load Data ---
raw_data_full = parse_restaurant_xml_full(XML_FILE)
print(f"Loaded {len(raw_data_full)} sentences.")
if not raw_data_full:
     raise ValueError("Failed to load any data. Check XML file path and format.")

Loaded 3044 sentences.


In [None]:
# Example:
print("\nExample Raw Data Entry (Full Polarity):")
example_entry = next((item for item in raw_data_full if item['aspect_terms']), raw_data_full[0]) # Show first with aspects or just first
print(json.dumps(example_entry, indent=2))



Example Raw Data Entry (Full Polarity):
{
  "id": "3121",
  "text": "But the staff was so horrible to us.",
  "aspect_terms": [
    {
      "term": "staff",
      "polarity": "negative",
      "polarity_label": 1,
      "from": 8,
      "to": 13
    }
  ]
}


In [None]:
# --- Split Data ---
# Keep original list format for LSTM, create HF Dataset for Transformers
df = pd.DataFrame(raw_data_full)
train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_SEED)
train_raw_list = train_df.to_dict('records') # For LSTM
test_raw_list = test_df.to_dict('records')   # For LSTM & Final Eval Loop

# --- Create Hugging Face Datasets (for Transformer models) ---
train_dataset_hf = HfDataset.from_pandas(train_df)
test_dataset_hf = HfDataset.from_pandas(test_df)
raw_datasets_hf = DatasetDict({'train': train_dataset_hf, 'test': test_dataset_hf})

print(f"Train examples: {len(train_raw_list)}, Test examples: {len(test_raw_list)}")
print("\nHF Dataset structure (Full Polarity):")
print(raw_datasets_hf)

Train examples: 2435, Test examples: 609

HF Dataset structure (Full Polarity):
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'aspect_terms', '__index_level_0__'],
        num_rows: 2435
    })
    test: Dataset({
        features: ['id', 'text', 'aspect_terms', '__index_level_0__'],
        num_rows: 609
    })
})


In [None]:
# @title Step 2a: Data Preparation for Aspect Term Extraction (ATE)

# --- ATE Labels (BIO) ---
ate_label_list = ['O', 'B-ASP', 'I-ASP']
ate_label_map = {label: i for i, label in enumerate(ate_label_list)}
num_ate_labels = len(ate_label_list)

# --- Tokenizer ---
# Ensure tokenizer is loaded only once if possible
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels_ate(examples):
    """Tokenizes text and aligns character-level spans to token-level BIO labels for ATE."""
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        is_split_into_words=False,
        return_offsets_mapping=True,
        max_length=512, # Set a reasonable max length
        padding=False # Collator will handle padding
    )
    all_labels = []

    for i, sentence_aspects in enumerate(examples["aspect_terms"]): # Use 'aspect_terms' field
        doc_labels = []
        offset_mapping = tokenized_inputs["offset_mapping"][i]
        doc_labels = [ate_label_map['O']] * len(offset_mapping) # Initialize with 'O'

        for aspect_info in sentence_aspects:
            start_char = aspect_info['from']
            end_char = aspect_info['to']

            # Map character spans to token indices
            token_start_index = -1
            token_end_index = -1
            for idx, (start, end) in enumerate(offset_mapping):
                if start == 0 and end == 0: continue # Skip special tokens
                if token_start_index == -1 and start <= start_char < end:
                    token_start_index = idx
                if start < end_char:
                    token_end_index = idx

            # Assign BIO labels
            if token_start_index != -1 and token_end_index != -1 and token_start_index <= token_end_index:
                doc_labels[token_start_index] = ate_label_map['B-ASP']
                for k in range(token_start_index + 1, token_end_index + 1):
                    # Check if k is within bounds before assigning
                    if k < len(doc_labels):
                         doc_labels[k] = ate_label_map['I-ASP']
                    # else:
                         # This case (k >= len(doc_labels)) might occur if aspect span goes beyond max_length cutoff
                         # print(f"Warning: Token index {k} out of bounds for doc_labels (len {len(doc_labels)}) in sentence {i}")


        # Align labels with word pieces, setting subsequent pieces to -100
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        if word_ids is None: # Safety check
            all_labels.append([-100] * len(doc_labels))
            continue

        previous_word_idx = None
        final_labels = []
        for k, word_idx in enumerate(word_ids):
            if k >= len(doc_labels): break # Stop if word_ids index exceeds labels list length

            if word_idx is None: # Special token
                final_labels.append(-100)
            elif word_idx != previous_word_idx: # First token of a new word
                final_labels.append(doc_labels[k])
            else: # Subsequent token of the same word
                # If part of an aspect (I-ASP), keep the label, otherwise ignore
                if doc_labels[k] == ate_label_map['I-ASP']:
                    final_labels.append(ate_label_map['I-ASP'])
                else:
                    final_labels.append(-100)
            previous_word_idx = word_idx
        all_labels.append(final_labels)

    tokenized_inputs["labels"] = all_labels
    # Remove mapping, not needed by model
    if "offset_mapping" in tokenized_inputs:
        tokenized_inputs.pop("offset_mapping")
    return tokenized_inputs

In [None]:
# --- Apply Tokenization and Alignment for ATE ---
cols_to_remove_ate = [col for col in raw_datasets_hf["train"].column_names if col not in ["text", "aspect_terms"]]
tokenized_datasets_ate = raw_datasets_hf.map(
    tokenize_and_align_labels_ate, batched=True,
    remove_columns=cols_to_remove_ate + ["aspect_terms"]
)

print("\nTokenized ATE dataset structure:")
print(tokenized_datasets_ate)
print("\nExample ATE tokenized entry (showing input_ids and labels):")
print(f"Input IDs: {tokenized_datasets_ate['train'][0]['input_ids']}")
print(f"Labels:    {tokenized_datasets_ate['train'][0]['labels']}")

Map:   0%|          | 0/2435 [00:00<?, ? examples/s]

Map:   0%|          | 0/609 [00:00<?, ? examples/s]


Tokenized ATE dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2435
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 609
    })
})

Example ATE tokenized entry (showing input_ids and labels):
Input IDs: [0, 2387, 1623, 56, 5, 10969, 3998, 879, 6, 14140, 6, 8, 2480, 6353, 8, 37, 3776, 70, 155, 7484, 4, 2]
Labels:    [-100, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 0, -100]


In [None]:
# @title Step 2b: Aspect Term Extraction (ATE) - Model Training
# --- ATE Model & Training ---
model_ate = AutoModelForTokenClassification.from_pretrained(
    TRANSFORMER_MODEL_NAME,
    num_labels=num_ate_labels,
    id2label={i: l for i, l in enumerate(ate_label_list)},
    label2id=ate_label_map
).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# --- Data Collator ---
data_collator_ate = DataCollatorForTokenClassification(tokenizer=tokenizer)

# --- Metrics ---
seqeval_metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics_ate(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [ate_label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ate_label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # Handle cases where there might be no predicted/true labels after filtering -100
    if not any(true_labels) and not any(true_predictions):
         print("Warning: No labels found for metric calculation in this batch.")
         # Return default values or skip calculation for this batch
         return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# --- Training Arguments ---
training_args_ate = TrainingArguments(
    output_dir="./results/ate_full", # Distinct dir
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10, # Maybe slightly more epochs for potentially more complex patterns
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    report_to="none",
    logging_steps=50,
)

In [None]:
# --- Trainer ---
trainer_ate = Trainer(
    model=model_ate,
    args=training_args_ate,
    train_dataset=tokenized_datasets_ate["train"],
    eval_dataset=tokenized_datasets_ate["test"],
    tokenizer=tokenizer,
    data_collator=data_collator_ate,
    compute_metrics=compute_metrics_ate,
)

# --- Train ---
print("\n--- Starting ATE Model Training (Full Polarity Data) ---")
trainer_ate.train()
print("\n--- ATE Model Training Finished ---")

  trainer_ate = Trainer(



--- Starting ATE Model Training (Full Polarity Data) ---


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0779,0.073382,0.851852,0.88579,0.86849,0.976523
2,0.0647,0.071598,0.856959,0.883134,0.86985,0.977764
3,0.0319,0.086271,0.85,0.903054,0.875724,0.977867
4,0.0185,0.101459,0.880105,0.887118,0.883598,0.979522
5,0.0066,0.11604,0.87969,0.903054,0.891219,0.98035
6,0.0133,0.133163,0.868047,0.891102,0.879423,0.977971
7,0.0065,0.134482,0.870013,0.897742,0.88366,0.978591
8,0.0031,0.146435,0.865729,0.89907,0.882085,0.978488
9,0.0031,0.151517,0.873548,0.89907,0.886126,0.979005
10,0.0005,0.152287,0.87808,0.89907,0.888451,0.979626



--- ATE Model Training Finished ---


In [None]:
# --- Evaluate ---
print("\n--- Evaluating ATE Model ---")
eval_results_ate = trainer_ate.evaluate()
print(eval_results_ate)


--- Evaluating ATE Model ---


{'eval_loss': 0.11603996157646179, 'eval_precision': 0.8796895213454075, 'eval_recall': 0.9030544488711819, 'eval_f1': 0.891218872870249, 'eval_accuracy': 0.9803495707932568, 'eval_runtime': 1.6339, 'eval_samples_per_second': 372.733, 'eval_steps_per_second': 47.127, 'epoch': 10.0}


In [None]:
# --- Save Model ---
ate_model_path_full = "./fine_tuned_ate_model_full"
trainer_ate.save_model(ate_model_path_full)
tokenizer.save_pretrained(ate_model_path_full) # Save tokenizer with the model
print(f"ATE model saved to {ate_model_path_full}")

ATE model saved to ./fine_tuned_ate_model_full


In [None]:
# @title Step 2c: Aspect Term Extraction (ATE) - Inference Function (Same as before)

# This function remains the same as it only extracts based on BIO tags
def extract_aspects_from_text(text, model_path, tokenizer_path):
    """Uses the fine-tuned ATE model to extract aspect terms from text."""
    try:
        local_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        local_model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
        local_model.eval()
    except Exception as e:
        print(f"Error loading ATE model/tokenizer from {model_path}: {e}")
        return []

    inputs = local_tokenizer(text, return_tensors="pt", truncation=True, return_offsets_mapping=True, max_length=512)
    offset_mapping = inputs.pop("offset_mapping").squeeze().tolist()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = local_model(**inputs).logits

    predictions = torch.argmax(logits, dim=2).squeeze().tolist()
    input_ids = inputs["input_ids"].squeeze().tolist()
    id2label = local_model.config.id2label

    aspects = []
    current_aspect_tokens = []
    current_aspect_start_char = -1
    current_aspect_end_char = -1 # Track end character precisely

    for i, pred_id in enumerate(predictions):
        if input_ids[i] in [local_tokenizer.cls_token_id, local_tokenizer.sep_token_id, local_tokenizer.pad_token_id]:
            continue
        start_char, end_char = offset_mapping[i]
        if start_char == end_char: continue

        pred_label = id2label[pred_id]

        if pred_label == 'B-ASP':
            if current_aspect_tokens: # Finalize previous aspect
                final_text = text[current_aspect_start_char:current_aspect_end_char]
                aspects.append({"term": final_text.strip(), "from": current_aspect_start_char, "to": current_aspect_end_char})
            # Start new aspect
            current_aspect_tokens = [input_ids[i]]
            current_aspect_start_char = start_char
            current_aspect_end_char = end_char
        elif pred_label == 'I-ASP':
            if current_aspect_tokens: # Continue current aspect
                current_aspect_tokens.append(input_ids[i])
                current_aspect_end_char = end_char # Update end char
            # else: Ignore I-ASP without B-ASP
        elif pred_label == 'O':
            if current_aspect_tokens: # Finalize aspect
                final_text = text[current_aspect_start_char:current_aspect_end_char]
                aspects.append({"term": final_text.strip(), "from": current_aspect_start_char, "to": current_aspect_end_char})
                current_aspect_tokens = []
                current_aspect_start_char = -1

    # Add last aspect if sentence ended with one
    if current_aspect_tokens:
        final_text = text[current_aspect_start_char:current_aspect_end_char]
        aspects.append({"term": final_text.strip(), "from": current_aspect_start_char, "to": current_aspect_end_char})

    return aspects

In [None]:
# @title Step 3a: Data Preparation for Aspect Sentiment Classification (ASC - Full Polarity)

def prepare_asc_data_transformer(examples):
    """Prepares data for ASC: pairs (sentence, aspect_term) -> full_polarity_label."""
    processed_texts = []
    processed_labels = []

    for i in range(len(examples["text"])):
        sentence = examples["text"][i]
        aspect_terms = examples["aspect_terms"][i] # Use the field with full data

        for term_info in aspect_terms:
            # Use ground truth terms and their full polarity labels for training
            text_pair = f"{sentence} [SEP] {term_info['term']}"
            processed_texts.append(text_pair)
            # Append the polarity_label (0, 1, 2, or 3)
            processed_labels.append(term_info['polarity_label'])

    # Tokenize the pairs
    tokenized = tokenizer(processed_texts, truncation=True, padding=False, max_length=512) # Padding handled by collator
    tokenized['labels'] = processed_labels
    return tokenized


In [None]:
# --- Apply ASC Data Preparation ---
# Use the original raw_datasets which contains the ground truth terms and full polarities
tokenized_datasets_asc_transformer = raw_datasets_hf.map(
    prepare_asc_data_transformer, batched=True,
    remove_columns=raw_datasets_hf["train"].column_names
)

# Filter out examples where processing might have failed (e.g., no terms)
# The map function handles batching, so filtering might be complex here.
# Instead, ensure the prepare function handles empty aspect lists gracefully.
# Let's check the dataset size.
print(f"Number of ASC training examples: {len(tokenized_datasets_asc_transformer['train'])}")
print(f"Number of ASC test examples: {len(tokenized_datasets_asc_transformer['test'])}")
if len(tokenized_datasets_asc_transformer['train']) == 0:
     raise ValueError("ASC training dataset is empty after preparation. Check data processing.")


print("\nTokenized ASC dataset structure (Full Polarity):")
print(tokenized_datasets_asc_transformer)
print("\nExample ASC tokenized entry (showing input_ids and label):")
print(f"Input IDs: {tokenized_datasets_asc_transformer['train'][0]['input_ids']}")
print(f"Label:     {tokenized_datasets_asc_transformer['train'][0]['labels']} ({POLARITY_LIST[tokenized_datasets_asc_transformer['train'][0]['labels']]})")


Map:   0%|          | 0/2435 [00:00<?, ? examples/s]

Map:   0%|          | 0/609 [00:00<?, ? examples/s]

Number of ASC training examples: 2946
Number of ASC test examples: 753

Tokenized ASC dataset structure (Full Polarity):
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2946
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 753
    })
})

Example ASC tokenized entry (showing input_ids and label):
Input IDs: [0, 2387, 1623, 56, 5, 10969, 3998, 879, 6, 14140, 6, 8, 2480, 6353, 8, 37, 3776, 70, 155, 7484, 4, 646, 3388, 510, 742, 10969, 3998, 879, 2]
Label:     0 (positive)


In [None]:
# @title Step 3b: Aspect Sentiment Classification (ASC) - Model Training (Full Polarity)

# --- Model ---
asc_id2label = {i: label for i, label in enumerate(POLARITY_LIST)}
asc_label2id = {label: i for i, label in enumerate(POLARITY_LIST)}

model_asc = AutoModelForSequenceClassification.from_pretrained(
    TRANSFORMER_MODEL_NAME,
    num_labels=NUM_POLARITY_LABELS, # Should be 4
    id2label=asc_id2label,
    label2id=asc_label2id
).to(device)

# --- Data Collator ---
data_collator_asc = DataCollatorWithPadding(tokenizer=tokenizer)

# --- Metrics ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
def compute_metrics_asc(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    # Use 'weighted' f1 for multi-class imbalance, 'micro' for overall accuracy equivalent, 'macro' for unweighted average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": acc["accuracy"],
        "f1_weighted": f1["f1"],
    }

# --- Training Arguments ---
training_args_asc = TrainingArguments(
    output_dir="./results/asc_full", # Distinct dir
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=5, # Match ATE epochs or adjust
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted", # Optimize for weighted F1
    fp16=True,
    save_total_limit=1,
    push_to_hub=False,
    report_to="none",
    logging_steps=50,
)

# --- Trainer ---
trainer_asc = Trainer(
    model=model_asc,
    args=training_args_asc,
    train_dataset=tokenized_datasets_asc_transformer["train"],
    eval_dataset=tokenized_datasets_asc_transformer["test"],
    tokenizer=tokenizer,
    data_collator=data_collator_asc,
    compute_metrics=compute_metrics_asc,
)

  trainer_asc = Trainer(


In [None]:
# --- Train ---
print("\n--- Starting ASC Model Training (Full Polarity) ---")
trainer_asc.train()
print("\n--- ASC Model Training Finished ---")


--- Starting ASC Model Training (Full Polarity) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.7394,0.701312,0.719788,0.697246
2,0.4948,0.584557,0.803453,0.780991
3,0.4434,0.640456,0.816733,0.80214
4,0.1628,0.918217,0.814077,0.80966



--- ASC Model Training Finished ---


In [None]:
# --- Evaluate ---
print("\n--- Evaluating ASC Model (Full Polarity) ---")
eval_results_asc = trainer_asc.evaluate()
print(eval_results_asc)


--- Evaluating ASC Model (Full Polarity) ---


{'eval_loss': 0.9182167053222656, 'eval_accuracy': 0.8140770252324038, 'eval_f1_weighted': 0.8096597150301473, 'eval_runtime': 1.4098, 'eval_samples_per_second': 534.12, 'eval_steps_per_second': 67.386, 'epoch': 4.987788331071913}


In [None]:
# --- Save Model ---
asc_model_path_full = "./fine_tuned_asc_model_full"
trainer_asc.save_model(asc_model_path_full)
tokenizer.save_pretrained(asc_model_path_full) # Save tokenizer with the model
print(f"ASC model saved to {asc_model_path_full}")

ASC model saved to ./fine_tuned_asc_model_full


In [None]:
# @title Step 3c: Aspect Sentiment Classification (ASC) - Inference Function (Full Polarity)

def classify_aspect_sentiment_transformer(sentence, aspect_term, model_path, tokenizer_path):
    """Classifies the sentiment (full polarity) of a given aspect term within a sentence."""
    try:
        local_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        local_model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        local_model.eval()
    except Exception as e:
        print(f"Error loading ASC model/tokenizer from {model_path}: {e}")
        return "error_loading_model"

    # Prepare input using the [CLS] sentence [SEP] aspect_term [SEP] format
    text_pair = f"{sentence} [SEP] {aspect_term}"
    inputs = local_tokenizer(text_pair, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    # Get predictions
    with torch.no_grad():
        logits = local_model(**inputs).logits

    predicted_class_id = torch.argmax(logits, dim=1).item()

    # Use the model's config to map id back to label string
    # Ensure the loaded model has the correct id2label config
    if hasattr(local_model.config, 'id2label'):
         predicted_label = local_model.config.id2label[predicted_class_id]
    else:
         # Fallback if config is missing (shouldn't happen if saved correctly)
         predicted_label = POLARITY_LIST[predicted_class_id]


    # Optionally return probabilities/scores
    # probabilities = torch.softmax(logits, dim=1).squeeze().tolist()
    # score = probabilities[predicted_class_id]
    # return predicted_label, score

    return predicted_label

In [None]:
# --- Inference Example (using results from ATE) ---
print("\n--- ASC Inference Example (Full Polarity) ---")
# Re-run ATE inference on a test sentence to get aspects
test_sentence_idx = 10 # Choose an index
test_sentence = raw_datasets_hf['test'][test_sentence_idx]['text']
print(f"Test Sentence: '{test_sentence}'")

extracted_aspects = extract_aspects_from_text(test_sentence, ate_model_path_full, ate_model_path_full)

if extracted_aspects:
    print("Extracted Aspects and Predicted Sentiments:")
    for aspect in extracted_aspects:
        term = aspect['term']
        predicted_sentiment = classify_aspect_sentiment_transformer(
            test_sentence, term, asc_model_path_full, asc_model_path_full
        )
        print(f"- Aspect: '{term}', Predicted Sentiment: {predicted_sentiment}")
else:
    print("No aspects were extracted by ATE for this sentence.")


--- ASC Inference Example (Full Polarity) ---
Test Sentence: 'i recommend the thai popcorn :)'
Extracted Aspects and Predicted Sentiments:
- Aspect: 'thai popcorn', Predicted Sentiment: positive


In [None]:
# @title Step 4: Full Pipeline Inference Example

def run_absa_pipeline_full(text, ate_model_path, asc_model_path):
    """Runs the ATE -> ASC pipeline for full polarity classification."""
    results = {"text": text, "aspects": []}

    # 1. Aspect Term Extraction (ATE)
    ate_tokenizer = AutoTokenizer.from_pretrained(ate_model_path) # Re-load locally if needed
    extracted_terms = extract_aspects_from_text(text, ate_model_path, ate_model_path)

    # 2. Aspect Sentiment Classification (ASC) for each extracted term
    if extracted_terms:
        # Load ASC resources once if classifying multiple terms for the same sentence
        asc_tokenizer = AutoTokenizer.from_pretrained(asc_model_path)
        asc_model = AutoModelForSequenceClassification.from_pretrained(asc_model_path).to(device)
        asc_model.eval()

        for term_info in extracted_terms:
            term_text = term_info['term']

            # Prepare input for ASC
            text_pair = f"{text} [SEP] {term_text}"
            inputs = asc_tokenizer(text_pair, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

            # Get prediction
            with torch.no_grad():
                logits = asc_model(**inputs).logits
            predicted_class_id = torch.argmax(logits, dim=1).item()
            sentiment = asc_model.config.id2label[predicted_class_id] # Use model's map

            results["aspects"].append({
                "term": term_text,
                "from": term_info["from"],
                "to": term_info["to"],
                "sentiment": sentiment
            })
    else:
        print(f"Pipeline: No aspect terms extracted for: '{text}'")

    return results

In [None]:
# --- Run Full Pipeline ---
print("\n--- Full Pipeline Inference Examples (Full Polarity) ---")

# Example 1: Sentence ID 3121
sentence_3121 = "But the staff was so horrible to us."
print(f"\nInput: {sentence_3121}")
result_3121 = run_absa_pipeline_full(sentence_3121, ate_model_path_full, asc_model_path_full)
print("Output:")
print(json.dumps(result_3121, indent=2))

# Example 2: Sentence ID 1634
sentence_1634 = "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not."
print(f"\nInput: {sentence_1634}")
result_1634 = run_absa_pipeline_full(sentence_1634, ate_model_path_full, asc_model_path_full)
print("Output:")
print(json.dumps(result_1634, indent=2))

# Example 3: Sentence ID 296 (Multiple terms, neg/neu)
sentence_296 = "They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it."
print(f"\nInput: {sentence_296}")
result_296 = run_absa_pipeline_full(sentence_296, ate_model_path_full, asc_model_path_full)
print("Output:")
print(json.dumps(result_296, indent=2))

# Example 4: Sentence ID 1793 (Conflict category, neutral terms)
sentence_1793 = "It took half an hour to get our check, which was perfect since we could sit, have drinks and talk!"
print(f"\nInput: {sentence_1793}")
result_1793 = run_absa_pipeline_full(sentence_1793, ate_model_path_full, asc_model_path_full)
print("Output:")
print(json.dumps(result_1793, indent=2))


--- Full Pipeline Inference Examples (Full Polarity) ---

Input: But the staff was so horrible to us.
Output:
{
  "text": "But the staff was so horrible to us.",
  "aspects": [
    {
      "term": "staff",
      "from": 8,
      "to": 13,
      "sentiment": "negative"
    }
  ]
}

Input: The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.
Output:
{
  "text": "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.",
  "aspects": [
    {
      "term": "food",
      "from": 4,
      "to": 8,
      "sentiment": "positive"
    },
    {
      "term": "kitchen",
      "from": 55,
      "to": 62,
      "sentiment": "positive"
    },
    {
      "term": "menu",
      "from": 141,
      "to": 145,
      "sentiment": "neutral"
    }
  ]
}

Input: They did not have mayonnaise, forgot our toast, left

#BaseLine Model - Using Bi Directional LSTM

In [None]:
# --- LSTM Data Preparation ---
def build_vocab(data, tokenizer_func=word_tokenize, min_freq=1):
    counter = Counter()
    for item in data: counter.update(tokenizer_func(item['text'].lower()))
    vocab = {'<PAD>': 0, '<UNK>': 1}; idx = 2
    for word, freq in counter.items():
        if freq >= min_freq: vocab[word] = idx; idx += 1
    return vocab

In [None]:
nltk.download('punkt', quiet=True) # Ensure punkt tokenizer data is downloaded
nltk.download('punkt_tab', quiet=True)

True

In [None]:
lstm_vocab = build_vocab(train_raw_list)
VOCAB_SIZE = len(lstm_vocab)
PAD_IDX = lstm_vocab['<PAD>']
print(f"LSTM Vocab Size: {VOCAB_SIZE}")

LSTM Vocab Size: 4063


In [None]:
def find_token_span(sentence_tokens, aspect_tokens):
    for i in range(len(sentence_tokens) - len(aspect_tokens) + 1):
        if sentence_tokens[i:i+len(aspect_tokens)] == aspect_tokens: return i, i + len(aspect_tokens) -1
    return -1, -1


In [None]:
class LstmAbsaDataset(TorchDataset):
    def __init__(self, data, vocab, tokenizer_func=word_tokenize, max_len=LSTM_MAX_SEQ_LEN):
        self.data = []; self.vocab = vocab; self.tokenizer_func = tokenizer_func
        self.max_len = max_len; self.unk_idx = vocab.get('<UNK>', 1)
        for item in data:
            sentence_tokens = self.tokenizer_func(item['text'].lower())
            for aspect_info in item['aspect_terms']:
                aspect_tokens = self.tokenizer_func(aspect_info['term'].lower())
                start_idx, end_idx = find_token_span(sentence_tokens, aspect_tokens)
                if start_idx == -1: continue # Skip if aspect not found exactly
                sentence_indices = [self.vocab.get(token, self.unk_idx) for token in sentence_tokens]
                self.data.append({'sentence_indices': sentence_indices, 'aspect_start': start_idx,
                                  'aspect_end': end_idx, 'label': aspect_info['polarity_label']})
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        item = self.data[idx]; indices = item['sentence_indices']; seq_len = len(indices)
        if seq_len > self.max_len:
            indices = indices[:self.max_len]
            item['aspect_start'] = min(item['aspect_start'], self.max_len - 1)
            item['aspect_end'] = min(item['aspect_end'], self.max_len - 1)
            seq_len = self.max_len
        else: indices.extend([PAD_IDX] * (self.max_len - seq_len))
        item['aspect_start'] = max(0, item['aspect_start'])
        item['aspect_end'] = max(0, item['aspect_end'])
        if item['aspect_start'] > item['aspect_end']: item['aspect_start'] = item['aspect_end']
        return {'input_ids': torch.tensor(indices, dtype=torch.long),
                'aspect_start': torch.tensor(item['aspect_start'], dtype=torch.long),
                'aspect_end': torch.tensor(item['aspect_end'], dtype=torch.long),
                'labels': torch.tensor(item['label'], dtype=torch.long)}

train_dataset_lstm = LstmAbsaDataset(train_raw_list, lstm_vocab)
test_dataset_lstm = LstmAbsaDataset(test_raw_list, lstm_vocab)

if len(train_dataset_lstm) == 0: raise ValueError("LSTM Training dataset empty.")
if len(test_dataset_lstm) == 0: print("Warning: LSTM Test dataset empty.")

In [None]:
def simple_collate_fn(batch):
    keys = batch[0].keys(); return {k: torch.stack([item[k] for item in batch]) for k in keys}

train_loader_lstm = DataLoader(train_dataset_lstm, batch_size=LSTM_BATCH_SIZE, shuffle=True, collate_fn=simple_collate_fn)
test_loader_lstm = DataLoader(test_dataset_lstm, batch_size=LSTM_BATCH_SIZE, shuffle=False, collate_fn=simple_collate_fn) if len(test_dataset_lstm) > 0 else None
print(f"LSTM Train Batches: {len(train_loader_lstm)}, Test Batches: {len(test_loader_lstm) if test_loader_lstm else 0}")


LSTM Train Batches: 183, Test Batches: 47


In [None]:
# --- LSTM Model Definition ---
class LstmAbsaClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, batch_first=True,
                           dropout=dropout if n_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim) # *2 for bidirectional
    def forward(self, input_ids, aspect_start, aspect_end):
        embedded = self.dropout(self.embedding(input_ids))
        outputs, (hidden, cell) = self.lstm(embedded)
        pooled_outputs = []
        for i in range(outputs.shape[0]):
            start = aspect_start[i].item(); end = min(aspect_end[i].item(), outputs.shape[1] - 1)
            start = min(start, end)
            pooled = torch.mean(outputs[i, start:end+1, :], dim=0) # Avg pooling
            pooled_outputs.append(pooled)
        pooled_batch = self.dropout(torch.stack(pooled_outputs))
        return self.fc(pooled_batch)


In [None]:
# --- Instantiate & Train LSTM Model ---
model_lstm = LstmAbsaClassifier(
    VOCAB_SIZE, LSTM_EMBEDDING_DIM, LSTM_HIDDEN_DIM, NUM_POLARITY_LABELS,
    LSTM_NUM_LAYERS, True, LSTM_DROPOUT, PAD_IDX
).to(device)
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=LSTM_LR)
criterion_lstm = nn.CrossEntropyLoss()

print("\n--- Training Baseline ASC (LSTM) Model ---")
best_lstm_f1 = -1.0
LSTM_MODEL_PATH = "./baseline_lstm_model.pt"

for epoch in range(LSTM_EPOCHS):
    model_lstm.train(); epoch_loss = 0
    for batch in train_loader_lstm:
        optimizer_lstm.zero_grad()
        ids = batch['input_ids'].to(device); start = batch['aspect_start'].to(device)
        end = batch['aspect_end'].to(device); labels = batch['labels'].to(device)
        predictions = model_lstm(ids, start, end)
        loss = criterion_lstm(predictions, labels); loss.backward(); optimizer_lstm.step()
        epoch_loss += loss.item()
    # Eval
    model_lstm.eval(); all_preds, all_labels = [], []
    if test_loader_lstm:
        with torch.no_grad():
            for batch in test_loader_lstm:
                ids = batch['input_ids'].to(device); start = batch['aspect_start'].to(device)
                end = batch['aspect_end'].to(device); labels = batch['labels'].to(device)
                predictions = model_lstm(ids, start, end)
                all_preds.extend(torch.argmax(predictions, dim=1).cpu().tolist())
                all_labels.extend(labels.cpu().tolist())
    epoch_avg_loss = epoch_loss / len(train_loader_lstm)
    if test_loader_lstm and all_labels:
        report = classification_report(all_labels, all_preds, target_names=POLARITY_LIST, zero_division=0, output_dict=True)
        f1_w = report['weighted avg']['f1-score']; acc = report['accuracy']
        print(f'Epoch {epoch+1}/{LSTM_EPOCHS} | Loss: {epoch_avg_loss:.4f} | Test Acc: {acc:.4f} | Test F1 (w): {f1_w:.4f}')
        if f1_w > best_lstm_f1:
             best_lstm_f1 = f1_w; torch.save(model_lstm.state_dict(), LSTM_MODEL_PATH)
             print(f"  -> New best LSTM model saved to {LSTM_MODEL_PATH}")
    else: print(f'Epoch {epoch+1}/{LSTM_EPOCHS} | Loss: {epoch_avg_loss:.4f} | Test Set Empty/No Preds')

print(f"Finished LSTM Training. Best F1: {best_lstm_f1:.4f}")
if best_lstm_f1 > -1: model_lstm.load_state_dict(torch.load(LSTM_MODEL_PATH))
print(f"Loaded best LSTM model from {LSTM_MODEL_PATH}")
model_lstm.eval() # Set to eval mode



--- Training Baseline ASC (LSTM) Model ---
Epoch 1/20 | Loss: 1.0314 | Test Acc: 0.6051 | Test F1 (w): 0.5548
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 2/20 | Loss: 0.8683 | Test Acc: 0.6345 | Test F1 (w): 0.6042
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 3/20 | Loss: 0.7411 | Test Acc: 0.6560 | Test F1 (w): 0.6281
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 4/20 | Loss: 0.6316 | Test Acc: 0.6292 | Test F1 (w): 0.6275
Epoch 5/20 | Loss: 0.5326 | Test Acc: 0.6519 | Test F1 (w): 0.6450
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 6/20 | Loss: 0.4421 | Test Acc: 0.6439 | Test F1 (w): 0.6422
Epoch 7/20 | Loss: 0.3722 | Test Acc: 0.6439 | Test F1 (w): 0.6519
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 8/20 | Loss: 0.2929 | Test Acc: 0.6627 | Test F1 (w): 0.6552
  -> New best LSTM model saved to ./baseline_lstm_model.pt
Epoch 9/20 | Loss: 0.2439 | Test Acc: 0.6506 | Test F1 (w): 0.6485

LstmAbsaClassifier(
  (embedding): Embedding(4063, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [None]:
# --- LSTM Inference Function ---
def classify_aspect_sentiment_lstm(sentence, aspect_term, model, vocab, tokenizer_func=word_tokenize, max_len=LSTM_MAX_SEQ_LEN):
    model.eval(); unk_idx = vocab.get('<UNK>', 1); pad_idx = vocab.get('<PAD>', 0)
    sentence_tokens = tokenizer_func(sentence.lower())
    aspect_tokens = tokenizer_func(aspect_term.lower())
    start_idx, end_idx = find_token_span(sentence_tokens, aspect_tokens)
    if start_idx == -1: return 'neutral' # Fallback
    indices = [vocab.get(token, unk_idx) for token in sentence_tokens]; seq_len = len(indices)
    if seq_len > max_len:
        indices = indices[:max_len]; start_idx = min(start_idx, max_len - 1); end_idx = min(end_idx, max_len - 1)
    else: indices.extend([pad_idx] * (max_len - seq_len))
    start_idx = max(0, start_idx); end_idx = max(0, end_idx)
    if start_idx > end_idx: start_idx = end_idx
    input_ids = torch.tensor([indices], dtype=torch.long).to(device)
    aspect_start = torch.tensor([start_idx], dtype=torch.long).to(device)
    aspect_end = torch.tensor([end_idx], dtype=torch.long).to(device)
    with torch.no_grad(): predictions = model(input_ids, aspect_start, aspect_end)
    pred_idx = torch.argmax(predictions, dim=1).item()
    return POLARITY_LIST[pred_idx]

In [None]:
# @title Step 5: End-to-End Evaluation and Comparison (with Accuracy)

def calculate_strict_metrics(true_pairs_list, pred_pairs_dict):
    """
    Calculates strict P, R, F1, and Accuracy (Jaccard Index)
    based on exact match of (term, sentiment).
    """
    # Convert list of ground truth dicts to a set of (term, polarity) tuples
    # Lowercase and strip term for robust comparison
    true_set = set((d['term'].strip().lower(), d['polarity']) for d in true_pairs_list)

    results = {}
    # Iterate through each approach's predictions (e.g., 'pipeline', 'baseline')
    for approach_name, pred_list in pred_pairs_dict.items():
        # Convert list of predicted dicts to a set of (term, sentiment) tuples
        pred_set = set((d['term'].strip().lower(), d['sentiment']) for d in pred_list)

        # Calculate Intersection (True Positives)
        true_positives = len(true_set.intersection(pred_set))

        # Calculate Precision, Recall, F1
        precision = true_positives / len(pred_set) if len(pred_set) > 0 else 0.0
        recall = true_positives / len(true_set) if len(true_set) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        # Calculate Union and Accuracy (Jaccard Index)
        union_size = len(true_set) + len(pred_set) - true_positives
        accuracy_jaccard = true_positives / union_size if union_size > 0 else 1.0 # Accuracy is 1 if both sets are empty

        # Store results for this approach
        results[approach_name] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "accuracy_jaccard": accuracy_jaccard, # Added accuracy
            "tp": true_positives,
            "pred_count": len(pred_set)
        }

    results["true_count"] = len(true_set)
    return results

In [None]:
 ASC_TRANSFORMER_MODEL_PATH = asc_model_path_full
 ATE_MODEL_PATH = ate_model_path_full

### below cell takes more time run (10 min)!!

In [None]:
print("\n--- Running End-to-End Evaluation ---")

all_true_pairs_eval = [] # Store ground truth for final eval
all_pred_pairs_eval = {'pipeline': [], 'baseline': []}
# Load trained models explicitly for inference loop clarity
ate_model_inf = AutoModelForTokenClassification.from_pretrained(ATE_MODEL_PATH).to(device)
ate_tokenizer_inf = AutoTokenizer.from_pretrained(ATE_MODEL_PATH)

pipeline_asc_model_inf = AutoModelForSequenceClassification.from_pretrained(ASC_TRANSFORMER_MODEL_PATH).to(device)
pipeline_asc_tokenizer_inf = AutoTokenizer.from_pretrained(ASC_TRANSFORMER_MODEL_PATH)

baseline_asc_model_inf = LstmAbsaClassifier( # Re-init architecture
     VOCAB_SIZE, LSTM_EMBEDDING_DIM, LSTM_HIDDEN_DIM, NUM_POLARITY_LABELS,
    LSTM_NUM_LAYERS, True, LSTM_DROPOUT, PAD_IDX
).to(device)
if best_lstm_f1 > -1: # Load weights if saved
     baseline_asc_model_inf.load_state_dict(torch.load(LSTM_MODEL_PATH, map_location=device))
baseline_asc_model_inf.eval()
# Iterate through the original TEST data list
for item in test_raw_list:
    sentence = item['text']
    sentence_id = item['id']

    # --- Ground Truth ---
    ground_truth_terms = item['aspect_terms']
    for gt in ground_truth_terms:
        all_true_pairs_eval.append({'sentence_id': sentence_id, 'term': gt['term'], 'polarity': gt['polarity']})

    # --- Run ATE (Shared) ---
    # Use the loaded inference versions
    extracted_aspects = extract_aspects_from_text(sentence, ATE_MODEL_PATH, ATE_MODEL_PATH) # Path usage is okay here too

    # --- Run Pipeline ASC (Transformer) ---
    for aspect_info in extracted_aspects:
        term = aspect_info['term']
        if not term: continue
        pipeline_sentiment = classify_aspect_sentiment_transformer(
            sentence, term, ASC_TRANSFORMER_MODEL_PATH, ASC_TRANSFORMER_MODEL_PATH # Use paths
        )
        all_pred_pairs_eval['pipeline'].append({'sentence_id': sentence_id, 'term': term, 'sentiment': pipeline_sentiment})

    # --- Run Baseline ASC (LSTM) ---
    for aspect_info in extracted_aspects:
        term = aspect_info['term']
        if not term: continue
        # Use the loaded LSTM model instance
        baseline_sentiment = classify_aspect_sentiment_lstm(
            sentence, term, baseline_asc_model_inf, lstm_vocab, word_tokenize, LSTM_MAX_SEQ_LEN
        )
        all_pred_pairs_eval['baseline'].append({'sentence_id': sentence_id, 'term': term, 'sentiment': baseline_sentiment})




--- Running End-to-End Evaluation ---


#End to End Comparision between Bi Directional LSTM model(Base Line) and the TransformerModel(Distil BERT) New Approach


In [None]:
# --- Calculate Strict Metrics ---
strict_results = calculate_strict_metrics(all_true_pairs_eval, all_pred_pairs_eval)

# --- Display Results ---
print("\n--- Comparison Results (Test Set) ---")
print(f"\nShared ATE Model Performance (Term Extraction Only):")
# Use results from the trainer's evaluation directly
print(f"  - Precision: {eval_results_ate.get('eval_precision', 'N/A'):.4f}")
print(f"  - Recall:    {eval_results_ate.get('eval_recall', 'N/A'):.4f}")
print(f"  - F1-Score:  {eval_results_ate.get('eval_f1', 'N/A'):.4f}")

print(f"\nEnd-to-End Performance (Strict Match: Term + Sentiment):")
print(f"  - Total Ground Truth Pairs: {strict_results['true_count']}")

print("\n  Baseline (ATE + LSTM ASC):")
baseline_res = strict_results['baseline']
print(f"    - Predicted Pairs: {baseline_res['pred_count']}")
print(f"    - Correct Pairs (TP): {baseline_res['tp']}")
print(f"    - Precision: {baseline_res['precision']:.4f}")
print(f"    - Recall:    {baseline_res['recall']:.4f}")
print(f"    - F1-Score:  {baseline_res['f1']:.4f}")
#print(f"    - Accuracy (Jaccard): {baseline_res['accuracy_jaccard']:.4f}")

print("\n  Pipeline (ATE + Transformer ASC):")
pipeline_res = strict_results['pipeline']
print(f"    - Predicted Pairs: {pipeline_res['pred_count']}")
print(f"    - Correct Pairs (TP): {pipeline_res['tp']}")
print(f"    - Precision: {pipeline_res['precision']:.4f}")
print(f"    - Recall:    {pipeline_res['recall']:.4f}")
print(f"    - F1-Score:  {pipeline_res['f1']:.4f}")
#print(f"    - Accuracy (Jaccard): {pipeline_res['accuracy_jaccard']:.4f}")


--- Comparison Results (Test Set) ---

Shared ATE Model Performance (Term Extraction Only):
  - Precision: 0.8797
  - Recall:    0.9031
  - F1-Score:  0.8912

End-to-End Performance (Strict Match: Term + Sentiment):
  - Total Ground Truth Pairs: 445

  Baseline (ATE + LSTM ASC):
    - Predicted Pairs: 439
    - Correct Pairs (TP): 288
    - Precision: 0.6560
    - Recall:    0.6472
    - F1-Score:  0.6516

  Pipeline (ATE + Transformer ASC):
    - Predicted Pairs: 457
    - Correct Pairs (TP): 334
    - Precision: 0.7309
    - Recall:    0.7506
    - F1-Score:  0.7406


In [50]:
# --- Show Specific Examples: Pipeline Correct, Baseline Incorrect (Sentence Level) ---
print("\n--- Example Sentences: Where Pipeline Corrected Baseline Error (Max 10 Unique Sentences) ---")

max_to_show = 10
candidate_sentences_info = [] # Store info for qualifying sentences

# Pre-process predictions into sets keyed by sentence_id for efficiency
pipeline_preds_by_sent = defaultdict(set)
for p in all_pred_pairs_eval['pipeline']:
    pipeline_preds_by_sent[p['sentence_id']].add((p['term'].strip().lower(), p['sentiment']))

baseline_preds_by_sent = defaultdict(set)
for p in all_pred_pairs_eval['baseline']:
    baseline_preds_by_sent[p['sentence_id']].add((p['term'].strip().lower(), p['sentiment']))


processed_sentence_ids = set() # Avoid processing the same sentence multiple times if needed
for item in test_raw_list:
    sentence_id = item['id']
    if sentence_id in processed_sentence_ids:
         continue # Already decided if this sentence is a candidate

    sentence = item['text']
    ground_truth_aspects = item['aspect_terms']
    found_qualifying_aspect = False

    current_pipeline_preds_set = pipeline_preds_by_sent.get(sentence_id, set())
    current_baseline_preds_set = baseline_preds_by_sent.get(sentence_id, set())

    # Check each ground truth aspect in this sentence
    for gt_aspect in ground_truth_aspects:
        gt_term_normalized = gt_aspect['term'].strip().lower()
        gt_polarity = gt_aspect['polarity'] # Use original polarity string
        gt_pair = (gt_term_normalized, gt_polarity)

        pipeline_correct = gt_pair in current_pipeline_preds_set
        baseline_correct = gt_pair in current_baseline_preds_set

        # If Pipeline got it right AND Baseline got it wrong for *at least one* aspect
        if pipeline_correct and not baseline_correct:
            found_qualifying_aspect = True
            break # Found one, no need to check other aspects in this sentence

    # If this sentence qualifies, store its info
    if found_qualifying_aspect:
        # Store all predictions for this sentence for later display
        pipeline_preds_for_sent = [(p['term'], p['sentiment']) for p in all_pred_pairs_eval['pipeline'] if p['sentence_id'] == sentence_id]
        baseline_preds_for_sent = [(p['term'], p['sentiment']) for p in all_pred_pairs_eval['baseline'] if p['sentence_id'] == sentence_id]

        candidate_sentences_info.append({
            'id': sentence_id,
            'text': sentence,
            'ground_truth': [(gt['term'], gt['polarity']) for gt in ground_truth_aspects],
            'pipeline_preds': pipeline_preds_for_sent,
            'baseline_preds': baseline_preds_for_sent
        })

    processed_sentence_ids.add(sentence_id) # Mark as processed

# --- Print Selected Examples ---
import random
random.shuffle(candidate_sentences_info) # Shuffle to get a random sample

count_shown = 0
for example_info in candidate_sentences_info:
    if count_shown >= max_to_show:
        break

    print(f"\nSentence (ID {example_info['id']}): {example_info['text']}")
    print(f"  - Ground Truth: {example_info['ground_truth'] if example_info['ground_truth'] else '[]'}")
    print(f"  - Pipeline Preds: {example_info['pipeline_preds'] if example_info['pipeline_preds'] else '[]'}")
    print(f"  - Baseline Preds: {example_info['baseline_preds'] if example_info['baseline_preds'] else '[]'}")
    count_shown += 1

if count_shown == 0:
    print("\nNo sentence examples found where Pipeline was correct for an aspect and Baseline was incorrect for the same aspect.")
elif count_shown < max_to_show:
     print(f"\n(Showing {count_shown} unique sentences meeting the criteria)")


--- Example Sentences: Where Pipeline Corrected Baseline Error (Max 10 Unique Sentences) ---

Sentence (ID 1867): The first 2 courses were very good, but the chocolate sampler was too rich for me and the dessert wine far too sweet.
  - Ground Truth: [('courses', 'positive'), ('chocolate sampler', 'negative'), ('dessert wine', 'negative')]
  - Pipeline Preds: [('courses', 'positive'), ('chocolate sampler', 'negative'), ('dessert wine', 'negative')]
  - Baseline Preds: [('courses', 'positive'), ('chocolate sampler', 'negative'), ('dessert wine', 'neutral')]

Sentence (ID 2479): We visited Bread Bar during January restaurant week and were so pleased with the menu selections and service.
  - Ground Truth: [('menu selections', 'positive'), ('service', 'positive')]
  - Pipeline Preds: [('menu selections', 'positive'), ('service', 'positive')]
  - Baseline Preds: [('menu selections', 'positive'), ('service', 'neutral')]

Sentence (ID 979): The service is descent even when this small place is