In [28]:
import typing as T
from typing import List, Dict, Set, Tuple
from collections import defaultdict
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
class Triplet:
    def __init__(self, aspect: str, opinion: str, polarity: str):
        self.aspect = aspect.lower().strip()
        self.opinion = opinion.lower().strip()
        self.polarity = polarity.lower().strip()

    def __eq__(self, other):
        return (
            self.aspect == other.aspect and
            self.opinion == other.opinion and
            self.polarity == other.polarity
        )

    def __hash__(self):
        return hash((self.aspect, self.opinion, self.polarity))

    def __repr__(self):
        return f"({self.aspect}, {self.opinion}, {self.polarity})"


def compute_metrics(
    gold_data: T.List[T.Dict],
    pred_data: T.List[T.Dict],
    mode: str = "triplet"  # 'triplet', 'aspect', or 'opinion'
):
    """
    Compute precision, recall, f1 based on extracted triplets or individual components (aspect/opinion).
    """
    assert mode in ["triplet", "aspect", "opinion"], "Mode must be 'triplet', 'aspect', or 'opinion'"

    gold_items = defaultdict(set)
    pred_items = defaultdict(set)

    for idx, item in enumerate(gold_data):
        for triplet in item.get("triplets", []):
            aspect_text = triplet["aspect_term"]["text"]
            opinion_text = triplet["opinion_term"]["text"]
            polarity = triplet["polarity"]

            if mode == "triplet":
                gold_items[idx].add(Triplet(aspect_text, opinion_text, polarity))
            elif mode == "aspect":
                gold_items[idx].add(aspect_text.lower().strip())
            elif mode == "opinion":
                gold_items[idx].add(opinion_text.lower().strip())

    for idx, item in enumerate(pred_data):
        for triplet in item.get("triplets", []):
            aspect_text = triplet["aspect_term"]["text"]
            opinion_text = triplet["opinion_term"]["text"]
            polarity = triplet["polarity"]

            if mode == "triplet":
                pred_items[idx].add(Triplet(aspect_text, opinion_text, polarity))
            elif mode == "aspect":
                pred_items[idx].add(aspect_text.lower().strip())
            elif mode == "opinion":
                pred_items[idx].add(opinion_text.lower().strip())

    # Now calculate TP, FP, FN
    tp = 0
    fp = 0
    fn = 0

    for idx in gold_items.keys():
        gold_set = gold_items[idx]
        pred_set = pred_items.get(idx, set())

        current_tp = len(gold_set & pred_set)
        current_fp = len(pred_set - gold_set)
        current_fn = len(gold_set - pred_set)

        tp += current_tp
        fp += current_fp
        fn += current_fn

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


def compute_aspect_accuracy(gold_data: T.List[T.Dict], pred_data: T.List[T.Dict]) -> float:
    """
    Compute simple aspect detection accuracy (1 if aspect is found, 0 otherwise).
    """
    y_true = []
    y_pred = []

    for idx, item in enumerate(gold_data):
        gold_aspects = [triplet["aspect_term"]["text"].lower().strip() for triplet in item.get("triplets", [])]
        pred_aspects = [triplet["aspect_term"]["text"].lower().strip() for triplet in pred_data[idx].get("triplets", [])]

        for aspect in gold_aspects:
            y_true.append(1)
            y_pred.append(1 if aspect in pred_aspects else 0)

        for aspect in pred_aspects:
            if aspect not in gold_aspects:
                y_true.append(0)
                y_pred.append(1)

    return accuracy_score(y_true, y_pred)

In [29]:
with open("output_corrected.json", "r", encoding="utf-8") as f:
    gold_data = json.load(f)

gold_data[97]

{'review_text': 'Alati on teie juurde hea tulla. Laua broneerimine on lihtne ja kiire. Interjöör hubane ja toit alati ootuspäraselt maitsev.\nKui vähegi võimalik, siis võiksite tuua joogikaardile ka alkoholivaba šampanja pokaaliga ostmise võimaluse ning suuremat valikut mokteile. :)',
 'triplets': [{'aspect_term': {'start': 32,
    'end': 49,
    'text': 'Laua broneerimine'},
   'opinion_term': {'start': 53, 'end': 59, 'text': 'lihtne'},
   'polarity': 'positive'},
  {'aspect_term': {'start': 32, 'end': 49, 'text': 'Laua broneerimine'},
   'opinion_term': {'start': 63, 'end': 68, 'text': 'kiire'},
   'polarity': 'positive'},
  {'aspect_term': {'start': 70, 'end': 79, 'text': 'Interjöör'},
   'opinion_term': {'start': 80, 'end': 86, 'text': 'hubane'},
   'polarity': 'positive'},
  {'aspect_term': {'start': 90, 'end': 94, 'text': 'toit'},
   'opinion_term': {'start': 95,
    'end': 122,
    'text': 'alati ootuspäraselt maitsev'},
   'polarity': 'positive'}]}

In [None]:
with open("predicted_output_eurollm_trunc_prompt_1_shot.json", "r", encoding="utf-8") as f:
    pred_data = json.load(f)

pred_data[97]

{'review_text': 'Alati on teie juurde hea tulla. Laua broneerimine on lihtne ja kiire. Interjöör hubane ja toit alati ootuspäraselt maitsev. Kui vähegi võimalik, siis võiksite tuua joogikaardile ka alkoholivaba šampanja pokaaliga ostmise võimaluse ning suuremat valikut mokteile. :)',
 'triplets': [{'aspect_term': {'text': 'Alati on teie juurde hea tulla.',
    'polarity': 'positive',
    'start': 0,
    'end': 31},
   'opinion_term': {'text': 'Laua broneerimine on lihtne ja kiire.',
    'polarity': 'positive',
    'start': 32,
    'end': 69},
   'polarity': 'positive'},
  {'aspect_term': {'text': 'Interjöör hubane ja toit alati ootuspäraselt maitsev.',
    'polarity': 'positive',
    'start': 70,
    'end': 123},
   'opinion_term': {'text': 'Kui vähegi võimalik, siis võiksite tuua joogikaardile ka alkoholivaba šampanja pokaaliga ostmise võimaluse ning suuremat valikut mokteile. :)',
    'polarity': 'positive',
    'start': 124,
    'end': 265},
   'polarity': 'positive'}]}

Postprocessing

In [30]:
import re

def clean_term(text):
    # Delete unwanted punctuation and emoji
    text = text.replace(":)", "").strip(" .,!?;:-–—()[]{}«»\"'")
    return text

In [31]:
def extract_polarity_and_fix(triplet):
    new_triplet = {}
    if isinstance(triplet.get("aspect_term"), dict):
        if "polarity" in triplet["aspect_term"]:
            triplet["polarity"] = triplet["aspect_term"].pop("polarity")
    if isinstance(triplet.get("opinion_term"), dict):
        if "polarity" in triplet["opinion_term"]:
            triplet["polarity"] = triplet["opinion_term"].pop("polarity")
    new_triplet["aspect_term"] = triplet.get("aspect_term", "")
    new_triplet["opinion_term"] = triplet.get("opinion_term", "")
    new_triplet["polarity"] = triplet.get("polarity", "").lower().strip()
    return new_triplet

In [32]:
EST_TO_ENG_POLARITY = {
    "positiivne": "positive",
    "negatiivne": "negative",
    "neutraalne": "neutral"
}

def normalize_polarity(polarity):
    return EST_TO_ENG_POLARITY.get(polarity.lower(), polarity.lower())

In [33]:
def postprocess_predictions(pred_data):
    cleaned_results = []

    for item in pred_data:
        original_text = item.get("review_text", "")
        triplets = item.get("triplets", [])
        new_triplets = []

        for triplet in triplets:
            triplet_fixed = extract_polarity_and_fix(triplet)
            triplet_fixed["aspect_term"]["text"] = clean_term(triplet["aspect_term"]["text"])
            if triplet_fixed.get("opinion_term"):
              triplet_fixed.get("opinion_term")["text"] = clean_term(triplet.get("opinion_term")["text"])
            else:
              triplet_fixed["opinion_term"] = {"text": ""}
            triplet_fixed["polarity"] = normalize_polarity(triplet["polarity"])
            new_triplets.append(triplet_fixed)

        cleaned_results.append({
            "review_text": original_text,
            "triplets": new_triplets
        })

    return cleaned_results

In [None]:
cleaned_data = postprocess_predictions(pred_data)
cleaned_data[97]

{'review_text': 'Alati on teie juurde hea tulla. Laua broneerimine on lihtne ja kiire. Interjöör hubane ja toit alati ootuspäraselt maitsev. Kui vähegi võimalik, siis võiksite tuua joogikaardile ka alkoholivaba šampanja pokaaliga ostmise võimaluse ning suuremat valikut mokteile. :)',
 'triplets': [{'aspect_term': {'text': 'Alati on teie juurde hea tulla',
    'start': 0,
    'end': 31},
   'opinion_term': {'text': 'Laua broneerimine on lihtne ja kiire',
    'start': 32,
    'end': 69},
   'polarity': 'positive'},
  {'aspect_term': {'text': 'Interjöör hubane ja toit alati ootuspäraselt maitsev',
    'start': 70,
    'end': 123},
   'opinion_term': {'text': 'Kui vähegi võimalik, siis võiksite tuua joogikaardile ka alkoholivaba šampanja pokaaliga ostmise võimaluse ning suuremat valikut mokteile',
    'start': 124,
    'end': 265},
   'polarity': 'positive'}]}

In [None]:
# Calculate metrics in triplet mode
metrics = compute_metrics(gold_data, cleaned_data, mode="triplet")
print(f"Triplet - Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1: {metrics['f1']:.4f}")

# Calculate metrics in aspect mode
aspect_metrics = compute_metrics(gold_data, cleaned_data, mode="aspect")
print(f"Aspect - Precision: {aspect_metrics['precision']:.4f}, Recall: {aspect_metrics['recall']:.4f}, F1: {aspect_metrics['f1']:.4f}")

# Calculate only aspect accuracy (like in unsupervised method)
aspect_accuracy = compute_aspect_accuracy(gold_data, cleaned_data)
print(f"Aspect Accuracy: {aspect_accuracy:.4f}")

Triplet - Precision: 0.1289, Recall: 0.1364, F1: 0.1325
Aspect - Precision: 0.3629, Recall: 0.3494, F1: 0.3561
Aspect Accuracy: 0.2210


In [22]:
def prepare_aspect_set(data: List[Dict]) -> Dict[int, Set[str]]:
    """Prepare sets of aspect terms for each sample."""
    aspect_sets = {}
    for idx, item in enumerate(data):
        triplet_list = item.get("triplets", [])
        aspect_texts = set(
            triplet["aspect_term"]["text"].strip().lower() for triplet in triplet_list
            if "aspect_term" in triplet and "text" in triplet["aspect_term"]
        )
        aspect_sets[idx] = aspect_texts
    return aspect_sets

def prepare_triplet_set(data: List[Dict]) -> Dict[int, Set[Tuple[str, str, str]]]:
    """Prepare sets of (aspect term, opinion term, polarity) triplets for each sample."""
    triplet_sets = {}
    for idx, item in enumerate(data):
        triplet_list = item.get("triplets", [])
        triplets = set(
            (
                triplet["aspect_term"]["text"].strip().lower(),
                triplet["opinion_term"]["text"].strip().lower(),
                triplet["polarity"].lower()
            )
            for triplet in triplet_list
            if all(key in triplet for key in ["aspect_term", "opinion_term", "polarity"])
        )
        triplet_sets[idx] = triplets
    return triplet_sets

def evaluate_triplets(
    gold_data: List[Dict],
    pred_data: List[Dict],
    mode: str = "triplet"
) -> Dict[str, float]:
    """
    Compute precision, recall, F1, and accuracy for aspects or triplets.

    Args:
        gold_data: list of dictionaries with gold aspect/triplet annotations
        pred_data: list of dictionaries with predicted aspect/triplet annotations
        mode: "aspect" to evaluate aspect terms, "triplet" to evaluate full triplets

    Returns:
        A dictionary with precision, recall, F1-score, and accuracy
    """
    assert mode in ["aspect", "triplet"], "Mode must be 'aspect' or 'triplet'"
    assert len(gold_data) == len(pred_data), "Mismatch in number of samples between gold and predictions"

    y_true = []
    y_pred = []

    if mode == "aspect":
        gold_sets = prepare_aspect_set(gold_data)
        pred_sets = prepare_aspect_set(pred_data)
    else:  # mode == "triplet"
        gold_sets = prepare_triplet_set(gold_data)
        pred_sets = prepare_triplet_set(pred_data)

    for idx in range(len(gold_data)):
        gold_review = gold_data[idx].get("review_text", "").strip()
        pred_review = pred_data[idx].get("review_text", "").strip()
        # assert gold_review == pred_review, f"Review text mismatch at index {idx}\nGold review: {gold_review}\nPredicted review: {pred_review}"

        gold_items = gold_sets.get(idx, set())
        pred_items = pred_sets.get(idx, set())

        # True positives
        for item in gold_items:
            y_true.append(1)
            y_pred.append(1 if item in pred_items else 0)

        # False positives
        for item in pred_items:
            if item not in gold_items:
                y_true.append(0)
                y_pred.append(1)

    precision = f'{precision_score(y_true, y_pred, zero_division=0) * 100:.2f}'
    recall = f'{recall_score(y_true, y_pred, zero_division=0) * 100:.2f}'
    f1 = f'{f1_score(y_true, y_pred, zero_division=0) * 100:.2f}'
    accuracy = f'{accuracy_score(y_true, y_pred) * 100:.2f}'

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    }

In [34]:
with open("basic_0_adj_eng.json", "r", encoding="utf-8") as f:
    basic_0_adj_eng = json.load(f)

cleaned_basic_0_adj_eng = postprocess_predictions(basic_0_adj_eng)

In [35]:
len(basic_0_adj_eng)

100

In [36]:
evaluate_triplets(gold_data, basic_0_adj_eng, mode="aspect")

{'precision': '32.00', 'recall': '11.90', 'f1': '17.34', 'accuracy': '9.50'}

In [37]:
evaluate_triplets(gold_data, cleaned_basic_0_adj_eng, mode="aspect")

{'precision': '32.00', 'recall': '11.90', 'f1': '17.34', 'accuracy': '9.50'}

In [38]:
evaluate_triplets(gold_data, basic_0_adj_eng, mode="triplet")

{'precision': '5.04', 'recall': '1.82', 'f1': '2.67', 'accuracy': '1.35'}

In [39]:
evaluate_triplets(gold_data, cleaned_basic_0_adj_eng, mode="triplet")

{'precision': '7.56', 'recall': '2.73', 'f1': '4.01', 'accuracy': '2.05'}