
# Metaphor Identification & Alignment Visualizer

This notebook helps you:
1. **Color-code metaphors** (from XML tags like `<Metaphor> ... </Metaphor>`) in the **predicted** and **ground-truth** texts, separately.
2. **Align** predicted vs. ground-truth tokens (even when the texts differ) and **color-code**:
   - **True Positive (TP)** tokens
   - **False Positive (FP)** tokens
   - **False Negative (FN)** tokens
   - **Mismatched tokens** (where the aligned tokens themselves differ, or one side has no counterpart and is shown as a gap)

### How it works
- We parse the XML-like metaphor tags to label tokens as metaphor or not.
- We tokenize both texts and then **pair** them using the `paired` package **if available**.  
  If `paired` is not available in your environment, we **fall back** to a robust dynamic-programming sequence aligner (Needleman–Wunsch) implemented in pure Python within this notebook.
- We then generate HTML visualizations for:
  - **(1)** the metaphor-only coloring (separately for predicted and ground-truth)
  - **(2)** the aligned token comparison with TP / FP / FN / Mismatch coloring.

> **Note:** You can re-run cells or adapt tokenization for your needs.


In [7]:

# Imports
import re
from dataclasses import dataclass
from typing import List, Tuple, Optional

# Try to import 'paired'. If unavailable, we will fall back to our own aligner.
USE_PAIRED = False
try:
    import paired  # type: ignore
    USE_PAIRED = True
except Exception:
    USE_PAIRED = False

from IPython.display import HTML, display


In [8]:

@dataclass
class Token:
    """A single token of a tagged text together with its metaphor label."""
    # Surface string of the token.
    text: str
    # Whether the token is labelled as part of a metaphor.
    is_metaphor: bool

# Tokenizer regex: each match is either a maximal run of word characters or
# exactly one punctuation/symbol character. Whitespace never matches, so it
# only separates tokens. re.VERBOSE allows the inline comments in the pattern.
TOKEN_PATTERN = re.compile(r"""
    \w+            # word tokens (letters/digits/_)
    |               # or
    [^\w\s]       # any single non-word, non-space char (punctuation/symbols)
""", re.VERBOSE)

def parse_metaphor_spans(text: str) -> List[Tuple[int, int]]:
    """Locate the character ranges enclosed by <Metaphor>...</Metaphor> tags.

    Every returned pair is a half-open (start, end) range into ``text`` that
    covers only the tagged content, not the tags themselves. Tags are paired
    left to right: each opening tag is matched with the nearest closing tag
    that follows it, so nested or overlapping tags are not understood.
    Scanning stops at the first opening tag that has no closing tag.
    """
    open_tag, close_tag = '<Metaphor>', '</Metaphor>'
    found = []
    cursor = text.find(open_tag)
    while cursor != -1:
        inner_start = cursor + len(open_tag)
        inner_end = text.find(close_tag, inner_start)
        if inner_end == -1:
            # Opening tag without a closing partner: nothing more to collect.
            break
        found.append((inner_start, inner_end))
        # Resume the search right after the closing tag just consumed.
        cursor = text.find(open_tag, inner_end + len(close_tag))
    return found

def char_in_any_span(i: int, spans: List[Tuple[int,int]]) -> bool:
    """Tell whether index ``i`` lies inside at least one half-open (start, end) span."""
    return any(lo <= i < hi for lo, hi in spans)

def strip_tags(text: str) -> str:
    """Remove every opening and closing Metaphor tag, keeping the enclosed text."""
    # Opening tags are removed first, then closing tags, one pass each.
    for tag in ('<Metaphor>', '</Metaphor>'):
        text = text.replace(tag, '')
    return text

def tokenize_with_metaphor(text_with_xml: str) -> List[Token]:
    """Tokenize a tagged text and label each token as metaphor or not.

    The text is scanned once. Every ``<Metaphor>`` / ``</Metaphor>`` tag
    occurrence is dropped, and each remaining character is recorded together
    with a flag saying whether its position in the original string falls
    inside a span reported by ``parse_metaphor_spans``. The tag-free text is
    then tokenized with ``TOKEN_PATTERN``.

    Args:
        text_with_xml: Text that may contain ``<Metaphor>...</Metaphor>`` tags.

    Returns:
        The tokens in order of appearance. A token is flagged as a metaphor
        when at least one of its characters lies inside a metaphor span.
    """
    open_tag, close_tag = '<Metaphor>', '</Metaphor>'
    spans = parse_metaphor_spans(text_with_xml)  # spans in original indices

    kept_chars = []        # characters of the tag-free text
    char_is_metaphor = []  # parallel flags, one per kept character
    pos = 0
    length = len(text_with_xml)
    while pos < length:
        if text_with_xml.startswith(open_tag, pos):
            pos += len(open_tag)
            continue
        if text_with_xml.startswith(close_tag, pos):
            pos += len(close_tag)
            continue
        kept_chars.append(text_with_xml[pos])
        # Span membership is decided on the original index, because the spans
        # were computed before the tags were removed.
        char_is_metaphor.append(char_in_any_span(pos, spans))
        pos += 1

    stripped = ''.join(kept_chars)
    return [
        Token(match.group(0), any(char_is_metaphor[match.start():match.end()]))
        for match in TOKEN_PATTERN.finditer(stripped)
    ]


In [9]:

def align_with_paired(pred_tokens: List[str], gt_tokens: List[str]) -> List[Tuple[Optional[str], Optional[str]]]:
    """Align two token sequences with the optional ``paired`` package.

    The first entry point found on the module, ``paired.align`` then
    ``paired.pair``, is called with both token lists.

    Args:
        pred_tokens: Predicted token strings.
        gt_tokens: Ground-truth token strings.

    Returns:
        A list of ``(pred_tok_or_None, gt_tok_or_None)`` pairs. Gaps are None.

    Raises:
        RuntimeError: If ``paired`` is unusable, exposes neither entry point,
            or the call fails. The original exception is chained as the cause.
    """
    def _as_token(item, tokens):
        # Normalize one side of an aligned pair to a token string or None.
        if item is None or item == '':
            return None
        # NOTE(review): paired appears to return positions into the input
        # sequences rather than the tokens themselves -- confirm against the
        # installed version. Tokens here are always str, so an int can only
        # be such a position.
        if isinstance(item, int) and not isinstance(item, bool):
            return tokens[item]
        return item

    try:
        # Try the likely entry points in order of preference.
        if hasattr(paired, 'align'):
            aligner = paired.align
        elif hasattr(paired, 'pair'):
            aligner = paired.pair
        else:
            raise RuntimeError("paired library present but no known align function; using fallback.")
        return [
            (_as_token(a, pred_tokens), _as_token(b, gt_tokens))
            for a, b in aligner(pred_tokens, gt_tokens)
        ]
    except Exception as e:
        # Wrap every failure in one exception type so callers can fall back,
        # while keeping the root cause attached for debugging.
        raise RuntimeError(f"paired alignment failed: {e}") from e

def needleman_wunsch(a: List[str], b: List[str], match_score=2, mismatch_penalty=-1, gap_penalty=-1):
    """Globally align two token sequences with the Needleman-Wunsch algorithm.

    Returns a list of ``(token_from_a_or_None, token_from_b_or_None)`` pairs
    covering both sequences in order; ``None`` marks a gap on that side.
    When several alignments score equally, the traceback prefers pairing two
    tokens, then a gap in ``b``, then a gap in ``a``.
    """
    rows, cols = len(a), len(b)

    def pair_score(x, y):
        # Score for placing x opposite y.
        return match_score if x == y else mismatch_penalty

    # table[r][c] holds the best score for aligning a[:r] with b[:c].
    # The first row and column are pure gap runs.
    table = [[0] + [c * gap_penalty for c in range(1, cols + 1)]]
    for r in range(1, rows + 1):
        table.append([r * gap_penalty] + [0] * cols)

    # Fill the table row by row.
    for r, tok_a in enumerate(a, start=1):
        above, current = table[r - 1], table[r]
        for c, tok_b in enumerate(b, start=1):
            current[c] = max(
                above[c - 1] + pair_score(tok_a, tok_b),
                above[c] + gap_penalty,
                current[c - 1] + gap_penalty,
            )

    # Walk back from the bottom-right corner, collecting pairs in reverse.
    pairs = []
    r, c = rows, cols
    while r or c:
        here = table[r][c]
        if r and c and here == table[r - 1][c - 1] + pair_score(a[r - 1], b[c - 1]):
            pairs.append((a[r - 1], b[c - 1]))
            r -= 1
            c -= 1
        elif r and here == table[r - 1][c] + gap_penalty:
            pairs.append((a[r - 1], None))
            r -= 1
        else:
            pairs.append((None, b[c - 1]))
            c -= 1
    return pairs[::-1]

def align_tokens(pred: List[str], gt: List[str]) -> List[Tuple[Optional[str], Optional[str]]]:
    """Align predicted and ground-truth tokens, preferring ``paired`` when enabled.

    If the module-level ``USE_PAIRED`` flag exists and is truthy, the
    ``paired``-based aligner is tried first. Any failure there is swallowed
    on purpose and the built-in Needleman-Wunsch aligner is used instead.
    """
    paired_enabled = bool(globals().get('USE_PAIRED'))
    if paired_enabled:
        try:
            return align_with_paired(pred, gt)
        except Exception:
            # Best-effort: fall through to the pure-Python aligner below.
            pass
    return needleman_wunsch(pred, gt)


In [10]:

def needs_space(prev: Optional[str], curr: Optional[str]) -> bool:
    """Decide whether a space belongs between two consecutive tokens.

    No space is emitted when either token is missing or when the current
    token is a single punctuation/symbol character; every other pair of
    tokens is separated by a space.
    """
    if prev is None or curr is None:
        return False
    # A space is wanted exactly when the current token is not lone punctuation.
    curr_is_punctuation = re.fullmatch(r"[^\w\s]", curr) is not None
    return not curr_is_punctuation

def tokens_to_html(tokens: List[Token], metaphor_color="#fff3bf", text_color="#000") -> str:
    """Render tokens as an HTML block with metaphor tokens highlighted.

    Args:
        tokens: Tokens to render, in reading order.
        metaphor_color: CSS background color applied to metaphor tokens.
        text_color: CSS text color applied to every token.

    Returns:
        One ``<div>`` containing a ``<span>`` per token, with spaces inserted
        wherever ``needs_space`` asks for one.
    """
    # Function-scope import so this cell works without touching the imports cell.
    from html import escape

    parts = ["<div style='line-height:1.8; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto;'>"]
    prev = None
    for t in tokens:
        if needs_space(prev, t.text):
            parts.append(" ")
        if t.is_metaphor:
            style = f"background:{metaphor_color}; padding:2px 3px; border-radius:4px;"
            title = "Metaphor"
        else:
            style = ""
            title = "Literal/Other"
        # Tokens can be characters such as '<', '>' or '&'. They must be
        # escaped, otherwise they corrupt the generated markup.
        parts.append(f"<span title='{title}' style='{style} color:{text_color};'>{escape(t.text)}</span>")
        prev = t.text
    parts.append("</div>")
    return ''.join(parts)

# Background color used for each alignment category
COLORS = dict(
    TP='#d3f9d8',        # green-ish
    FP='#ffe3e3',        # red-ish
    FN='#fff3bf',        # yellow-ish
    MISMATCH='#e9ecef',  # gray
)

def aligned_html(pred_toks: List[Token], gt_toks: List[Token]) -> str:
    """Render the predicted/ground-truth alignment as two color-coded HTML rows.

    The token texts are aligned with ``align_tokens``. Each aligned position
    is then classified:

    - ``MISMATCH``: one side is a gap, or the two tokens differ.
    - ``TP``: same token, metaphor in both.
    - ``FP``: same token, metaphor only in the prediction.
    - ``FN``: same token, metaphor only in the ground truth.
    - no color: same token, metaphor in neither.

    Args:
        pred_toks: Labelled tokens of the predicted text.
        gt_toks: Labelled tokens of the ground-truth text.

    Returns:
        An HTML fragment with a legend, the predicted row and the
        ground-truth row. Gaps are shown as an em dash.
    """
    # Function-scope import so this cell works without touching the imports cell.
    from html import escape

    alignment = align_tokens([t.text for t in pred_toks], [t.text for t in gt_toks])

    # An alignment keeps each sequence in its original order, so the metaphor
    # flags can be consumed positionally: the k-th non-gap entry on a side
    # belongs to the k-th token of that side.
    pred_flags = iter([t.is_metaphor for t in pred_toks])
    gt_flags = iter([t.is_metaphor for t in gt_toks])

    legend_entries = (('TP', 'TP'), ('FP', 'FP'), ('FN', 'FN'), ('MISMATCH', 'Mismatch'))
    legend = ''.join(
        "&nbsp; <span style='background:%s;padding:2px 6px;border-radius:4px'>%s</span>" % (COLORS[key], label)
        for key, label in legend_entries
    )
    rows = [
        "<div style='font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto'>",
        "<div style='margin-bottom:8px'><strong>Legend:</strong> " + legend + "</div>",
    ]

    # Build two aligned rows
    pred_cells = []
    gt_cells = []
    for a, b in alignment:
        # A flag stays None for a gap. The False default only applies if the
        # aligner yields more entries than there are tokens.
        a_is_meta = next(pred_flags, False) if a is not None else None
        b_is_meta = next(gt_flags, False) if b is not None else None

        if a is None or b is None or a != b:
            cat = 'MISMATCH'
        elif a_is_meta and b_is_meta:
            cat = 'TP'
        elif a_is_meta:
            cat = 'FP'
        elif b_is_meta:
            cat = 'FN'
        else:
            cat = None  # matched non-metaphor token

        style = f"background:{COLORS[cat]};" if cat else ""
        title_pred = f"Pred: {'Metaphor' if a_is_meta else 'Not Metaphor'}" if a is not None else "Pred: (gap)"
        title_gt   = f"GT: {'Metaphor' if b_is_meta else 'Not Metaphor'}" if b is not None else "GT: (gap)"

        # Tokens can be characters such as '<', '>' or '&'. They must be
        # escaped, otherwise they corrupt the generated markup.
        pred_tok_html = escape(str(a)) if a is not None else "—"
        gt_tok_html   = escape(str(b)) if b is not None else "—"

        pred_cells.append(f"<span title='{title_pred}' style='padding:2px 4px; border-radius:4px; {style} margin:1px; display:inline-block'>{pred_tok_html}</span>")
        gt_cells.append(  f"<span title='{title_gt}'   style='padding:2px 4px; border-radius:4px; {style} margin:1px; display:inline-block'>{gt_tok_html}</span>")

    rows.append("<div style='margin:6px 0'><strong>Predicted (aligned):</strong><br>" + ' '.join(pred_cells) + "</div>")
    rows.append("<div style='margin:6px 0'><strong>Ground truth (aligned):</strong><br>" + ' '.join(gt_cells) + "</div>")
    rows.append("</div>")
    return ''.join(rows)


In [11]:

def visualize(predicted_text: str, ground_truth_text: str):
    """Display both visualizations for a predicted / ground-truth text pair.

    First shows the two texts side by side with their own metaphor
    highlighting, then the aligned TP / FP / FN / Mismatch comparison.
    """
    # Tokenize with metaphor labels
    pred_tokens = tokenize_with_metaphor(predicted_text)
    gt_tokens = tokenize_with_metaphor(ground_truth_text)

    # (1) Separate metaphor coloring; the ground truth gets a different tint
    highlight_template = """
    <h3>(1) Metaphor highlighting only</h3>
    <div style='display:flex; gap:24px; flex-wrap:wrap'>
      <div><div style='font-weight:600; margin-bottom:6px'>Predicted</div>{pred}</div>
      <div><div style='font-weight:600; margin-bottom:6px'>Ground Truth</div>{gt}</div>
    </div>
    """
    highlight_section = highlight_template.format(
        pred=tokens_to_html(pred_tokens, metaphor_color="#fff3bf"),
        gt=tokens_to_html(gt_tokens, metaphor_color="#cfe8ff"),
    )
    display(HTML(highlight_section))

    # (2) Alignment-based TP/FP/FN/Mismatch
    alignment_heading = "<h3>(2) Alignment with TP / FP / FN / Mismatch</h3>"
    display(HTML(alignment_heading + aligned_html(pred_tokens, gt_tokens)))

# --- Demo ---
# Sample pair in which both the wording and the tagged spans differ: the
# prediction tags "raging bull" and "devoured", the ground truth tags "bull"
# and "ate".
demo_pred = "The market was a <Metaphor>raging bull</Metaphor> that <Metaphor>devoured</Metaphor> all fear ."
demo_gt   = "The market was a <Metaphor>bull</Metaphor> that <Metaphor>ate</Metaphor> up all fear ."

# The demo call is left disabled; uncomment the next line to render it.
# visualize(demo_pred, demo_gt)
# print("\nNote: If the optional 'paired' package isn't installed, the notebook automatically uses a fallback aligner.")



## Try your own inputs

Edit the two variables in the next cell and re-run it to see the visualizations.


In [12]:

# Paste your texts here (with <Metaphor> ... </Metaphor> tags as needed):
# `your_pred_text` is the predicted annotation, `your_gt_text` the ground truth.
your_pred_text = """The project was a <Metaphor>rollercoaster</Metaphor> of emotions ."""
your_gt_text   = """The project felt like a <Metaphor>roller coaster</Metaphor> of emotions ."""

# Displays the metaphor highlighting and the aligned comparison for the texts above.
visualize(your_pred_text, your_gt_text)
