In [2]:
import html
import json
from collections import Counter
from pathlib import Path

import pandas as pd
import plotly.express as px
from IPython.display import display, Markdown

# Directory containing the dataset files; the relative path is resolved
# against the current working directory at run time.
DATA_DIR = Path("../data")

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: A ``pathlib.Path``-like object exposing ``.open()``; the file
            is read as UTF-8 text.

    Returns:
        One parsed JSON value per line, in file order. Lines that are empty
        or contain only whitespace are ignored.
    """
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# Load both splits fully into memory: each becomes a list with one parsed
# JSON record per non-blank line of the corresponding file.
train = load_jsonl(DATA_DIR / "train.jsonl")
dev = load_jsonl(DATA_DIR / "dev.jsonl")

def label_stats(rows):
    """Count how often each entity label occurs across ``rows``.

    Args:
        rows: Iterable of dict records. A record's ``"entities"`` value, when
            present, is an iterable of dicts that each carry a ``"label"``
            key; records without ``"entities"`` contribute nothing.

    Returns:
        A ``Counter`` mapping label -> number of entity annotations, with
        keys in first-seen order.
    """
    return Counter(
        entity["label"]
        for record in rows
        for entity in record.get("entities", [])
    )

# Tally entity labels separately for each split.
train_counts = label_stats(train)
dev_counts = label_stats(dev)

# Long-form table with one row per (split, label) pair; all train rows come
# first, followed by all dev rows.
df_counts = pd.DataFrame([
    {"split": split_name, "label": label, "count": count}
    for split_name, split_counts in (("train", train_counts), ("dev", dev_counts))
    for label, count in split_counts.items()
])

# Grouped bars: one bar per split for every label on the x axis.
fig = px.bar(
    df_counts, x="label", y="count", color="split",
    barmode="group", title="Entity counts by split",
)
fig.show()

# Background colour used for each known entity label.
_ENTITY_COLORS = {
    "CREDIT_CARD": "#ffadad",
    "PHONE": "#ffd6a5",
    "EMAIL": "#fdffb6",
    "PERSON_NAME": "#caffbf",
    "DATE": "#9bf6ff",
    "CITY": "#a0c4ff",
    "LOCATION": "#bdb2ff",
}
# Fallback colour for labels missing from _ENTITY_COLORS.
_DEFAULT_ENTITY_COLOR = "#e0e0e0"


def highlight_entities(row, max_chars=None):
    """Render a record's text as HTML with its entity spans highlighted.

    Each entity is wrapped in a coloured ``<span>`` followed by its label in
    ``<small>``. Text outside entities is emitted as-is, except that ``&``,
    ``<`` and ``>`` are HTML-escaped so the utterance cannot break or inject
    markup.

    Args:
        row: Dict with a ``"text"`` string and, optionally, ``"entities"``:
            an iterable of dicts with ``"start"``/``"end"`` character offsets
            into the text and a ``"label"`` string.
        max_chars: When not ``None``, at most this many characters of the
            source text are rendered and an ellipsis is appended if anything
            was cut. The limit is applied to the text, not to the generated
            markup, so tags are never cut in half; a span crossing the limit
            is clipped to it.

    Returns:
        The HTML snippet as a string.
    """
    text = row["text"]
    truncated = max_chars is not None and len(text) > max_chars
    if truncated:
        text = text[:max(max_chars, 0)]
    limit = len(text)

    parts = []
    cursor = 0
    for span in sorted(row.get("entities", []), key=lambda s: s["start"]):
        start, label = span["start"], span["label"]
        end = min(span["end"], limit)
        # Skip spans that overlap an already rendered one (their text would
        # be emitted twice) and spans that are empty or lie past the limit.
        if start < cursor or start >= end:
            continue
        parts.append(html.escape(text[cursor:start], quote=False))
        color = _ENTITY_COLORS.get(label, _DEFAULT_ENTITY_COLOR)
        parts.append(
            f"<span style='background-color:{color};"
            f"padding:2px;border-radius:3px;'>"
            f"{html.escape(text[start:end], quote=False)} "
            f"<small>{html.escape(label, quote=False)}</small></span>"
        )
        cursor = end
    parts.append(html.escape(text[cursor:], quote=False))

    if truncated:
        # Written as an escape so the ellipsis survives re-encoding of the file.
        parts.append("\u2026")
    return "".join(parts)

# Show a heading, then the first three train and first three dev utterances
# with their entities highlighted and no length limit.
display(Markdown("### Sample highlighted utterances (full text)"))
for sample in [*train[:3], *dev[:3]]:
    display(
        Markdown(f"**{sample['id']}**<br>" + highlight_entities(sample, max_chars=None))
    )

### Sample highlighted utterances (full text)

**utt_0000**<br>currently staying in <span style='background-color:#a0c4ff;padding:2px;border-radius:3px;'>north jose <small>CITY</small></span> near <span style='background-color:#bdb2ff;padding:2px;border-radius:3px;'>central mall <small>LOCATION</small></span>

**utt_0001**<br>my credit card number is <span style='background-color:#ffadad;padding:2px;border-radius:3px;'>4820 2764 0502 5079 <small>CREDIT_CARD</small></span> like and email is <span style='background-color:#fdffb6;padding:2px;border-radius:3px;'>brucecassie at example dot com <small>EMAIL</small></span> name on the card is <span style='background-color:#caffbf;padding:2px;border-radius:3px;'>amy ferguson <small>PERSON_NAME</small></span>

**utt_0002**<br>my credit card number is <span style='background-color:#ffadad;padding:2px;border-radius:3px;'>three five 6 2 eight 5 7 3 1 three nine 7 2 1 three 2 <small>CREDIT_CARD</small></span> and email is <span style='background-color:#fdffb6;padding:2px;border-radius:3px;'>angela dot pittman at yahoo dot com <small>EMAIL</small></span> name on the card is <span style='background-color:#caffbf;padding:2px;border-radius:3px;'>angela pittman <small>PERSON_NAME</small></span>

**utt_0900**<br>this is <span style='background-color:#caffbf;padding:2px;border-radius:3px;'>willie horn <small>PERSON_NAME</small></span> send note to <span style='background-color:#fdffb6;padding:2px;border-radius:3px;'>hannah06 at example dot org <small>EMAIL</small></span> or ping <span style='background-color:#ffd6a5;padding:2px;border-radius:3px;'>2 1 7 5 1 1 5 6 0 6 <small>PHONE</small></span>

**utt_0901**<br>email id is <span style='background-color:#fdffb6;padding:2px;border-radius:3px;'>allison dot chapman at mail dot com <small>EMAIL</small></span> person name <span style='background-color:#caffbf;padding:2px;border-radius:3px;'>allison chapman <small>PERSON_NAME</small></span>

**utt_0902**<br>card digits are <span style='background-color:#ffadad;padding:2px;border-radius:3px;'>5113 4688 8103 2728 <small>CREDIT_CARD</small></span> reach me at number <span style='background-color:#ffd6a5;padding:2px;border-radius:3px;'>9 1 zero four one 9 1 one 9 two <small>PHONE</small></span>