In [4]:
import json, re, pandas as pd
from html import unescape
from pathlib import Path

# --------- Paths ---------
# Input catalogue and the two CSV outputs, all relative to the working directory.
# (The former `exists()` fallback re-assigned the very same path, so it was a
# no-op; a missing file still surfaces as FileNotFoundError when it is opened.)
input_path = Path("novel_foods_catalogue.json")

cards_out = Path("novel_foods_cards.csv")
multiv_out = Path("novel_foods_multivectors.csv")

# --------- Utils ---------
def strip_html(text):
    """Return *text* as plain text; non-string input yields an empty string.

    HTML entities are decoded first, then anything shaped like a tag
    (``<...>``) is replaced by a space, and finally every whitespace run is
    collapsed to a single space and the ends are trimmed.
    """
    if isinstance(text, str):
        decoded = unescape(text)
        without_tags = re.sub(r"<[^>]*>", " ", decoded)
        single_spaced = re.sub(r"\s+", " ", without_tags)
        return single_spaced.strip()
    return ""

def normalize_name(name):
    """Clean a name: strip HTML, collapse whitespace, trim stray separators.

    Leading/trailing spaces, commas and semicolons are removed from the result.
    """
    plain = strip_html(name)
    single_spaced = re.sub(r"\s+", " ", plain)
    return single_spaced.strip(" ,;")

# Separator pattern: a comma or semicolon plus any whitespace that follows it.
# The negative lookahead rejects a separator when the text after it reaches a
# ")" without first crossing another "(" or ")" -- i.e. when the separator sits
# inside a parenthesised group such as "(DK, FI, FR)". Whitespace *before* the
# separator is not consumed, so split pieces can keep a trailing space.
SPLIT_OUTSIDE_PAR = re.compile(r"[;,]\s*(?![^()]*\))")

# Heuristic shape of a Latin taxonomic name: one capitalised word, then one to
# three all-lowercase (optionally hyphenated) words, then any number of extra
# tokens made of letters, dots and parentheses (e.g. the "f." in
# "Abies balsamea f. balsamea").
# NOTE(review): the test is purely lexical, so a two-word English phrase with
# the same capitalisation pattern also matches.
LATIN_BINOMIAL = re.compile(r"^[A-Z][a-z]+(?:\s+[a-z\-]+){1,3}(?:\s+[A-Za-z\.\(\)]+)*$")

def is_latin_taxonomic(s: str) -> bool:
    """Tell whether *s*, once normalised, looks like a Latin taxonomic name.

    The decision is the ``LATIN_BINOMIAL`` pattern match on the cleaned text,
    with a minimum length of four characters.
    """
    candidate = normalize_name(s)
    if len(candidate) < 4:
        return False
    return LATIN_BINOMIAL.match(candidate) is not None

def parse_lang_labeled_block(raw: str):
    """Parse a block of names prefixed by two-letter uppercase language labels.

    Example: ``'EN: foo, FR: bar; DE: baz'`` ->
    ``{'EN': ['foo'], 'FR': ['bar'], 'DE': ['baz']}``.

    Each label's value runs up to the next ``XX:`` label or the end of the
    text and cannot itself contain a colon. Values are split on commas or
    semicolons that lie outside parentheses. Empty or non-string input, or
    text without any label, gives ``{}``.
    """
    if not (raw and isinstance(raw, str)):
        return {}

    labeled = re.compile(r"([A-Z]{2})\s*:\s*([^:]+?)(?=(?:[A-Z]{2}\s*:)|$)")
    by_lang = {}
    for match in labeled.finditer(normalize_name(raw)):
        code, body = match.groups()
        names = [
            normalize_name(piece)
            for piece in SPLIT_OUTSIDE_PAR.split(body)
            if piece.strip()
        ]
        if not names:
            continue
        # A label may occur more than once; keep accumulating under it.
        by_lang.setdefault(code, []).extend(names)
    return by_lang

def parse_parenthetical_langs(raw: str):
    """Parse names whose language codes trail them in parentheses.

    Examples: ``'Feijoa (DK, FI, FR)'`` ->
    ``{'DK': ['Feijoa'], 'FI': ['Feijoa'], 'FR': ['Feijoa']}``;
    ``'ginseng siberiano (ES) (PT)'`` files the name under both ``ES`` and
    ``PT``.

    The text is first split on commas/semicolons outside parentheses. A piece
    carrying no code group is filed under ``"UNK"``; a piece that is empty
    once its code groups are removed is skipped. Empty or non-string input
    gives ``{}``.
    """
    by_code = {}
    if not (raw and isinstance(raw, str)):
        return by_code

    for piece in SPLIT_OUTSIDE_PAR.split(normalize_name(raw)):
        piece = piece.strip()
        if not piece:
            continue

        # The name is whatever is left after removing every "(XX, YY)" group.
        bare = re.sub(r"\s*\(([A-Z]{2}(?:\s*,\s*[A-Z]{2})*)\)", "", piece).strip()
        label = normalize_name(bare)
        if not label:
            continue

        tagged = re.findall(r"\(([A-Z]{2}(?:\s*,\s*[A-Z]{2})*)\)", piece)
        if not tagged:
            by_code.setdefault("UNK", []).append(label)
            continue

        for group in tagged:
            for code in group.split(","):
                by_code.setdefault(code.strip(), []).append(label)
    return by_code

def parse_common_names(raw: str):
    """Parse a common-names field into ``{language_code: [names]}``.

    Strategies are tried in order and the first non-empty result wins:

    1. ``'EN: ...'`` style labels (``parse_lang_labeled_block``);
    2. trailing parenthetical codes (``parse_parenthetical_langs``);
    3. a plain split on commas/semicolons outside parentheses, filed under
       ``"UNK"``.

    Returns ``{}`` when *raw* is falsy or no non-empty name survives cleaning.
    """
    for parser in (parse_lang_labeled_block, parse_parenthetical_langs):
        parsed = parser(raw)
        if parsed:
            return parsed
    if not raw:
        return {}

    # Filter AFTER normalising: a piece such as "<br>" is non-blank as raw text
    # but cleans down to "", and must not be reported as a name.
    cleaned = (normalize_name(piece) for piece in SPLIT_OUTSIDE_PAR.split(str(raw)))
    names = [name for name in cleaned if name]
    return {"UNK": names} if names else {}

def split_synonyms(text: str):
    """Split a synonyms field on commas/semicolons outside parentheses.

    Returns a list of cleaned, non-empty synonyms; empty or non-string input
    gives ``[]``.
    """
    if not text or not isinstance(text, str):
        return []
    # The separator pattern only swallows whitespace AFTER the delimiter, so
    # "a ; b" would otherwise produce "a " with a trailing space. Normalise
    # every piece and drop whatever ends up empty.
    pieces = (normalize_name(p) for p in SPLIT_OUTSIDE_PAR.split(normalize_name(text)))
    return [piece for piece in pieces if piece]

# --------- Load JSON ---------
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Turn the loaded records into a table.
df = pd.json_normalize(data)

# Detect columns by case-insensitive substring match; the first matching
# column wins. `common_col` and `syn_col` are optional and may stay None.
id_col     = next((c for c in df.columns if "policy" in c.lower() and "id" in c.lower()), None)
canon_col  = next((c for c in df.columns if "novel_food_name" in c.lower()), None)
common_col = next((c for c in df.columns if "common" in c.lower()), None)
syn_col    = next((c for c in df.columns if "synonym" in c.lower()), None)

# Explicit raise rather than `assert`: assertions are stripped under `python -O`,
# which would let a malformed catalogue slip through to the aggregation step.
if not (id_col and canon_col):
    raise ValueError("Missing required columns (policy_item_id & novel_food_name).")

# --------- Aggregate per entity ---------
# entities: id (as str) -> {"canonical": str, "common": {lang: [names]},
#                           "syn_lat": [names], "syn_unk": [names]}
# Rows sharing an id are merged; lists keep first-seen order without duplicates.
entities = {}
for _, row in df.iterrows():
    raw_id = row.get(id_col)
    # A missing id shows up as NaN, and str(NaN) == "nan" would slip past the
    # "None" check below and create a bogus entity -- skip it explicitly.
    if raw_id is None or (pd.api.types.is_scalar(raw_id) and pd.isna(raw_id)):
        continue
    # One NaN in an integer column upcasts the whole column to float; avoid
    # ids that stringify as "677319.0".
    if isinstance(raw_id, float) and raw_id.is_integer():
        raw_id = int(raw_id)
    eid = str(raw_id)
    if not eid or eid == "None":
        continue

    canon = normalize_name(row.get(canon_col, ""))

    # Optional columns: do not look up a None key when they were not detected.
    common = row.get(common_col) if common_col else None
    syn = row.get(syn_col) if syn_col else None

    ent = entities.setdefault(eid, {"canonical": "", "common": {}, "syn_lat": [], "syn_unk": []})
    if canon and not ent["canonical"]:
        ent["canonical"] = canon  # first non-empty

    # common names
    if isinstance(common, str) and common.strip():
        for lang, names in parse_common_names(common).items():
            bucket = ent["common"].setdefault(lang, [])
            for n in names:
                n = normalize_name(n)
                if n and n not in bucket:
                    bucket.append(n)

    # synonyms, routed to the Latin or the unknown-language list
    if isinstance(syn, str) and syn.strip():
        for s in split_synonyms(syn):
            s = normalize_name(s)
            if not s:
                continue
            bucket = ent["syn_lat"] if is_latin_taxonomic(s) else ent["syn_unk"]
            if s not in bucket:
                bucket.append(s)

# --------- Build outputs ---------
cards_rows = []   # one row per entity: id, canonical name, tagged text card
multiv_rows = []  # one row per (entity, section, item)

# The loop variable is deliberately NOT called `data`: that name holds the
# loaded JSON and was being overwritten by the last entity dict.
for eid, ent in entities.items():
    sections = []

    # canonical
    canon = ent.get("canonical", "")
    if canon:
        canon_is_latin = is_latin_taxonomic(canon)
        canon_tag = "CANON_LAT" if canon_is_latin else "CANON_EN"
        sections.append(f"[{canon_tag}] {canon}")
        multiv_rows.append({
            "policy_item_id": eid,
            "section": canon_tag,
            "language": "LATIN" if canon_is_latin else "UNK",
            "text": canon,
        })

    # common names by language (languages and names both sorted for stable output)
    for lang in sorted(ent["common"]):
        names = sorted(set(ent["common"][lang]))
        if not names:
            continue
        section = f"COMMON_{lang}"
        sections.append(f"[{section}] " + " | ".join(names))
        multiv_rows.extend(
            {"policy_item_id": eid, "section": section, "language": lang, "text": n}
            for n in names
        )

    # synonyms (LATIN first, then UNK) -- same layout, different tag/language
    for key, section, language in (
        ("syn_lat", "SYN_LAT", "LATIN"),
        ("syn_unk", "SYN_UNK", "UNK"),
    ):
        items = sorted(set(ent[key]))
        if not items:
            continue
        sections.append(f"[{section}] " + " [SEP] ".join(items))
        multiv_rows.extend(
            {"policy_item_id": eid, "section": section, "language": language, "text": s}
            for s in items
        )

    cards_rows.append({
        "policy_item_id": eid,
        "canonical": canon,
        "entity_text": "\n".join(sections),
    })

# --------- Save ---------
display_cols_cards = ["policy_item_id","canonical","entity_text"]
display_cols_mult  = ["policy_item_id","section","language","text"]

# Pass the columns explicitly: with no rows, pd.DataFrame([]) has no columns at
# all, so the CSV would lack its header and the peek below would raise KeyError.
# For non-empty input this is the same column set and order as the row dicts.
cards_df = pd.DataFrame(cards_rows, columns=display_cols_cards)
multiv_df = pd.DataFrame(multiv_rows, columns=display_cols_mult)

cards_df.to_csv(cards_out, index=False, encoding="utf-8")
multiv_df.to_csv(multiv_out, index=False, encoding="utf-8")

print(f"✅ Cards written: {cards_out}  (rows: {len(cards_df)})")
print(f"✅ Multi-vectors written: {multiv_out}  (rows: {len(multiv_df)})")

# Optional peek
print(cards_df[display_cols_cards].head(3).to_string(index=False))
print(multiv_df[display_cols_mult].head(6).to_string(index=False))


✅ Cards written: novel_foods_cards.csv  (rows: 863)
✅ Multi-vectors written: novel_foods_multivectors.csv  (rows: 1821)
policy_item_id                    canonical                             entity_text
        677319 3, 3'-Diindolylmethane (DIM) [CANON_EN] 3, 3'-Diindolylmethane (DIM)
        677344         4-hydroxy isoleucine         [CANON_EN] 4-hydroxy isoleucine
        677369          5-hydroxytryptophan          [CANON_EN] 5-hydroxytryptophan
policy_item_id   section language                         text
        677319  CANON_EN      UNK 3, 3'-Diindolylmethane (DIM)
        677344  CANON_EN      UNK         4-hydroxy isoleucine
        677369  CANON_EN      UNK          5-hydroxytryptophan
        677449 CANON_LAT    LATIN               Abies balsamea
        677449   SYN_LAT    LATIN   Abies balsamea f. balsamea
        677479 CANON_LAT    LATIN              Abies pectinata
