# NLP

In [None]:
# === Anchor-based, single-CSV pipeline (YAML-faithful + acronym-safe) ===

import os, re, json, yaml, string, difflib, bisect
from typing import Dict, Iterable, List, Tuple, Optional
from dataclasses import dataclass
from collections import Counter
import pandas as pd
from tqdm import tqdm

# ---------------- NLTK bootstrap ----------------
import nltk
def _ensure_nltk():
    for res, pkg in [
        ("tokenizers/punkt","punkt"),
        ("corpora/wordnet","wordnet"),
        ("corpora/omw-1.4","omw-1.4"),
    ]:
        try: nltk.data.find(res)
        except LookupError: nltk.download(pkg, quiet=True)
_ensure_nltk()
from nltk.stem import WordNetLemmatizer
_lem = WordNetLemmatizer()

# ---------------- Normalisation & compounding ----------------
def load_cpwords(py_path: str) -> Dict[str,str]:
    env = {}
    with open(py_path, "r", encoding="utf-8") as f:
        code = f.read()
    exec(compile(code, py_path, "exec"), env, env)
    if "compound_keywords" not in env or not isinstance(env["compound_keywords"], dict):
        raise ValueError("compound_keywords not found in cpwords file")
    out = {}
    for k, v in env["compound_keywords"].items():
        kk = str(k).strip().lower()
        vv = str(v).strip().lower().replace("-", " ").replace(" ", "_")
        if kk: out[kk] = vv
    return out

def _collect_multiword_phrases_from_ontology(ont: dict) -> List[str]:
    out = set()
    def add_if_multi(s):
        s = str(s).strip()
        if s and (" " in s or "-" in s):
            out.add(s.lower())
    for v in (ont or {}).values():
        if isinstance(v, dict):
            for vv in v.values():
                if isinstance(vv, (list, tuple, set)):
                    for t in vv: add_if_multi(t)
                else:
                    add_if_multi(vv)
        elif isinstance(v, (list, tuple, set)):
            for t in v: add_if_multi(t)
        else:
            add_if_multi(v)
    return sorted(out, key=len, reverse=True)

def compile_compounds_regex(phrases: Iterable[str], cpwords_map: Dict[str,str]) -> re.Pattern:
    base = set(p.strip().lower() for p in phrases if p and p.strip())
    base |= set(k.strip().lower() for k in (cpwords_map or {}).keys())
    if not base: return re.compile(r"(?!x)x")
    toks = sorted(base, key=len, reverse=True)
    pat = r"\b(" + "|".join(re.escape(p) for p in toks) + r")\b"
    return re.compile(pat, re.I)

def apply_compounds(text: str, compounds_rx: re.Pattern, cpwords_map: Dict[str,str]) -> str:
    def repl(m):
        key = m.group(1).lower()
        return cpwords_map.get(key, key.replace("-", "_").replace(" ", "_"))
    return compounds_rx.sub(repl, text)

def canonicalise_term(term: str) -> str:
    t = term.lower().strip().replace("-", "_").replace(" ", "_")
    return re.sub(r"_+", "_", t)

def normalize(text: str, compounds_rx: Optional[re.Pattern], cpwords_map: Dict[str,str]) -> str:
    if not text: return ""
    t = text.lower()
    if compounds_rx is not None:
        t = apply_compounds(t, compounds_rx, cpwords_map)
    keep_us = string.punctuation.replace("_","")
    t = re.sub(rf"[{re.escape(keep_us)}]+", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    t = " ".join(_lem.lemmatize(w) for w in t.split())
    return t


def any_regex_hits(text: str, regs: List[re.Pattern]) -> List[str]:
    """collect raw matched strings for a list of regexes"""
    out = []
    for rx in regs:
        out += [m.group(0) for m in rx.finditer(text)]
    return out
# ---------------- Sentence split & token windows ----------------
_SENT_SPLIT = re.compile(r'(?<=[\.\?\!])\s+')
def sents(text: str) -> List[str]:
    try:
        from nltk.tokenize import sent_tokenize
        return sent_tokenize(text or "")
    except Exception:
        return _SENT_SPLIT.split(text or "")

def tokenize(text: str):
    tokens, starts = [], []
    for m in re.finditer(r"\S+", text):
        tokens.append(m.group(0))
        starts.append(m.start())
    return tokens, starts

def window_from_charspan(text: str, tokens: List[str], starts: List[int], span: Tuple[int,int], k: int) -> str:
    i = bisect.bisect_right(starts, span[0]) - 1
    if i < 0: i = 0
    lo = max(0, i - k); hi = min(len(tokens)-1, i + k)
    return " ".join(tokens[lo:hi+1])
###
# ---- STRICT token-boundary helpers (avoid 'ga*' → gap/gas/gauge/gain) ----
_ACRO_LEFT  = r"(?<![A-Za-z0-9_])"
_ACRO_RIGHT = r"(?![A-Za-z0-9_])"

def build_phrase_regexes_strict(terms: Iterable[str]) -> List[re.Pattern]:
    """
    Build regexes for phrases (multi-word or tokens len>=4).
    Single short tokens are handled separately (acronym logic).
    """
    regs, seen = [], set()
    for t in (terms or []):
        v = canonicalise_term(t)
        if not v:
            continue
        # skip short single tokens (they'll be handled as acronyms)
        if re.fullmatch(r"[a-z0-9_]+", v) and "_" not in v and len(v) <= 3:
            continue
        key = ("strict", v)
        if key in seen:
            continue
        seen.add(key)
        regs.append(re.compile(r"\b" + _flexify(v) + r"\b", re.I))
    return regs

def build_acronym_regexes_exact(raw_terms: Iterable[str]) -> List[re.Pattern]:
    """
    Acronyms on RAW text, case-sensitive exact token: GA, MPC, PID, RL, DRL, PSO, PMV, PPD, CO2, PM2.5, ...
    """
    regs = []
    for t in (raw_terms or []):
        s = str(t).strip()
        if not s or " " in s:
            continue
        # keep shortish tokens or with digits/dot
        if len(s) <= 6 or re.search(r"[0-9.]", s):
            pat = _ACRO_LEFT + re.escape(s.upper()) + _ACRO_RIGHT
            regs.append(re.compile(pat))
    return regs

def build_lower_acronym_regexes_norm(norm_terms: Iterable[str]) -> List[re.Pattern]:
    """
    Exact-token acronyms (len >= 3) on NORMALIZED text (case-insensitive),
    to catch papers that write acronyms in lowercase (e.g., 'pso').
    We *do not* include 2-letter tokens here to avoid noise.
    """
    regs = []
    for t in (norm_terms or []):
        v = canonicalise_term(t)
        if not v or " " in v or "_" in v:
            continue
        if len(v) >= 3:  # 3+ letters only
            regs.append(re.compile(_ACRO_LEFT + re.escape(v) + _ACRO_RIGHT, re.I))
    return regs

# ---------------- Regex helpers ----------------
SEPARATOR_FLEX = r"[ _-]+"
def _flexify(term: str) -> str:
    t = (term or "").strip()
    t = re.escape(t)
    t = re.sub(r"\\\s+", SEPARATOR_FLEX, t)
    return t + r"\w*"

def build_regexes_from_terms(terms: Iterable[str], require_min_ngram=None) -> List[re.Pattern]:
    regs, seen = [], set()
    for t in (terms or []):
        v = canonicalise_term(t)
        if not v: continue
        if require_min_ngram and len(re.split(r"[ _-]+", v)) < require_min_ngram:
            continue
        key = ("rx", v)
        if key in seen: continue
        seen.add(key)
        regs.append(re.compile(r"\b" + _flexify(v) + r"\b", re.I))
    return regs

# ---------------- Action/performance gates ----------------
VERBS_RX     = re.compile(r"\b(model(?:ing|led|ed|s)?|estimate(?:d|s|r)?|predict(?:ed|s|ive)?|analy[sz]e(?:d|s|r)?|simulate(?:d|s)?)\b", re.I)
FAVOURED_RX  = re.compile(r"\b(performance|accuracy|efficienc\w*|load|consumption|demand|baseline|benchmark(?:ing)?|error|mae|rmse|mape)\b", re.I)
MODEL_WORD_RX= re.compile(r"\bmodel(?:s|ing|led|ed)?\b", re.I)
GENERIC_OPT_RX = re.compile(r"\b(optimi[sz]e?(?:d|s|r)?|optimi[sz]ation|multi-?objective)\b", re.I)

# ---------------- Gappy “<term> … model” support ----------------
GAP_MAX   = 3   # words allowed between term and "model"
WORD_WINDOW = 15# ± words around anchor
MERGE_SIM = 0.90# similarity threshold for merging

# ----- General gappy "<core> ... <SUFFIX>" helpers -----

TERM_SUFFIX_RX = r"(?:-?(?:based|driven|guided|assisted|enabled|oriented|like|alike))?"  # allowed after the *core*

_SUFFIX_PATTERNS = {
    "model": r"model\w*",
    "optimization": r"optimi[sz]\w*",
    "control": r"control\w*",
}

def _sep_flexify(s: str) -> str:
    s = canonicalise_term(s)
    parts = [re.escape(p) for p in s.split("_") if p]
    if not parts: return ""
    return r"(?:%s)" % r"[ _-]+".join(parts)

def _split_core_and_suffix(term: str, suffix_key: str) -> Tuple[str, str]:
    """
    If term already contains the suffix token (e.g. 'linear_model', 'multi_objective_optimization'),
    split into (core, suffix). Otherwise, treat whole term as core (suffix empty).
    """
    t = canonicalise_term(term)
    toks = [p for p in t.split("_") if p]
    suf = suffix_key
    # find last index where token equals the suffix (e.g., 'model', 'optimization', 'control')
    idx = None
    for i in range(len(toks)-1, -1, -1):
        if toks[i] == suf:
            idx = i; break
    if idx is None:
        return t, ""   # no explicit suffix present
    core = "_".join(toks[:idx]) if idx > 0 else ""
    return core, suf

def build_gappy_suffix_regexes(terms: Iterable[str], suffix_key: str, gap_max: int = GAP_MAX) -> List[re.Pattern]:
    """
    Build regexes that match:  <core>(-based/...)? (<=gap words) <suffix>
    - If term already has the suffix, we treat tokens *before* suffix as <core>
    - If term has no suffix, we use the *entire* term as <core> and still require suffix at the end
    """
    regs = []
    suffix_pat = _SUFFIX_PATTERNS[suffix_key]  # e.g., model\w*
    for t in set(canonicalise_term(x) for x in (terms or []) if str(x).strip()):
        core, suf = _split_core_and_suffix(t, suffix_key)
        if not core:
            # fall back: if they wrote exactly 'model' alone, ignore; otherwise require suffix with gap
            core = t if t != suffix_key else ""
        if not core:
            continue
        core_pat = _sep_flexify(core)
        pat = (
            r"\b" + core_pat + TERM_SUFFIX_RX +             # core + optional '-based' etc.
            r"(?:\W+\w+){0," + str(gap_max) + r"}" +        # up to N words between
            r"\W+" + suffix_pat + r"\b"                     # required suffix token
        )
        regs.append(re.compile(pat, re.I))
    return regs

def extract_core_from_gappy_hit(hit: str, suffix_key: str) -> str:
    """
    From a matched '<core>(-based)? ... <suffix>' return canonicalised <core>.
    We trim everything from the suffix rightwards, then strip any trailing gap words/suffix-like endings.
    """
    s = hit.lower()
    # split at the *first* occurrence of the suffix family
    suf_pat = re.compile(_SUFFIX_PATTERNS[suffix_key], re.I)
    m = suf_pat.search(s)
    if not m:
        return canonicalise_term(s)
    core = s[:m.start()]
    # drop trailing gap words and core-suffix artefacts
    core = re.sub(r"(?:\W+\w+){0," + str(GAP_MAX) + r"}$", "", core)
    core = re.sub(TERM_SUFFIX_RX + r"$", "", core)
    return canonicalise_term(core)

# ---------------- 90% merge for near-duplicates ----------------
def _sim(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()

def merge_counter_by_similarity(counter: "Counter[str]", threshold: float = MERGE_SIM) -> "Counter[str]":
    items = [(canonicalise_term(k), v) for k, v in counter.items()]
    
    items.sort(key=lambda kv: len(kv[0]))
    reps, used = [], set()
    for i, (k, v) in enumerate(items):
        if i in used: continue
        cluster = [(k, v)]; used.add(i)
        for j in range(i+1, len(items)):
            if j in used: continue
            k2, v2 = items[j]
            if (len(k) > 6 or len(k2) > 6):
                sim_thresh = 0.85
            elif len(k) <= 3 or len(k2) <= 3:
                sim_thresh = 0.95
            else:
                sim_thresh = threshold
            if _sim(k, k2) >= sim_thresh or k in k2 or k2 in k:
                cluster.append((k2, v2)); used.add(j)
        rep = min((t for t,_ in cluster), key=len)
        reps.append((rep, sum(c for _, c in cluster)))
    out = Counter()
    for k, v in reps: out[k] += v
    return out

def merge_set_by_similarity(terms: Iterable[str], threshold: float = MERGE_SIM) -> List[str]:
    cnt = Counter(canonicalise_term(t) for t in terms if str(t).strip())
    merged = merge_counter_by_similarity(cnt, threshold)
    return sorted(merged.keys())

# ---------------- Optional Word2Vec expansion (scale only) ----------------
try:
    from gensim.models import Word2Vec as GensimWord2Vec
except Exception:
    GensimWord2Vec = None

@dataclass
class NLPConfig:
    mode: str = "auto"   # off|light|full|auto
    model_path: str = ""
    exp_topn: int = 40
    exp_thresh: float = 0.70
    exp_min_count: int = 10
    context_k: int = 1

def load_nlp_config(path_or_ontology_yaml: Optional[str]) -> NLPConfig:
    cfg = NLPConfig()
    if path_or_ontology_yaml and os.path.exists(path_or_ontology_yaml):
        with open(path_or_ontology_yaml, "r", encoding="utf-8") as f:
            y = yaml.safe_load(f) or {}
        y = y.get("_nlp", y)
        cfg.mode = str(y.get("mode", cfg.mode)).lower()
        cfg.model_path = str(y.get("model_path", cfg.model_path))
        cfg.exp_topn = int(y.get("exp_topn", cfg.exp_topn))
        cfg.exp_thresh = float(y.get("exp_thresh", cfg.exp_thresh))
        cfg.exp_min_count = int(y.get("exp_min_count", cfg.exp_min_count))
        cfg.context_k = int(y.get("context_k", cfg.context_k))
    return cfg

def resolve_nlp_mode(cfg: NLPConfig) -> str:
    if cfg.mode in {"off","light","full"}: return cfg.mode
    if GensimWord2Vec is None: return "light"
    candidates = [cfg.model_path] if cfg.model_path else []
    candidates += ["model.word2vec","model.w2v","word2vec.model", os.path.join(os.getcwd(),"word2vec.model")]
    for p in candidates:
        if p and os.path.exists(p):
            try:
                GensimWord2Vec.load(p); return "full"
            except Exception:
                continue
    return "light"

def expand_terms_w2v(model, seeds: List[str], topn=40, thresh=0.70, min_count=10) -> List[str]:
    out = set(canonicalise_term(s) for s in seeds if s)
    if not model: return sorted(out)
    for t in list(out):
        if t in model.wv:
            for cand, sim in model.wv.most_similar(t, topn=topn):
                if sim < thresh: continue
                try:
                    if model.wv.get_vecattr(cand,"count") < min_count: continue
                except Exception: pass
                if any(ch.isdigit() for ch in cand): continue
                out.add(canonicalise_term(cand))
    return sorted(out)

# ---------------- IO ----------------
def load_jsons(dirpath):
    for fn in os.listdir(dirpath):
        if fn.endswith(".json"):
            p = os.path.join(dirpath, fn)
            try:
                with open(p, "r", encoding="utf-8") as f:
                    yield fn, json.load(f)
            except Exception as e:
                print(f"[WARN] failed to read {fn}: {e}")

# ---------------- Acronym-safe matchers (use RAW text) ----------------
_ACRO_LEFT  = r"(?<![A-Za-z0-9_])"
_ACRO_RIGHT = r"(?![A-Za-z0-9_])"

def pick_acronyms(terms: Iterable[str]) -> List[str]:
    acr = []
    for t in (terms or []):
        s = str(t).strip()
        if not s: continue
        if " " in s: continue          # spaces → phrase, not acronym
        # keep shortish tokens or mixed with digits/dots (e.g., PM2.5)
        if len(s) <= 6 or re.search(r"[0-9.]", s):
            acr.append(s)
    return acr

def build_acronym_regexes(terms: Iterable[str]) -> List[re.Pattern]:
    regs = []
    for t in pick_acronyms(terms):
        pat = _ACRO_LEFT + re.escape(t.upper()) + _ACRO_RIGHT
        regs.append(re.compile(pat))    # case-sensitive on raw text
    return regs
# ---------------- YAML-faithful category counting (acronym-safe) ----------------
def count_yaml_category(raw_text: str, norm_text: str, terms: Iterable[str],
                        suffix_hint: Optional[str] = None) -> Counter:
    """
    YAML-faithful counting that avoids acronym noise:
      - multi-word phrases & tokens len>=4 -> flexible phrase regex on normalized text
      - acronyms (single tokens len<=3 or with digits) -> exact-token on RAW (case-sensitive)
      - 3+ letter acronyms also matched exactly on normalized text (case-insensitive)
      - optional gappy '<core> ... <suffix>' (e.g., optimization/control) on normalized text
    """
    cnt = Counter()

    # 1) phrases (multi-word or len>=4) on normalized text
    phrase_rx = build_phrase_regexes_strict(terms)
    for rx in phrase_rx:
        for m in rx.finditer(norm_text):
            cnt[canonicalise_term(m.group(0))] += 1

    # 2) acronyms on raw text (2–6 chars or contains digits/dot), case-sensitive exact
    acro_rx = build_acronym_regexes_exact(terms)
    for rx in acro_rx:
        for m in rx.finditer(raw_text):
            cnt[canonicalise_term(m.group(0))] += 1

    # 3) lower-case acronyms (len>=3) on normalized text, exact token, case-insensitive
    acro_lower_rx = build_lower_acronym_regexes_norm(terms)
    for rx in acro_lower_rx:
        for m in rx.finditer(norm_text):
            cnt[canonicalise_term(m.group(0))] += 1

    # 4) optional gappy suffix (e.g., '... optimization' / '... control') on normalized text
    if suffix_hint in _SUFFIX_PATTERNS:
        gappy_rx = build_gappy_suffix_regexes(terms, suffix_key=suffix_hint, gap_max=GAP_MAX)
        for rx in gappy_rx:
            for m in rx.finditer(norm_text):
                core = extract_core_from_gappy_hit(m.group(0), suffix_key=suffix_hint)
                cnt[core] += 1

    return merge_counter_by_similarity(cnt, threshold=MERGE_SIM)


# ---------------- Paradigm term-only regex builder ----------------
def build_simple_paradigm_term_rx(paradigms: dict) -> dict:
    out = {}
    for pname, terms in (paradigms or {}).items():
        canon_terms = [canonicalise_term(t) for t in (terms or [])]
        out[pname] = build_regexes_from_terms(canon_terms, require_min_ngram=1)
    return out
# ---------------- Merging ranked columns ----------------
def _merge_ranked_cols(cols: dict) -> str:
    """
    Merge several *_ranked strings like 'term_a (3);term_b (1)' into one,
    summing counts and sorting by count desc, then name.
    """
    merged = Counter()
    for _, v in (cols or {}).items():
        if not v or v == "NM":
            continue
        for token in v.split(";"):
            token = token.strip()
            if not token:
                continue
            if "(" in token and token.endswith(")"):
                name, count = token.rsplit("(", 1)
                try:
                    c = int(count.strip(") ").strip())
                except:
                    c = 1
                merged[name.strip()] += c
            else:
                merged[token] += 1
    if not merged:
        return "NM"
    items = sorted(merged.items(), key=lambda kv: (-kv[1], kv[0]))
    return ";".join(f"{t} ({n})" for t, n in items)

def load_alias_map(ont: dict) -> Dict[str, str]:
    """Load a global alias→canonical map; both sides are underscore-canonicalised."""
    amap = {}
    for k, v in (ont.get("_alias_map", {}) or {}).items():
        if v is None:  # allow disabling an alias by mapping to null
            continue
        amap[canonicalise_term(k)] = canonicalise_term(v)
    return amap

def collapse_counter_aliases(counter: Counter, alias_map: Dict[str, str]) -> Counter:
    """
    Move counts from alias keys to their canonical target per alias_map.
    Then apply the usual 0.90 similarity merge.
    """
    if not alias_map:
        return merge_counter_by_similarity(counter, threshold=MERGE_SIM)
    out = Counter(counter)
    for k, c in list(out.items()):
        can = alias_map.get(canonicalise_term(k))
        if can and can != k:
            out[can] += c
            del out[k]
    return merge_counter_by_similarity(out, threshold=MERGE_SIM)

def map_terms_set(terms: Iterable[str], alias_map: Dict[str, str]) -> List[str]:
    """Apply alias_map to a set/list of strings (no counts)."""
    mapped = []
    for t in (terms or []):
        ct = canonicalise_term(t)
        mapped.append(alias_map.get(ct, ct))
    # de-dup then apply similarity merge on names
    return merge_set_by_similarity(mapped, threshold=MERGE_SIM)
# ---------------- Core analysis (anchor-based + YAML-faithful cats) ----------------
def analyze_anchor_based_single(doc: dict,
                                ont: dict,
                                compounds_rx: Optional[re.Pattern],
                                cp_map: Dict[str,str],
                                cfg: NLPConfig,
                                w2v,
                                alias_map: Dict[str, str]) -> dict:
    raw = (doc.get("full-text-retrieval-response", {}) or {}).get("originalText","") or ""
    text = normalize(raw, compounds_rx, cp_map)  # normalized (lowercased)
    S = sents(text)
    tokens, starts = tokenize(text)

    # Ontology buckets
    scale       = ont.get("scale", {}) or {}
    paradigms   = ont.get("model_paradigm", {}) or {}
    optim_cats  = ont.get("optimization_methods", {}) or {}
    app_cats    = ont.get("applications", {}) or {}
    data_cats   = ont.get("data_types", {}) or {}
    
    # Canonicalised seeds (+ optional W2V) for scale only
    def seeds(key):
        base = [canonicalise_term(x) for x in (scale.get(key, []) or [])]
        return expand_terms_w2v(w2v, base, cfg.exp_topn, cfg.exp_thresh, cfg.exp_min_count) if w2v else base

    building_terms = seeds("building_model")
    system_terms   = seeds("system_model")
    climate_terms  = seeds("climate_model")
    occup_terms    = seeds("occupancy_model")

    # Finders for scale terms (need spans)
    def rx_find_terms(terms):
        if not terms: return None
        pats = [r"\b" + _flexify(t) + r"\b" for t in set(terms)]
        return re.compile("|".join(pats), re.I)

    RX_FIND = {
        "building": rx_find_terms(building_terms),
        "system":   rx_find_terms(system_terms),
        "weather":  rx_find_terms(climate_terms),
        "occupancy":rx_find_terms(occup_terms),
    }

    # Paradigm & optimisation matchers
    par_rx        = {k: build_regexes_from_terms(v, require_min_ngram=1) for k, v in (paradigms or {}).items()}
    par_gappy_rx  = {k: build_gappy_suffix_regexes(v, suffix_key="model", gap_max=GAP_MAX)
                 for k, v in (paradigms or {}).items()}

    par_term_rx   = build_simple_paradigm_term_rx(paradigms)
    opt_rx        = {k: build_regexes_from_terms(v, require_min_ngram=1) for k, v in (optim_cats or {}).items()}

    # Containers
    term_counts = {c: Counter() for c in RX_FIND.keys()}
    par_found   = {c: set() for c in RX_FIND.keys()}
    par_terms   = {c: set() for c in RX_FIND.keys()}
    opt_methods, opt_terms = set(), set()

    # PRIMARY path — scale-term windows
    for cat, rx in RX_FIND.items():
        if rx is None: continue
        for m in rx.finditer(text):
            w = window_from_charspan(text, tokens, starts, m.span(), k=WORD_WINDOW)
            if not (VERBS_RX.search(w) or FAVOURED_RX.search(w)):
                continue
            term_counts[cat][canonicalise_term(m.group(0))] += 1

            # Paradigms in this window → ONLY this category
            for pname in par_rx.keys():
                plain_hits = []
                for rxx in par_rx[pname]:
                    plain_hits += [canonicalise_term(h.group(0)) for h in rxx.finditer(w)]
                gappy_strings = any_regex_hits(w, par_gappy_rx.get(pname, []))
                gappy_terms   = [extract_core_from_gappy_hit(h, suffix_key="model") for h in gappy_strings]
                belongs_via_gappy = False
                if gappy_terms and par_term_rx.get(pname):
                    for t_ in gappy_terms:
                        if any(r.search(t_) for r in par_term_rx[pname]):
                            belongs_via_gappy = True; break
                if plain_hits or belongs_via_gappy:
                    par_found[cat].add(pname)
                    par_terms[cat].update(plain_hits)
                    par_terms[cat].update(gappy_terms)

            # Optimisation phrases in this window (global list)
            for oname, regs in opt_rx.items():
                found = []
                for rxx in regs:
                    found += [canonicalise_term(h.group(0)) for h in rxx.finditer(w)]
                if found:
                    opt_methods.add(oname); opt_terms.update(found)
            g = GENERIC_OPT_RX.search(w)
            if g:
                opt_methods.add("generic"); opt_terms.add(canonicalise_term(g.group(0)))

    # FLEX path — paradigm anchored; map to categories if window shows 'model' + category hints
    CATEGORY_HINT = {
        "building":  re.compile(r"\b(building|facility|premise|asset|whole[_ ]building|zone|envelope|bem|ubem)\b", re.I),
        "system":    re.compile(r"\b(system|hvac|ahu|doas|vav|fcu|coil|chiller|boiler|pump|tower|vr[fb]|heat[_ ]pump)\b", re.I),
        "occupancy": re.compile(r"\b(occupanc\w*|occupant\w*|people|tenant|user)\b", re.I),
        "weather":   re.compile(r"\b(weather|climate|outdoor|ambient|meteorolog\w*)\b", re.I),
    }
    ALL_PAR_RXS = [r for regs in par_rx.values() for r in regs]

    for rx in ALL_PAR_RXS:
        for m in rx.finditer(text):
            w = window_from_charspan(text, tokens, starts, m.span(), k=WORD_WINDOW)
            if not (MODEL_WORD_RX.search(w) and (VERBS_RX.search(w) or FAVOURED_RX.search(w))):
                continue
            for cat in ["building","system","occupancy","weather"]:
                hint_ok  = CATEGORY_HINT[cat].search(w) is not None
                scale_ok = (RX_FIND[cat] is not None) and (RX_FIND[cat].search(w) is not None)
                if not (hint_ok or scale_ok): continue

                pstr = canonicalise_term(m.group(0))
                all_gappy_strings = any_regex_hits(w, [rgx for lst in par_gappy_rx.values() for rgx in lst])
                ext_terms = [extract_core_from_gappy_hit(h, suffix_key="model") for h in all_gappy_strings]


                matched_buckets = set()
                for pname, regs in par_rx.items():
                    if any(r.search(m.group(0)) for r in regs):
                        matched_buckets.add(pname)
                for pname, regs in par_term_rx.items():
                    if any(r.search(t_) for r in regs for t_ in ([pstr] + ext_terms)):
                        matched_buckets.add(pname)

                for pname in matched_buckets:
                    par_found[cat].add(pname)
                par_terms[cat].add(pstr)
                par_terms[cat].update(ext_terms)

    # Merge near-duplicates (scale/paradigm terms)
    for cat in term_counts.keys():
        term_counts[cat] = merge_counter_by_similarity(term_counts[cat], threshold=MERGE_SIM)
        par_terms[cat]   = set(merge_set_by_similarity(par_terms[cat], threshold=MERGE_SIM))

    # ---------------- YAML-faithful counts for optimization/applications/data_types ----------------
    # We do these at document-level (no window), with acronym-safe matching.

    opt_ranked_cols = {}
    for cat_name, terms in (optim_cats or {}).items():
        c = count_yaml_category(raw, text, terms, suffix_hint="optimization")
        opt_ranked_cols[f"optimization_{canonicalise_term(cat_name)}_ranked"] = (
            "NM" if not c else ";".join(f"{t} ({n})" for t, n in sorted(c.items(), key=lambda kv:(-kv[1], kv[0])))
        )

# ----- OPTIMIZATION (suffix = optimization) -----
    opt_ranked_cols = {}
    for cat_name, terms in (optim_cats or {}).items():
        c = count_yaml_category(raw, text, terms, suffix_hint="optimization")
        c = collapse_counter_aliases(c, alias_map)  # <<< collapse aliases
        opt_ranked_cols[f"optimization_{canonicalise_term(cat_name)}_ranked"] = (
            "NM" if not c else ";".join(f"{t} ({n})" for t, n in sorted(c.items(), key=lambda kv: (-kv[1], kv[0])))
        )


    app_ranked_cols = {}
    for cat_name, terms in (app_cats or {}).items():
        hint = "control" if ("control" in canonicalise_term(cat_name)
                            or any(canonicalise_term(t).endswith("_control") for t in (terms or []))) else None
        c = count_yaml_category(raw, text, terms, suffix_hint=hint)
        c = collapse_counter_aliases(c, alias_map)  # <<< collapse aliases
        app_ranked_cols[f"applications_{canonicalise_term(cat_name)}_ranked"] = (
            "NM" if not c else ";".join(f"{t} ({n})" for t, n in sorted(c.items(), key=lambda kv: (-kv[1], kv[0])))
        )


    data_ranked_cols = {}
    for cat_name, terms in (data_cats or {}).items():
        c = count_yaml_category(raw, text, terms, suffix_hint=None)
        c = collapse_counter_aliases(c, alias_map)  # <<< collapse aliases
        data_ranked_cols[f"data_{canonicalise_term(cat_name)}_ranked"] = (
            "NM" if not c else ";".join(f"{t} ({n})" for t, n in sorted(c.items(), key=lambda kv: (-kv[1], kv[0])))
        )

    # Assemble row
    def ranked_pairs(counter: Counter) -> str:
        if not counter: return "NM"
        items = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
        return ";".join(f"{t} ({c})" for t, c in items)

    def dedup_join(xs: Iterable[str]) -> str:
        vals = [canonicalise_term(x) for x in (xs or []) if str(x).strip()]
        return ";".join(sorted(set(vals))) if vals else "NM"

    row = {
    "building_terms_ranked":  ranked_pairs(term_counts["building"]),
    "system_terms_ranked":    ranked_pairs(term_counts["system"]),
    "weather_terms_ranked":   ranked_pairs(term_counts["weather"]),
    "occupancy_terms_ranked": ranked_pairs(term_counts["occupancy"]),
    "building_paradigms":       dedup_join(par_found["building"]),
    "system_paradigms":         dedup_join(par_found["system"]),
    "weather_paradigms":        dedup_join(par_found["weather"]),
    "occupancy_paradigms":      dedup_join(par_found["occupancy"]),
    "building_paradigm_terms":  dedup_join(par_terms["building"]),
    "system_paradigm_terms":    dedup_join(par_terms["system"]),
    "weather_paradigm_terms":   dedup_join(par_terms["weather"]),
    "occupancy_paradigm_terms": dedup_join(par_terms["occupancy"]),
    "optimization_methods":     dedup_join(opt_methods),
    "optimization_method_terms":dedup_join(opt_terms),
    # merged summaries the user wants
    "applications_merged": _merge_ranked_cols(app_ranked_cols),
    "data_types_merged":   _merge_ranked_cols(data_ranked_cols),
    }
    row.update(opt_ranked_cols)   # keep per-optimization-subgroup columns
    # (If you *don’t* want the per-subgroup app/data columns, do NOT add app_ranked_cols / data_ranked_cols here)


    row.update(opt_ranked_cols)
    row.update(app_ranked_cols)
    row.update(data_ranked_cols)
    row["applications_merged"] = _merge_ranked_cols(app_ranked_cols)
    row["data_types_merged"]   = _merge_ranked_cols(data_ranked_cols)

    return row

# ---------------- Runner (writes a single CSV) ----------------
def run_anchor_based(
    input_dir: str,
    ontology_path: str,
    cpwords_path: Optional[str] = None,
    anchor_csv: str = "nlp_output/anchor_based.csv",
    nlp: str = "auto",
    config_path: Optional[str] = None,
    verbose: bool = True,
) -> pd.DataFrame:
    with open(ontology_path, "r", encoding="utf-8") as f:
        ont = yaml.safe_load(f) or {}
    alias_map = load_alias_map(ont)
    cfg = load_nlp_config(config_path or ontology_path)
    if nlp: cfg.mode = nlp
    mode = resolve_nlp_mode(cfg)
    if verbose: print(f"[NLP] mode = {mode}")

    # Word2Vec (full)
    w2v = None
    if mode == "full" and GensimWord2Vec is not None:
        candidates = [cfg.model_path] if cfg.model_path else []
        candidates += ["model.word2vec","model.w2v","word2vec.model", os.path.join(os.getcwd(),"word2vec.model")]
        for p in candidates:
            if p and os.path.exists(p):
                try:
                    w2v = GensimWord2Vec.load(p)
                    if verbose: print(f"[NLP] loaded Word2Vec: {p}")
                    break
                except Exception as e:
                    if verbose: print(f"[WARN] could not load {p}: {e}")
        if w2v is None:
            if verbose: print("[WARN] falling back to light mode")
            mode = "light"

    # Compounding sources
    cp_map = {}
    if cpwords_path and os.path.exists(cpwords_path):
        try: cp_map = load_cpwords(cpwords_path)
        except Exception as e:
            if verbose: print(f"[WARN] cpwords not loaded: {e}")
    ont_multi = _collect_multiword_phrases_from_ontology(ont)
    compounds_rx = compile_compounds_regex(ont_multi, cp_map)

    # Process
    rows = []
    os.makedirs(os.path.dirname(anchor_csv) or ".", exist_ok=True)
    for fname, doc in tqdm(list(load_jsons(input_dir)), desc="Anchor-based"):
        r = analyze_anchor_based_single(doc, ont, compounds_rx, cp_map, cfg, w2v, alias_map)
        r["file"] = fname
        rows.append(r)

    # Collect dynamic YAML-driven optimization columns only
    dyn_opt_cols = []
    for cat_name in (ont.get("optimization_methods", {}) or {}).keys():
        dyn_opt_cols.append(f"optimization_{canonicalise_term(cat_name)}_ranked")

    cols_fixed = [
        "file",
        "building_terms_ranked","system_terms_ranked","weather_terms_ranked","occupancy_terms_ranked",
        "building_paradigms","system_paradigms","weather_paradigms","occupancy_paradigms",
        "building_paradigm_terms","system_paradigm_terms","weather_paradigm_terms","occupancy_paradigm_terms",
        "optimization_methods","optimization_method_terms",
        "applications_merged","data_types_merged",  # <-- the two merged columns
    ]

    cols = cols_fixed + dyn_opt_cols
    df = pd.DataFrame(rows)
    for c in cols:
        if c not in df.columns:
            df[c] = "NM"
    df = df[cols].sort_values("file")

    # ensure all columns exist even if empty
    for c in cols:
        if c not in df.columns:
            df[c] = "NM"
    df = df[cols].sort_values("file")
    df.to_csv(anchor_csv, index=False)
    if verbose: print(f"[OK] wrote {anchor_csv} ({len(df)} rows)")
    return df


In [2]:

df_summary = run_anchor_based(
    input_dir="elsevier_out/papers_ftrr",
    ontology_path="nlp_input/ontology.yaml",
    cpwords_path="nlp_input/_cpwords.py",
    anchor_csv="nlp_output/anchor_summary_2.csv",
    nlp="auto",                              # off | light | full | auto
    config_path="nlp_input/ontology.yaml",                        # or a YAML with an _nlp block
)


[NLP] mode = full
[NLP] loaded Word2Vec: word2vec.model


Anchor-based: 100%|██████████| 14/14 [00:09<00:00,  1.49it/s]

[OK] wrote nlp_output/anchor_summary_2.csv (14 rows)





In [3]:
# Show as HTML inside a Jupyter notebook
from IPython.display import display, HTML

display(HTML(df_summary['optimization_metaheuristics_and_optim_ranked'].to_frame().to_html(escape=False)))


Unnamed: 0,optimization_metaheuristics_and_optim_ranked
0,NM
1,particle_swarm_optimization (4);genetic_algorithm (2);multi_objective_function_that_could (1);quadratic_programming (1)
2,NM
3,genetic_algorithm (9);multi_objective_occupant_behavior (2)
4,NM
5,linear_programming (2)
6,NM
7,convex_optimization (3)
8,NM
9,NM


In [4]:
df_summary

Unnamed: 0,file,building_terms_ranked,system_terms_ranked,weather_terms_ranked,occupancy_terms_ranked,building_paradigms,system_paradigms,weather_paradigms,occupancy_paradigms,building_paradigm_terms,system_paradigm_terms,weather_paradigm_terms,occupancy_paradigm_terms,optimization_methods,optimization_method_terms,applications_merged,data_types_merged,optimization_reinforcement_learning_ranked,optimization_classical_control_and_mpc_ranked,optimization_metaheuristics_and_optim_ranked
0,10.1016_j.apenergy.2019.113920.json,room (21);building (10);temperature (4),acmv (19);ahu (17);coil (17);doas (11);chiller...,NM,internal (4);occupancy (3),greybox,greybox,NM,NM,kalman_filter;rc;state_space,state_space,NM,NM,classical_control_and_mpc;generic;metaheuristi...,gain;model_predictive_control;mpc;optimise;opt...,model_predictive_control (214),predicted_mean_vote (94);air_temperature (17);...,NM,model_predictive_control (190),NM
1,10.1016_j.apenergy.2020.115147.json,building (60);room (9);temperature (1);zone (1),acmv (14);accuracy (7);achieve (6);chiller (6)...,NM,occupancy (6);internal (5);heat_loads (1),blackbox,blackbox,blackbox,blackbox,ann;artificial_neural_network;machine_learning...,ann;artificial_neural_network;machine_learning...,ann,ann,classical_control_and_mpc;generic,model_predictive_control;mpc;optimization;opti...,model_predictive_control (321);performance_ana...,predicted_mean_vote (139);air_temperature (45)...,NM,model_predictive_control (287);proportional_in...,particle_swarm_optimization (4);genetic_algori...
2,10.1016_j.apenergy.2020.115426.json,room (21);temperature (3),ac (11),NM,NM,blackbox,blackbox,NM,NM,bayesian_convolutional_neural_network;bcnn;con...,bcnns;data_driven,NM,NM,reinforcement_learning,q_learning;reinforcement_learning,demand_response (2),setpoint (7);room_temperature (6);thermal_comf...,q_learning (7);reinforcement_learning (2),NM,NM
3,10.1016_j.apenergy.2021.117276.json,building (9);temperature (2),achieved (26);actual (22);accurate (11);accura...,NM,occupant_behavior (4),blackbox,blackbox,blackbox,blackbox,data_driven_approach,ann;data_driven;knn;machine_learning;statistic...,knn;machine_learning;statistical;svm;svr,ann;data_driven;knn;machine_learning;svm;svr,generic;metaheuristics_and_optim,genetic_algorithm;optimization;optimizes,performance_evaluation (2),thermal_comfort (22);setpoints (6);heater_stat...,NM,NM,genetic_algorithm (9);multi_objective_occupant...
4,10.1016_j.apenergy.2021.117987.json,room (46);air_quality (3);temperature (3);buil...,accuracy (36);coil (11);hvac (8);according (2)...,outdoor_temperature (8),occupancy (46);internal (7);occupant_behavior (1),NM,NM,NM,NM,NM,NM,NM,NM,metaheuristics_and_optim,gave,commissioning (8);performance_evaluation (2),occupancy (173);setpoint (24);thermal_comfort ...,NM,NM,NM
5,10.1016_j.apenergy.2022.119580.json,zone (63);room (31);temperature (20);building ...,achieved (8);actual (7);according (5);accuracy...,outdoor_temperature (2),internal (20);occupant_number (4);infiltration...,greybox,greybox,NM,greybox,rc,kalman_filter;rc,NM,kalman_filter;rc_model,classical_control_and_mpc;generic;metaheuristi...,gain;model_predictive_control;mpc;optimization...,model_predictive_control (47);predictive_and (2),setpoint (56);predicted_mean_vote (34);room_te...,NM,model_predictive_control (47);pi_controller (2),linear_programming (2)
6,10.1016_j.buildenv.2020.107089.json,room (17);temperature (7);zone (4);building (2),actuation (4);vav (3);according (2);accuracy (...,NM,occupancy_patterns (2);height (1),NM,NM,NM,NM,NM,NM,NM,NM,NM,NM,NM,thermal_comfort (27);air_flow (14);setpoint (1...,NM,NM,NM
7,10.1016_j.enbuild.2020.109792.json,building (3);room (3);temperature (3),accuracy (1);accurate (1);hvac (1);valve (1),NM,NM,NM,NM,NM,NM,NM,NM,NM,NM,classical_control_and_mpc,model_predictive_control,predictive (15),room_temperature (11);ambient_temperature (8),NM,model_predictive_control (13),convex_optimization (3)
8,10.1016_j.enbuild.2020.110271.json,building (49);ieq (12);temperature (12);zone (...,actual (28);across (6);account (5);accuracy (2...,NM,occupancy (2),blackbox,blackbox,NM,NM,statistical,annual;statistical,NM,NM,metaheuristics_and_optim,gap;gas,commissioning (10),occupancy (28);air_temperature (2);metering (2),NM,NM,NM
9,10.1016_j.enbuild.2020.110276.json,building (71),actual (51);pump (26);chiller (24);ahu (16);hv...,NM,occupancy (1),blackbox,NM,NM,NM,statistical_analysis,NM,NM,NM,metaheuristics_and_optim,gap;gas,commissioning (12),occupancy (22);metering (12);air_temperature (...,NM,NM,NM
