# In [None]:  (notebook cell prompt left over from export)

#!/usr/bin/env python3
"""CNL-template generator

This script takes a CSV file that already contains *research questions* and
converts each question into an *abstract Controlled-Natural-Language (CNL)
pattern* by

1. marking *Entity Chunks* (EC) and *Property Chunks* (PC) with numbered
   placeholders (EC1, PC1, …) using SpaCy and a small set of linguistic rules;
2. matching the resulting pattern against a library of CLaRO CNL templates;
3. writing out a merged CSV that contains the original question, its pattern,
   the matched template-ID (if any), and an ``Exist`` column holding the
   pandas merge indicator (``both`` when the pattern exists in the library,
   ``left_only`` otherwise).

Usage
-----

    python cnl_template_generator.py \
        --questions  research_questions.csv \
        --templates  CLaROv2.csv \
        --output     questions_with_templates.csv
"""

import argparse
import re
from pathlib import Path

import pandas as pd
import spacy

# -----------------------------
#  SpaCy initialisation
# -----------------------------

# Load the English pipeline once at import time; the chunk helpers below all
# use this module-level ``nlp`` object.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as exc:
    # An OSError from spacy.load is treated as "model package missing".
    # Re-raise with an actionable hint and keep the original error attached
    # as the explicit cause so the underlying details are not lost.
    raise RuntimeError(
        "SpaCy model 'en_core_web_sm' is not installed. "
        "Run `python -m spacy download en_core_web_sm` first."
    ) from exc

# -----------------------------
#  Chunk‑marking helpers
# -----------------------------
def _mark_chunk(cq: str, spans, chunktype: str, offset: int, counter: int):
    """Replace spans inside *cq* with the placeholder *chunktype{counter}*.

    Args:
        cq: Question text, possibly already containing earlier placeholders.
        spans: Iterable of ``(start, end)`` character positions expressed in
            the coordinates of the ORIGINAL (unmodified) text, in ascending,
            non-overlapping order.
        chunktype: Placeholder prefix, e.g. ``"EC"`` or ``"PC"``.
        offset: How many characters shorter *cq* currently is than the
            original text because of earlier replacements.
        counter: Number appended to *chunktype* to form the placeholder.

    Returns:
        A ``(text, offset)`` tuple: the text after replacement and the
        updated offset to pass to the next call.
    """
    placeholder = f"{chunktype}{counter}"
    for start, end in spans:
        cq = cq[: start - offset] + placeholder + cq[end - offset :]
        # Use the real placeholder length: counters >= 10 have more than one
        # digit, and assuming a single digit would shift every later span.
        offset += (end - start) - len(placeholder)
    return cq, offset


def extract_EC_chunks(cq: str) -> str:
    """Replace entity chunks with numbered EC placeholders.

    Three kinds of spans are replaced, from left to right:

    1. the adjective of a leading ``How + ADJ + VERB`` construction;
    2. every SpaCy noun chunk whose lower-cased text is not in the rejection
       list (question words, "type(s)", "kind(s)", "difference(s)", ...);
    3. a verb, adjective or adverb that directly precedes a final ``?``.

    Args:
        cq: The question text.

    Returns:
        *cq* with the selected spans replaced by ``EC1``, ``EC2``, ...
    """
    doc = nlp(cq)
    rejecting_ec = {
        "what","which","when","where","who","type","types","kinds","kind",
        "category","categories","difference","differences","extent","i","we",
        "respect","there","not","the main types","the possible types",
        "the types","the difference","the differences","the main categories",
    }

    counter, offset = 1, 0
    # End position (original-text coordinates) of the last replaced span.
    # The offset bookkeeping is only valid for ascending, non-overlapping
    # spans, so a span that starts before this point is skipped instead of
    # being sliced at a wrong (possibly negative) index.
    consumed_end = 0

    # Special: How + ADJ + VERB
    if (
        len(doc) >= 3
        and doc[0].text.lower() == "how"
        and doc[1].pos_ == "ADJ"
        and doc[2].pos_ == "VERB"
    ):
        adjective = doc[1]
        start, end = adjective.idx, adjective.idx + len(adjective)
        cq, offset = _mark_chunk(cq, [(start, end)], "EC", offset, counter)
        counter += 1
        consumed_end = end

    for chunk in doc.noun_chunks:
        start, end = chunk.start_char, chunk.end_char
        if consumed_end > start:
            # Overlaps text that was already replaced (e.g. a noun chunk
            # containing the "How + ADJ" adjective marked above).
            continue
        text = cq[start - offset : end - offset]
        if text.lower() in rejecting_ec:
            continue
        cq, offset = _mark_chunk(cq, [(start, end)], "EC", offset, counter)
        counter += 1
        consumed_end = end

    # Ending adjective/verb
    if len(doc) >= 3 and doc[-2].pos_ in {"VERB", "ADJ", "ADV"} and doc[-1].text == "?":
        final_token = doc[-2]
        start, end = final_token.idx, final_token.idx + len(final_token)
        if start >= consumed_end:
            cq, offset = _mark_chunk(cq, [(start, end)], "EC", offset, counter)

    return cq


def _is_auxiliary(token, chunk_token_ids):
    """Tell whether *token* is an auxiliary attached to the chunk from outside.

    True only when the token's dependency label is ``"aux"``, the token's own
    index is not among *chunk_token_ids*, and its head's index is.
    """
    if token.dep_ != "aux":
        return False
    if token.i in chunk_token_ids:
        return False
    return token.head.i in chunk_token_ids


def _get_pc_span(group: str, doc):
    """Turn one match over the ``index::POS`` string into a character span.

    *group* is a comma-separated run of ``index::POS`` items (for example
    ``"3::VERB,4::ADP"``); the indices refer to tokens of *doc*.

    Returns:
        ``(start_char, end_char, aux)`` where the characters delimit the first
        through last matched token, and *aux* is the first token of *doc*
        accepted by ``_is_auxiliary`` for the matched indices (or ``None``).
    """
    token_ids = []
    for item in group.split(","):
        index_text = item.partition("::")[0]
        token_ids.append(int(index_text))

    auxiliary = None
    for token in doc:
        if _is_auxiliary(token, token_ids):
            auxiliary = token
            break

    first_token = doc[token_ids[0]]
    last_token = doc[token_ids[-1]]
    return (first_token.idx, last_token.idx + len(last_token), auxiliary)


def _reject_subspans(spans):
    """Drop every span that is fully contained in another span.

    Args:
        spans: Iterable of tuples whose first two items are ``(begin, end)``;
            any further items are ignored.

    Returns:
        A list of ``(begin, end)`` tuples in order of first appearance, with
        contained spans removed and exact duplicates collapsed to one entry.
    """
    # De-duplicate first (dict keeps first-seen order). Without this, two
    # identical spans would each count as a sub-span of the other and both
    # would be discarded.
    unique = list(dict.fromkeys((span[0], span[1]) for span in spans))
    return [
        (beg, end)
        for beg, end in unique
        if not any(
            (o_beg, o_end) != (beg, end) and o_beg <= beg and end <= o_end
            for o_beg, o_end in unique
        )
    ]


def get_PCs_as_spans(cq: str):
    """Locate property-chunk candidates in *cq* as character spans.

    The tokens' coarse POS tags are serialised as ``"0::POS,1::POS,..."`` and
    scanned with three patterns:

    * an optional run of PART/VERB tokens ending in a VERB;
    * a PART/VERB run, then one or more ADJ/ADV tokens, then an ADP;
    * a PART/VERB run directly followed by an ADP.

    Args:
        cq: The (usually already EC-marked) question text.

    Returns:
        A list of ``(begin, end)`` character spans with contained spans
        removed, sorted by position in the text.
    """
    doc = nlp(cq)
    pos_text = ",".join(f"{i}::{t.pos_}" for i, t in enumerate(doc))
    regexes = [
        r"([0-9]+::(PART|VERB),?)*([0-9]+::VERB)",
        r"([0-9]+::(PART|VERB),?)+([0-9]+::AD(J|V),)+([0-9]+::ADP)",
        r"([0-9]+::(PART|VERB),?)+([0-9]+::ADP)",
    ]
    spans = []
    for rx in regexes:
        for m in re.finditer(rx, pos_text):
            spans.append(_get_pc_span(m.group(), doc))
    # Matches are gathered regex by regex, so a span found by a later pattern
    # can precede one found by an earlier pattern. Callers replace spans with
    # running-offset arithmetic, which needs left-to-right order.
    return sorted(_reject_subspans(spans))


def extract_PC_chunks(cq: str) -> str:
    """Replace property chunks with numbered PC placeholders.

    Candidate spans come from ``get_PCs_as_spans``. A span whose lower-cased
    text is one of the bare verbs in the rejection list ("is", "do", "have",
    "can", ...) is left untouched and does not consume a number.

    Args:
        cq: The question text (normally after EC marking).

    Returns:
        *cq* with the accepted spans replaced by ``PC1``, ``PC2``, ...
    """
    rejecting_pc = {"is","are","was","were","do","does","did","have","had","can","could"}
    offset, counter = 0, 1
    # The running offset is only correct when spans are processed from left
    # to right, so enforce that order here rather than trusting the source.
    for begin, end in sorted(get_PCs_as_spans(cq)):
        if cq[begin - offset : end - offset].lower() in rejecting_pc:
            continue
        cq, offset = _mark_chunk(cq, [(begin, end)], "PC", offset, counter)
        counter += 1
    return cq


def to_cnl_pattern(question: str) -> str:
    """Convert *question* into its abstract CNL pattern.

    Entity chunks are marked first, then property chunks; finally every run
    of whitespace is collapsed to one space and the ends are trimmed.
    """
    with_entities = extract_EC_chunks(question)
    with_properties = extract_PC_chunks(with_entities)
    collapsed = re.sub(r"\s+", " ", with_properties)
    return collapsed.strip()


def load_questions(path: Path, column: str = "question") -> pd.DataFrame:
    """Read the questions CSV and return a one-column frame named ``question``.

    Args:
        path: Location of the CSV file.
        column: Name of the column that holds the questions.

    Returns:
        A DataFrame containing only that column, renamed to ``"question"``.

    Raises:
        KeyError: If *column* is not present in the file.
    """
    frame = pd.read_csv(path)
    if column in frame.columns:
        selected = frame[[column]]
        return selected.rename(columns={column: "question"})
    raise KeyError(f"Column '{column}' not found in {path}.")


def load_templates(path: Path) -> pd.DataFrame:
    """Read the template library from a header-less, semicolon-separated file.

    Returns:
        A DataFrame whose two columns are named ``ID`` and ``pattern``.
    """
    column_names = ["ID", "pattern"]
    library = pd.read_csv(path, sep=";", header=None, names=column_names)
    return library


def attach_templates(df_q: pd.DataFrame, templates: pd.DataFrame) -> pd.DataFrame:
    """Compute each question's pattern and left-join it with *templates*.

    A ``pattern`` column is added to *df_q* itself (the input frame is
    modified in place). The returned frame is the left merge on ``pattern``,
    with the merge indicator stored in a column named ``Exist``.
    """
    patterns = df_q["question"].apply(to_cnl_pattern)
    df_q["pattern"] = patterns
    merged = df_q.merge(templates, how="left", on="pattern", indicator="Exist")
    return merged


def main():
    """Command-line entry point: parse options, build patterns, write the CSV."""
    option_specs = (
        ("--questions", {"required": True, "help": "CSV file containing research questions (column: 'question')."}),
        ("--templates", {"default": "CLaROv2.csv", "help": "CLaRO template CSV."}),
        ("--output", {"default": "questions_with_templates.csv", "help": "Output CSV path."}),
    )
    cli = argparse.ArgumentParser(description="Generate CNL templates from research questions CSV.")
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    # Load both inputs, join them on the generated pattern, and persist.
    questions = load_questions(Path(options.questions))
    library = load_templates(Path(options.templates))
    result = attach_templates(questions, library)
    result.to_csv(options.output, index=False)
    print(f"✅  Saved {len(result)} rows → {options.output}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
