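In notebooks, use `!pip install package_name` (or, preferably, the `%pip install package_name` magic, which installs into the environment of the running kernel). In the terminal, use `pip install package_name`.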

- `!pip` is used **inside Jupyter notebooks** or IPython environments to run shell commands (like installing packages) from a code cell.
- `pip` (without the exclamation mark) is used **in the terminal/command prompt** or in shell scripts, not inside notebook cells.



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
usage: ipykernel_launcher.py [-h] --input_dir INPUT_DIR [--model MODEL]
                             [--ontology ONTOLOGY] [--output_csv OUTPUT_CSV]
                             [--output_json OUTPUT_JSON] [--jobs JOBS]
ipykernel_launcher.py: error: the following arguments are required: --input_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

# Read the ontology definition. `yaml.safe_load` returns None for an empty
# document, so fall back to an empty mapping.
with open(ONTOLOGY_PATH, "r", encoding="utf-8") as f:
    ont = yaml.safe_load(f) or {}

# Load model once & expand ontology
w2v_parent = Word2Vec.load(MODEL_PATH)
ont_expanded = expand_ontology(w2v_parent, ont)

# The priority orders are the key orders of the two ontology sections.
# `or {}` (rather than a `.get` default) also covers a section that is present
# but empty, which parses to None and would raise AttributeError on `.keys()`.
kpi_priority_order = list(ont.get("kpi_resolution") or {})
input_kpi_priority_order = list(ont.get("input_resolution") or {})

def load_jsons(dirpath):
    """Lazily yield ``(filename, parsed_json)`` for each ``*.json`` entry in *dirpath*.

    Entries are visited in the order ``os.listdir`` returns them; names that
    do not end in ``.json`` are skipped. Files are read as UTF-8.
    """
    for entry in os.listdir(dirpath):
        if not entry.endswith(".json"):
            continue
        full_path = os.path.join(dirpath, entry)
        with open(full_path, "r", encoding="utf-8") as handle:
            yield entry, json.load(handle)

# Local import: BrokenProcessPool is what a future raises when its worker
# process died (or never started), as opposed to `worker` itself raising.
from concurrent.futures.process import BrokenProcessPool

# Materialise the inputs so the progress bar knows the total up front.
items = list(load_jsons(INPUT_DIR))
tasks = [
    (fname, doc, ont_expanded, kpi_priority_order, input_kpi_priority_order)
    for fname, doc in items
]

rows = []          # (filename, result) pairs from successful workers
serial_queue = []  # tasks that must be run in this process

run_parallel = bool(JOBS and JOBS > 1)

if run_parallel:
    with cf.ProcessPoolExecutor(max_workers=JOBS) as ex:
        # Map each future back to its task so a failure can name the file and,
        # when the pool itself broke, the task can be retried instead of lost.
        future_to_task = {}
        for t in tasks:
            try:
                future_to_task[ex.submit(worker, t)] = t
            except BrokenProcessPool:
                # The pool died while tasks were still being submitted.
                serial_queue.append(t)

        for fut in tqdm(cf.as_completed(future_to_task), total=len(future_to_task), desc="Classifying"):
            task = future_to_task[fut]
            try:
                fname, res = fut.result()
            except BrokenProcessPool:
                # Pool-level failure, not an error in this document: retry below.
                serial_queue.append(task)
            except Exception as e:
                print(f"[WARN] worker failed for {task[0]}: {e}")
            else:
                rows.append((fname, res))

    if serial_queue:
        # NOTE(review): a broken pool is typical when `worker` is defined in an
        # interactive session and child processes cannot import it -- confirm
        # for this environment. Moving `worker` into an importable module
        # would restore parallelism.
        print(
            f"[WARN] process pool terminated abruptly; "
            f"running {len(serial_queue)} task(s) in this process instead."
        )
else:
    serial_queue = tasks

if serial_queue or not run_parallel:
    for t in tqdm(serial_queue, total=len(serial_queue), desc="Classifying"):
        try:
            fname, res = worker(t)
        except Exception as e:
            print(f"[WARN] worker failed for {t[0]}: {e}")
        else:
            rows.append((fname, res))

# Column order of the output CSV. Passing it to the DataFrame explicitly means
# an empty result set (e.g. every worker failed) still yields a frame with a
# "file" column, so `sort_values("file")` does not raise KeyError and the CSV
# is written with its header row.
csv_columns = [
    "file",
    "paradigms",
    "model_hits",
    "scale",
    "data_types",
    "applications",
    "kpis_all",
    "kpi_primary",
    "collected_data_resolution",
    "sampling_mentions",
    "kpi_types",
    "model_development",
    "model_inputs",
    "input_resolutions_all",
    "input_resolution_primary",
]

# Flatten each per-file result into one CSV row. Collection-valued fields go
# through `to_str`; scalar fields fall back to "" when falsy.
out_rows = []
for fname, r in rows:
    out_rows.append({
        "file": fname,
        "paradigms": to_str(r["paradigms"]),
        "model_hits": to_str(r["model_hits"]),
        "scale": to_str(r["scale"]),
        "data_types": to_str(r["data_types"]),
        "applications": to_str(r["applications"]),
        "kpis_all": to_str(r["kpis"]),
        "kpi_primary": r["kpi_primary"] or "",
        "collected_data_resolution": r["collected_data_resolution"] or "",
        # One "section:valueunit" token per sampling mention.
        "sampling_mentions": to_str([f'{h["section"]}:{h["value"] or ""}{h["unit"]}' for h in r["sampling_mentions"]]),
        # These keys are read with .get(), i.e. they may be absent from a result.
        "kpi_types": to_str(r.get("kpi_types")),
        "model_development": to_str(r.get("model_development")),
        "model_inputs": to_str(r.get("model_inputs")),
        "input_resolutions_all": to_str(r.get("input_resolutions")),
        "input_resolution_primary": r.get("input_resolution_primary") or "",
    })

df = pd.DataFrame(out_rows, columns=csv_columns).sort_values("file")
df.to_csv(OUTPUT_CSV, index=False)
print(f"[OK] Wrote {OUTPUT_CSV} with {len(df)} rows.")

# Optionally dump the unflattened per-file results as one JSON object keyed
# by file name.
if OUTPUT_JSON:
    ev = dict(rows)
    with open(OUTPUT_JSON, "w", encoding="utf-8") as json_out:
        json.dump(ev, json_out, ensure_ascii=False, indent=2)
    print(f"[OK] Wrote evidence JSON → {OUTPUT_JSON}")


Classifying: 100%|██████████| 548/548 [00:00<00:00, 47553.09it/s]


[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminated abruptly while the future was running or pending.
[WARN] worker failed: A process in the process pool was terminat

import os

output_csv = "nlp_output/classified_papers.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)

print(f"[OK] Wrote {output_csv} with {len(df)} rows")