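In notebooks, install packages with `%pip install package_name` (preferred) or `!pip install package_name`. In the terminal, use `pip install package_name`.

- `%pip` and `!pip` are used **inside Jupyter notebooks** or IPython environments from a code cell. `%pip` is an IPython magic that installs into the environment of the running kernel; `!pip` runs `pip` as a shell command, which may target a different Python environment than the one the kernel uses.
- `pip` (without a prefix) is used **in the terminal/command prompt** or in scripts, not inside notebook cells.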



In [1]:
import os, json, yaml
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
import classify_papers as cp
# Load the ontology definition. An empty YAML document parses to None, so fall
# back to an empty mapping.
with open("ontology.yaml", "r", encoding="utf-8") as f:
    ont = yaml.safe_load(f) or {}

# NOTE(review): gensim model files are pickle-based as far as I know, so only
# load a model file that comes from a trusted source.
w2v = Word2Vec.load("word2vec_model.model")
ont_expanded = cp.expand_ontology(w2v, ont)

# The priority lists are the keys of the raw (non-expanded) ontology sections,
# in the order they appear in the YAML file.
# `or {}` also covers a section that is present but left empty in the YAML:
# such a key maps to None, and `.get(key, {})` would return that None instead
# of the default, failing on `.keys()`.
kpi_priority = list(ont.get("kpi_resolution") or {})
input_priority = list(ont.get("input_resolution") or {})

input_dir = "papers_json"
results = []

# Sort the directory listing so the row order of `df` is reproducible between
# runs (os.listdir returns entries in arbitrary order), and filter before
# handing the list to tqdm so the progress total counts only the JSON files
# that are actually processed.
paper_files = sorted(
    name for name in os.listdir(input_dir) if name.endswith(".json")
)

for fname in tqdm(paper_files, desc="Classifying papers"):
    with open(os.path.join(input_dir, fname), "r", encoding="utf-8") as f:
        doc = json.load(f)
    res = cp.analyze_paper(doc, ont_expanded, kpi_priority, input_priority)
    # One row per paper: the file name plus every field analyze_paper returned.
    results.append({"file": fname, **res})

# Build the frame once from the accumulated records and preview the first rows.
df = pd.DataFrame(results)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\s2589602\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Classifying papers: 100%|██████████| 27/27 [30:47<00:00, 68.41s/it]


Unnamed: 0,file,paradigms,paradigm_scores,model_hits,model_evidence,scale,scale_evidence,data_types,data_evidence,sampling_mentions,...,collected_data_resolution,kpi_types,kpi_type_evidence,model_development,model_development_evidence,model_inputs,input_resolutions,input_resolution_primary,model_inputs_evidence,input_resolution_evidence
0,10.1016_j.scs.2021.102792.json,[blackbox],"{'whitebox': 0.0, 'greybox': 0.0, 'blackbox': ...","{'neural network': 0.5, 'recurrent': 0.5}","{'neural network': [('preamble', 'neural netwo...","[building, climate, occupancy, system]","{'building': [('preamble', 'aggregate', 'll up...","[air_quality, air_temperature, environment, hv...","{'metering': [('preamble', 'kw', ' s c h . , m...","[{'section': 'preamble', 'value': '102681', 'u...",...,subhourly,[],{},"[conditional, iteratively, probabilistic, real...","{'conditional': [('preamble', 'conditional', '...","[electric_load, electric_use, internal_gains, ...","[annual, daily, hourly, monthly, subhourly, ye...",subhourly,"{'occupancy': [('preamble', 'absence', 't of t...","{'subhourly': [('preamble', '30-minute', 'g. 7..."
1,10.1016_j.scs.2021.102816.json,"[whitebox, blackbox]","{'whitebox': 3.0, 'greybox': 0.0, 'blackbox': ...","{'nonlinear processes': 0.5, 'open-source': 0....","{'nonlinear processes': [('preamble', 'nonline...","[building, occupancy, system]","{'building': [('preamble', 'aggregated', '48:2...","[air_quality, air_temperature, hvac_signals, m...","{'metering': [('preamble', 'nominal', 'lothing...","[{'section': 'preamble', 'value': '2777', 'uni...",...,subhourly,[],{},"[multivariate, raw data, real-time, setpoints,...","{'multivariate': [('preamble', 'multivariate',...","[electric_load, electric_use, internal_gains, ...","[annual, daily, hourly, monthly, subhourly, ye...",subhourly,"{'occupancy': [('preamble', 'occupancy', 'he c...","{'subhourly': [('preamble', 'every', ' 2014 1 ..."
2,10.1016_j.scs.2021.102832.json,"[whitebox, blackbox]","{'whitebox': 2.0, 'greybox': 0.0, 'blackbox': ...","{'control-oriented': 0.5, 'energyplus': 0.5, '...","{'control-oriented': [('preamble', 'control-or...","[building, climate, occupancy, system]","{'building': [('preamble', 'aggregate', '34 8....","[air_quality, air_temperature, environment, hv...","{'metering': [('preamble', 'kw', '00 %-on and ...","[{'section': 'preamble', 'value': '6521', 'uni...",...,subhourly,[],{},"[deterministic, markov chain, plug loads, real...","{'deterministic': [('preamble', 'deterministic...","[electric_load, electric_use, internal_gains, ...","[annual, daily, hourly, monthly, subhourly, ye...",subhourly,"{'occupancy': [('preamble', 'absence', 'reas c...","{'subhourly': [('preamble', 'every', 'ation no..."
3,10.1016_j.scs.2021.102972.json,[blackbox],"{'whitebox': 0.0, 'greybox': 0.0, 'blackbox': ...",{'data-driven': 0.5},"{'data-driven': [('preamble', 'data-driven', '...","[building, climate, occupancy, system]","{'building': [('preamble', 'aggregated', '19:2...","[air_quality, air_temperature, environment, hv...","{'metering': [('preamble', 'kva', 'c h ev disc...","[{'section': 'preamble', 'value': '101149', 'u...",...,subhourly,[],{},"[markov chain, real-time, schedule, scheduled,...","{'markov chain': [('preamble', 'markov chain',...","[electric_load, electric_use, internal_gains, ...","[daily, hourly, monthly, subhourly]",subhourly,"{'occupancy': [('preamble', 'absence', ' throu...","{'subhourly': [('preamble', '5-minute', ' cont..."
4,10.1016_j.scs.2021.103001.json,[blackbox],"{'whitebox': 0.0, 'greybox': 0.0, 'blackbox': ...","{'decision trees': 0.5, 'k-nearest': 0.5, 'knn...","{'decision trees': [('preamble', 'decision tre...","[building, climate, occupancy]","{'building': [('preamble', 'aggregate', 'lker ...","[air_quality, hvac_signals, metering, occupancy]","{'metering': [('preamble', 'kw', '87% of the c...","[{'section': 'preamble', 'value': '101787', 'u...",...,subhourly,[],{},"[bayesian, conditional, multivariate, probabil...","{'bayesian': [('preamble', 'bayesian', 'onal p...","[electric_load, electric_use, internal_gains, ...","[annual, daily, hourly, monthly, subhourly, ye...",subhourly,"{'occupancy': [('preamble', 'occupancy', ' g h...","{'subhourly': [('preamble', 'every', 'ime fram..."


In [2]:
# Persist the classification table to disk. The row index is omitted from the
# file (index=False).
output_csv = "classified_papers.csv"
df.to_csv(path_or_buf=output_csv, index=False)

# Report how many rows were written as a quick sanity check.
row_count = len(df)
print(f"[OK] Wrote {output_csv} with {row_count} rows")

[OK] Wrote classified_papers.csv with 27 rows
