In [2]:
# Colab-only step: mount Google Drive into the runtime so that files stored on Drive
# become reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys

# Print the current working directory (before switching)
print("Current directory:", os.getcwd())

# Change directory to your target folder on Google Drive.
# Relative paths used further down (e.g. REPO_ROOT = Path(".")) resolve against this folder,
# so this cell has to run before the setup cell. os.chdir raises if the folder does not exist
# (for example when Drive has not been mounted yet).
os.chdir("/content/drive/My Drive/Colab Notebooks/Survey Paper/Neuro-Symbolic Multi-objective RL/")
print("New directory:", os.getcwd())

Current directory: /content
New directory: /content/drive/My Drive/Colab Notebooks/Survey Paper/Neuro-Symbolic Multi-objective RL


In [5]:
# ===== 0) Setup + config =====
from __future__ import annotations

import os, sys, re, json, time, hashlib
import html
import unicodedata
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from pathlib import Path

import requests
import pandas as pd
import matplotlib.pyplot as plt

# --- Project folder layout (every path hangs off REPO_ROOT) ---
REPO_ROOT = Path(".")  # resolved against the current working directory; in Colab, chdir into the mounted repo first
DATA_RAW = REPO_ROOT.joinpath("data", "raw")
DATA_INTERIM = REPO_ROOT.joinpath("data", "interim")
DATA_PROCESSED = REPO_ROOT.joinpath("data", "processed")
OUTPUTS_FIG = REPO_ROOT.joinpath("outputs", "figures")
OUTPUTS_TABLES = REPO_ROOT.joinpath("outputs", "tables")
SCREENING_DIR = REPO_ROOT.joinpath("screening")
BIB_DIR = REPO_ROOT.joinpath("bib")

# Create the whole layout up front so later cells can write files without checking first.
for p in (
    DATA_RAW,
    DATA_INTERIM,
    DATA_PROCESSED,
    OUTPUTS_FIG,
    OUTPUTS_TABLES,
    SCREENING_DIR,
    BIB_DIR,
):
    p.mkdir(parents=True, exist_ok=True)

# --- Global knobs ---
YEAR_THRESHOLD = 1980  # the retrieval functions drop records published before this year

USE_VENUE_FILTER = False  # recommended OFF for product-design cross-domain surveys


# Venues: Conf, Journal
Collection of about 120 top CS journals and conferences, with the venue-name variants under which each one may appear.


In [6]:
# Mapping from a canonical venue label to the name variants (abbreviation and/or full title)
# under which that conference or journal may appear in DBLP venue strings.
#
# Every key must be unique: Python silently keeps only the last value when a key is repeated
# inside a dict literal, so a duplicated "PACMHCI" entry has been removed (both copies were
# identical, the resulting dict is unchanged).
#
# NOTE(review): several short aliases ("IS", "EC", "DS", "CI", "IV", "Science", "Nature") will
# produce false positives if they are ever matched as substrings - confirm the matching strategy.
# NOTE(review): a few labels do not look like the customary abbreviation of the title they are
# paired with ("IEEE TCSI", "IEEE TC", "IEEE TETC") - verify before relying on the short forms.
VENUE_VARIANTS = {
    "ICML": ["ICML", "International Conference on Machine Learning"],
    "NIPS": ["NIPS", "NeurIPS", "Advances in Neural Information Processing Systems"],
    "ICLR": ["ICLR", "International Conference on Learning Representations"],
    "AAAI": ["AAAI", "AAAI Conference on Artificial Intelligence"],
    "IJCAI": ["IJCAI", "International Joint Conference on Artificial Intelligence"],
    "COLT": ["COLT", "Conference on Learning Theory"],
    "AISTATS": ["AISTATS", "International Conference on Artificial Intelligence and Statistics"],
    "JMLR": ["JMLR", "Journal of Machine Learning Research"],
    "Machine Learning": ["Machine Learning"],
    "JAIR": ["JAIR", "Journal of Artificial Intelligence Research"],
    "Frontiers in AI": ["Frontiers in AI", "Frontiers in Artificial Intelligence"],
    "AI Magazine": ["AI Magazine"],
    "KDD": ["KDD", "Knowledge Discovery and Data Mining"],
    "SIGIR": ["SIGIR", "ACM SIGIR Conference on Research and Development in Information Retrieval"],
    "CIKM": ["CIKM", "ACM International Conference on Information and Knowledge Management"],
    "ICDM": ["ICDM", "IEEE International Conference on Data Mining"],
    "ECMLPKDD": ["ECMLPKDD", "European Conference on Machine Learning", "Principles and Practice of Knowledge Discovery in Databases"],
    "DMKD": ["DMKD", "Data Mining and Knowledge Discovery"],
    "ACM TKDD": ["ACM TKDD", "ACM Transactions on Knowledge Discovery from Data"],
    "DKE": ["DKE", "Data & Knowledge Engineering"],
    "BDR": ["BDR", "Big Data Research"],
    "BigData": ["IEEE BigData", "IEEE International Conference on Big Data"],
    "CVPR": ["CVPR", "IEEE Conference on Computer Vision and Pattern Recognition"],
    "ICCV": ["ICCV", "International Conference on Computer Vision"],
    "ECCV": ["ECCV", "European Conference on Computer Vision"],
    "BMVC": ["BMVC", "British Machine Vision Conference"],
    "IJCV": ["IJCV", "International Journal of Computer Vision"],
    "CVIU": ["CVIU", "Computer Vision and Image Understanding"],
    "TIP": ["TIP", "IEEE Transactions on Image Processing"],
    "TSP": ["TSP", "IEEE Transactions on Signal Processing"],
    "IEEE TCSI": ["IEEE TCSI", "IEEE Transactions on Circuits and Systems for Video Technology"],
    "ACL": ["ACL", "Annual Meeting of the Association for Computational Linguistics"],
    "NAACL": ["NAACL", "North American Chapter of the Association for Computational Linguistics"],
    "EMNLP": ["EMNLP", "Conference on Empirical Methods in Natural Language Processing"],
    "COLING": ["COLING", "International Conference on Computational Linguistics"],
    "RECSYS": ["RECSYS", "ACM Conference on Recommender Systems"],
    "CHI": ["CHI", "Conference on Human Factors in Computing Systems"],
    "PACMHCI": ["PACMHCI", "Proceedings of the ACM on Human-Computer Interaction"],
    "UbiComp": ["UbiComp", "ACM International Joint Conference on Pervasive and Ubiquitous Computing"],
    "MobiCom": ["MobiCom", "ACM International Conference on Mobile Computing and Networking"],
    "MobiSys": ["MobiSys", "International Conference on Mobile Systems, Applications, and Services"],
    "WWW": ["WWW", "World Wide Web", "The Web Conference"],
    "WSDM": ["WSDM", "Web Search and Data Mining"],
    "ICWSM": ["ICWSM", "International Conference on Web and Social Media"],
    "ICRA": ["ICRA", "IEEE International Conference on Robotics and Automation"],
    "IROS": ["IROS", "IEEE/RSJ International Conference on Intelligent Robots and Systems"],
    "RSS": ["RSS", "Robotics: Science and Systems"],
    "ACM TOIS": ["ACM TOIS", "ACM Transactions on Information Systems"],
    "ACM CSUR": ["ACM CSUR", "ACM Computing Surveys"],
    "TOCS": ["TOCS", "ACM Transactions on Computer Systems"],
    "TOMS": ["TOMS", "ACM Transactions on Mathematical Software"],
    "ACM TODS": ["ACM TODS", "ACM Transactions on Database Systems"],
    "IS": ["IS", "Information Systems"],
    "DSS": ["DSS", "Decision Support Systems"],
    "MISQ": ["MISQ", "MIS Quarterly"],
    "ISR": ["ISR", "Information Systems Research"],
    "JMIS": ["JMIS", "Journal of Management Information Systems"],
    "JASIST": ["JASIST", "Journal of the Association for Information Science and Technology"],
    "IEEE TPAMI": ["IEEE TPAMI", "IEEE Transactions on Pattern Analysis and Machine Intelligence"],
    "IEEE TKDE": ["IEEE TKDE", "IEEE Transactions on Knowledge and Data Engineering"],
    "IEEE TNNLS": ["IEEE TNNLS", "IEEE Transactions on Neural Networks and Learning Systems"],
    "IEEE TC": ["IEEE TC", "IEEE Transactions on Cybernetics"],
    "IEEE TETC": ["IEEE TETC", "IEEE Transactions on Emerging Topics in Computational Intelligence"],
    "IEEE TFS": ["IEEE TFS", "IEEE Transactions on Fuzzy Systems"],
    "IEEE TEC": ["IEEE TEC", "IEEE Transactions on Evolutionary Computation"],
    "IEEE Intelligent Systems": ["IEEE Intelligent Systems"],
    "IEEE Access": ["IEEE Access"],
    "FSS": ["FSS", "Fuzzy Sets and Systems"],
    "Swarm Intelligence": ["Swarm Intelligence"],
    "ALife": ["ALife", "Artificial Life"],
    "EC": ["EC", "Evolutionary Computation"],
    "ACM TIST": ["ACM TIST", "ACM Transactions on Intelligent Systems and Technology"],
    "ACM TORS": ["ACM TORS", "ACM Transactions on Recommender Systems"],
    "ACM TOMM": ["ACM TOMM", "ACM Transactions on Multimedia Computing, Communications, and Applications"],
    "Neurocomputing": ["Neurocomputing"],
    "KAIS": ["KAIS", "Knowledge and Information Systems"],
    "IPM": ["IPM", "Information Processing & Management"],
    "Information Fusion": ["Information Fusion"],
    "PRL": ["PRL", "Pattern Recognition Letters"],
    "PAA": ["PAA", "Pattern Analysis and Applications"],
    "IJDSA": ["IJDSA", "International Journal of Data Science and Analytics"],
    "TVCG": ["TVCG", "IEEE Transactions on Visualization and Computer Graphics"],
    "SIGGRAPH": ["SIGGRAPH", "ACM SIGGRAPH Conference"],
    "IUI": ["IUI", "ACM International Conference on Intelligent User Interfaces"],
    "HRI": ["HRI", "ACM/IEEE International Conference on Human-Robot Interaction"],
    "ICMI": ["ICMI", "International Conference on Multimodal Interaction"],
    "ICMLA": ["ICMLA", "International Conference on Machine Learning and Applications"],
    "IJCNN": ["IJCNN", "International Joint Conference on Neural Networks"],
    "FAccT": ["FAccT", "ACM Conference on Fairness, Accountability, and Transparency"],
    "EAAI": ["EAAI", "Engineering Applications of Artificial Intelligence"],
    "ACM SIGMOD": ["ACM SIGMOD", "ACM SIGMOD International Conference on Management of Data"],
    "IET": ["IET", "IET Intelligent Transport Systems"],
    "IV": ["IV", "Intelligent Vehicles"],
    "SAS": ["SAS", "IEEE/ACM International Conference on Social Computing"],
    "ICIS": ["ICIS", "International Conference on Information Systems"],
    "ECIS": ["ECIS", "European Conference on Information Systems"],
    "Nature": ["Nature"],
    "Science": ["Science"],
    "Nature Machine Intelligence": ["Nature Machine Intelligence"],
    "Nature Methods": ["Nature Methods"],
    "Nature Biotechnology": ["Nature Biotechnology"],
    "Nature Communications": ["Nature Communications"],
    "PNAS": ["PNAS", "Proceedings of the National Academy of Sciences"],
    # Additional Emerging & Specialized Venues (Springer, Elsevier, MDPI, IOS Press, etc.)
    # NOTE(review): this entry lists the same variants as "EAAI" above - consider merging them.
    "Engineering Applications of AI": ["Engineering Applications of Artificial Intelligence", "EAAI"],
    "Expert Systems": ["Expert Systems", "Expert Systems with Applications"],
    "IOS Press": ["IOS Press"],  # Placeholder for IOS Press titles
    "MDPI Sensors": ["MDPI Sensors"],
    "Springer Journal of Data Science": ["Springer Journal of Data Science"],
    "ACM TCPS": ["ACM TCPS", "ACM Transactions on Cyber-Physical Systems"],
    "IJI": ["IJI", "International Journal of Intelligent Systems"],
    "Frontiers in Neuroscience": ["Frontiers in Neuroscience"],
    "Frontiers in Robotics and AI": ["Frontiers in Robotics and AI"],
    # Applied, industrial, and design venues
    "IEEE IT": ["IEEE IT", "IEEE International Conference on Industrial Technology"],
    "IEEM": ["IEEM", "IEEE International Conference on Industrial Engineering and Engineering Management"],
    "CIRP": ["CIRP", "CIRP Annals", "CIRP Annals - Manufacturing Technology"],
    "IJPR": ["IJPR", "International Journal of Production Research"],
    "JMS": ["JMS", "Journal of Manufacturing Systems"],
    "CI": ["CI", "Computers in Industry"],
    "IEEE TASE": ["IEEE TASE", "IEEE Transactions on Automation Science and Engineering"],
    "DS": ["DS", "Design Studies"],
    "IJD": ["IJD", "International Journal of Design"],
    "IPE": ["IPE", "International Journal of Production Economics"],
    "JIM": ["JIM", "Journal of Intelligent Manufacturing"],
    "JCP": ["JCP", "Journal of Cleaner Production"],
    "RCR": ["RCR", "Resources, Conservation and Recycling"],
    "EJOR": ["EJOR", "European Journal of Operational Research"],
    "CIE": ["CIE", "Computers & Industrial Engineering"],
    "PPC": ["PPC", "Production Planning & Control"]
}

In [7]:
# ===== 1) Helpers: normalization + IDs =====

def normalize_whitespace(s: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends.

    Falsy input (``None`` or ``""``) is treated as the empty string, so the
    function always returns a ``str``.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def normalize_title(s: str) -> str:
    """Reduce a title to a canonical key used for title-based matching.

    Steps, in order:
      1. HTML entities are decoded ("&amp;" -> "&") so that they do not leave
         residue such as "amp" once punctuation is stripped.
      2. Accented letters are folded to their base letter (NFKD decomposition,
         combining marks dropped), so "Schrödinger" and "Schrodinger" produce
         the same key instead of the accented letter simply disappearing.
      3. The text is lower-cased; LaTeX braces, ``$`` and backslashes are removed.
      4. Every remaining character that is not ``a-z``, ``0-9`` or whitespace is
         removed (so "neuro-symbolic" and "neurosymbolic" match).
      5. Whitespace runs are collapsed and the result is trimmed.

    Args:
        s: Raw title; ``None`` or ``""`` yields ``""``.

    Returns:
        The normalised, ASCII-only, lower-case title.
    """
    s = html.unescape(s or "")
    # Decompose accented characters and drop the combining marks.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    # remove latex/braces and punctuation-ish
    s = re.sub(r"[{}$\\]", "", s)
    s = re.sub(r"[^a-z0-9\s]", "", s)
    return re.sub(r"\s+", " ", s).strip()

def extract_doi(text: str) -> Optional[str]:
    """Return the first DOI found in ``text`` (lower-cased), or ``None``.

    BibTeX sources escape characters such as ``_`` as ``\\_``. Without undoing
    that escaping the match would stop at the backslash and return a truncated
    DOI (e.g. every chapter of a book would share the book-level prefix), so
    LaTeX escapes are removed before matching.

    Args:
        text: Arbitrary text, typically a BibTeX record or a URL.

    Returns:
        The DOI in lower case, or ``None`` when ``text`` is empty or contains no DOI.
    """
    if not text:
        return None
    # Undo LaTeX escaping of special characters (\_ \% \& \# \$).
    unescaped = re.sub(r"\\([_%&#$])", r"\1", text)
    # permissive DOI regex; the trailing \b keeps closing punctuation out of the match
    m = re.search(r"\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b", unescaped)
    return m.group(0).lower() if m else None

def safe_int(x) -> Optional[int]:
    """Convert ``x`` with ``int()``; return ``None`` if the conversion raises."""
    try:
        converted = int(x)
    except Exception:
        converted = None
    return converted

def make_paper_id(doi: Optional[str], arxiv_id: Optional[str], s2_id: Optional[str],
                  title: str, year: Optional[int]) -> str:
    """
    Build a stable identifier for a paper.

    Stable ID priority:
    DOI > arXiv > SemanticScholar paperId > normalized title+year

    Identifier arguments are sanitised first: ``None``, NaN/NA, blank strings and the
    textual placeholders "none"/"nan"/"null"/"<na>" (which appear when a frame has been
    stringified) are all treated as missing. Without this, a truthy NaN or the string
    "None" would yield ids such as ``doi:nan`` and make unrelated papers share one id.
    DOIs are lower-cased because they are case-insensitive; arXiv ids are lower-cased too.

    Args:
        doi: DOI, if known.
        arxiv_id: arXiv identifier, if known.
        s2_id: Semantic Scholar paperId, if known.
        title: Paper title, used only when no identifier is available.
        year: Publication year, used together with the title.

    Returns:
        One of ``doi:<doi>``, ``arxiv:<id>``, ``s2:<id>`` or ``tpy:<16 hex chars>``.
    """
    def _clean_identifier(value) -> Optional[str]:
        # Normalise one identifier to a non-empty string, or None when missing.
        if value is None:
            return None
        if not isinstance(value, str):
            if pd.isna(value):
                return None
            value = str(value)
        value = value.strip()
        if not value or value.lower() in {"none", "nan", "null", "<na>"}:
            return None
        return value

    doi = _clean_identifier(doi)
    arxiv_id = _clean_identifier(arxiv_id)
    s2_id = _clean_identifier(s2_id)

    if doi:
        return f"doi:{doi.lower()}"
    if arxiv_id:
        return f"arxiv:{arxiv_id.lower()}"
    if s2_id:
        return f"s2:{s2_id}"
    # Last resort: hash of the normalised title plus year.
    key = f"{normalize_title(title)}::{year or ''}"
    return "tpy:" + hashlib.sha1(key.encode("utf-8")).hexdigest()[:16]


## 2) Retrieval: DBLP

In [8]:
# ===== 2) Retrieval: DBLP =====

DBLP_SEARCH_URL = "https://dblp.org/search/publ/api"

def dblp_search(query: str, start: int = 0, hits: int = 100) -> List[dict]:
    """Fetch one page of DBLP publication-search results.

    Args:
        query: Search string, sent as the ``q`` parameter.
        start: Offset of the first hit, sent as ``f``.
        hits: Maximum number of hits requested, sent as ``h``.

    Returns:
        The list found under ``result.hits.hit`` in the JSON response, or ``[]``
        when that entry is absent or empty.

    Raises:
        requests.HTTPError: for a non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(
        DBLP_SEARCH_URL,
        params={"q": query, "format": "json", "h": hits, "f": start},
        timeout=60,
    )
    response.raise_for_status()
    payload = response.json()
    hit_container = payload.get("result", {}).get("hits", {})
    found = hit_container.get("hit", [])
    return found if found else []

def dblp_fetch_all(query: str, hits: int = 100, max_pages: int = 5, sleep_s: float = 1.0) -> List[dict]:
    """Page through DBLP search results for ``query`` and return all hits.

    Paging stops as soon as a page comes back empty or with fewer than ``hits``
    entries, because a short page means the result set is exhausted. This avoids one
    wasted request per query. The politeness delay is applied only between requests,
    not after the last one.

    Args:
        query: Search string passed to ``dblp_search``.
        hits: Page size.
        max_pages: Upper bound on the number of pages requested.
        sleep_s: Seconds to wait between two consecutive requests.

    Returns:
        The concatenated hits of all fetched pages, in order.
    """
    out: List[dict] = []
    for page in range(max_pages):
        if page:
            # Throttle only when another request is actually going to be sent.
            time.sleep(sleep_s)
        batch = dblp_search(query, start=page * hits, hits=hits)
        out.extend(batch)
        if not batch or len(batch) < hits:
            break
    return out

def dblp_get_bibtex(dblp_url: str) -> Optional[str]:
    """Download the BibTeX record that belongs to a DBLP record URL.

    This is a best-effort enrichment step: any failure yields ``None`` instead of an
    exception, so one unreachable record cannot abort a long retrieval run.

    Args:
        dblp_url: Record URL; ``".bib"`` is appended to form the download URL.

    Returns:
        The response text when the request succeeds with status 200 and the body
        starts with ``@``; otherwise ``None`` (empty URL, network error, other
        status code, or a body that is not BibTeX).
    """
    if not dblp_url:
        return None
    bib_url = dblp_url + ".bib"
    try:
        r = requests.get(bib_url, timeout=60)
    except requests.RequestException:
        # Timeouts / connection errors are treated like any other "no BibTeX available".
        return None
    if r.status_code == 200 and r.text.strip().startswith("@"):
        return r.text
    return None

def parse_dblp_hit(hit: dict) -> dict:
    """Convert one DBLP search hit into the pipeline's common record schema.

    Args:
        hit: A single element of the DBLP hit list; the metadata is read from ``hit["info"]``.

    Returns:
        A dict with the keys ``source``, ``title``, ``year``, ``venue``, ``url``,
        ``abstract``, ``doi``, ``arxiv_id``, ``s2_id`` and ``dblp_bibtex``.
        ``abstract``, ``arxiv_id``, ``s2_id`` and ``dblp_bibtex`` are always ``None``.
        ``doi`` is taken from the hit's ``doi`` field when present (lower-cased),
        otherwise it is ``None`` and may be filled in later from the BibTeX record.
    """
    info = hit.get("info", {}) or {}

    def _text(value) -> str:
        # A field may hold a list of strings (e.g. several venues); join before cleaning.
        if isinstance(value, (list, tuple)):
            value = " ".join(str(v) for v in value)
        # HTML entities (e.g. "&amp;") are decoded so titles/venues read naturally.
        return normalize_whitespace(html.unescape(value)) if value else ""

    raw_doi = info.get("doi")
    doi = raw_doi.strip().lower() if isinstance(raw_doi, str) and raw_doi.strip() else None

    return {
        "source": "DBLP",
        "title": _text(info.get("title", "")),
        "year": safe_int(info.get("year")),
        "venue": _text(info.get("venue", "")),
        "url": normalize_whitespace(info.get("url", "")),
        "abstract": None,
        "doi": doi,
        "arxiv_id": None,
        "s2_id": None,
        "dblp_bibtex": None,
    }

def retrieve_from_dblp(queries: Dict[str, List[str]], max_pages: int = 5,
                       fetch_bibtex: bool = True) -> pd.DataFrame:
    """Run every query against DBLP and collect the hits in one DataFrame.

    Args:
        queries: Mapping ``topic bucket -> list of query strings``.
        max_pages: Maximum number of result pages (100 hits each) fetched per query.
        fetch_bibtex: When true (default), the BibTeX record of each hit is downloaded
            and used to fill ``dblp_bibtex`` and, if it contains one, ``doi``.

    Returns:
        One row per (hit, query) with the common record schema plus ``topic_bucket``
        and ``query``. Hits published before ``YEAR_THRESHOLD`` or without a title are
        dropped. When nothing is retrieved, an empty frame with the expected columns is
        returned (instead of failing inside ``drop_duplicates``).
    """
    columns = ["source", "title", "year", "venue", "url", "abstract", "doi",
               "arxiv_id", "s2_id", "dblp_bibtex", "topic_bucket", "query"]
    # The same record is often returned by several queries; download its BibTeX only once.
    bibtex_cache: Dict[str, Optional[str]] = {}
    rows = []
    for topic, qlist in queries.items():
        for q in qlist:
            for h in dblp_fetch_all(q, hits=100, max_pages=max_pages, sleep_s=1.0):
                row = parse_dblp_hit(h)
                row["topic_bucket"] = topic
                row["query"] = q
                # basic filters
                if row["year"] is not None and row["year"] < YEAR_THRESHOLD:
                    continue
                if not row["title"]:
                    continue

                # fetch bibtex (optional but useful for DOI)
                url = row["url"]
                if fetch_bibtex and url:
                    if url not in bibtex_cache:
                        bibtex_cache[url] = dblp_get_bibtex(url)
                    bib = bibtex_cache[url]
                    row["dblp_bibtex"] = bib
                    # try extract DOI from bibtex
                    if bib:
                        doi = extract_doi(bib)
                        if doi:
                            row["doi"] = doi
                rows.append(row)
    if not rows:
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(rows).drop_duplicates(subset=["source", "title", "year", "venue", "url", "query"])
    return df


## 3) Retrieval: Semantic Scholar

In [9]:
# ===== 3) Retrieval: Semantic Scholar =====

S2_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

def s2_search(query: str, offset: int = 0, limit: int = 100, fields: Optional[str] = None,
              max_retries: int = 6, sleep_s: float = 2.0) -> List[dict]:
    """Fetch one page of Semantic Scholar paper-search results.

    Args:
        query: Search string.
        offset: Index of the first result to return.
        limit: Page size.
        fields: Comma-separated field list; a default set is used when ``None``.
        max_retries: Number of attempts made while the API answers HTTP 429.
        sleep_s: Base back-off; attempt ``i`` (0-based) is followed by a wait of
            ``sleep_s * (i + 1)`` seconds before the next attempt.

    Returns:
        The ``data`` list of the JSON response (``[]`` if absent or empty). If every
        attempt is rate-limited, ``[]`` is returned as well, but a warning is printed
        so that a truncated retrieval is not mistaken for "no results".

    Raises:
        RuntimeError: for any status code other than 200 and 429.
    """
    if fields is None:
        fields = "paperId,title,year,venue,abstract,url,externalIds,publicationTypes"
    params = {"query": query, "offset": offset, "limit": limit, "fields": fields}
    for attempt in range(max_retries):
        r = requests.get(S2_URL, params=params, timeout=60)
        if r.status_code == 200:
            return r.json().get("data", []) or []
        if r.status_code == 429:
            # Linear back-off; no point in waiting after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(sleep_s * (attempt + 1))
            continue
        # other errors: stop
        raise RuntimeError(f"S2 API error {r.status_code}: {r.text[:200]}")
    print(f"WARNING: Semantic Scholar still rate-limited (HTTP 429) after {max_retries} attempts; "
          f"returning no results for query={query!r} offset={offset}")
    return []

def s2_fetch_all(query: str, limit: int = 100, max_pages: int = 3) -> List[dict]:
    """Collect up to ``max_pages`` pages of Semantic Scholar results for ``query``.

    Pages are requested at offsets ``0, limit, 2*limit, ...``. Paging ends early when
    a page comes back empty. A one-second pause follows every non-empty page.
    """
    collected = []
    for page_no in range(max_pages):
        page = s2_search(query, offset=page_no * limit, limit=limit)
        if not page:
            break
        collected += page
        time.sleep(1.0)
    return collected

def parse_s2_entry(e: dict) -> dict:
    """Map one Semantic Scholar search result onto the pipeline's common record schema.

    Text fields are whitespace-normalised; an empty abstract becomes ``None``. The DOI
    and arXiv id are read from ``externalIds`` (the DOI is lower-cased when present),
    and ``dblp_bibtex`` is always ``None`` for this source.
    """
    ids = e.get("externalIds", {}) or {}
    doi_value = ids.get("DOI")

    return {
        "source": "SemanticScholar",
        "title": normalize_whitespace(e.get("title", "")),
        "year": safe_int(e.get("year")),
        "venue": normalize_whitespace(e.get("venue", "")),
        "url": normalize_whitespace(e.get("url", "")),
        "abstract": normalize_whitespace(e.get("abstract", "")) or None,
        "doi": doi_value.lower() if doi_value else doi_value,
        "arxiv_id": ids.get("ArXiv"),
        "s2_id": e.get("paperId"),
        "dblp_bibtex": None,
    }

def retrieve_from_s2(queries: Dict[str, List[str]], max_pages: int = 3) -> pd.DataFrame:
    """Run every query against Semantic Scholar and collect the results in one DataFrame.

    Args:
        queries: Mapping ``topic bucket -> list of query strings``.
        max_pages: Maximum number of result pages (100 results each) fetched per query.

    Returns:
        One row per (result, query) with the common record schema plus ``topic_bucket``
        and ``query``. Results published before ``YEAR_THRESHOLD`` or without a title
        are dropped. When nothing is retrieved, an empty frame with the expected
        columns is returned (instead of failing inside ``drop_duplicates``).
    """
    columns = ["source", "title", "year", "venue", "url", "abstract", "doi",
               "arxiv_id", "s2_id", "dblp_bibtex", "topic_bucket", "query"]
    rows = []
    for topic, qlist in queries.items():
        for q in qlist:
            for e in s2_fetch_all(q, limit=100, max_pages=max_pages):
                row = parse_s2_entry(e)
                row["topic_bucket"] = topic
                row["query"] = q
                if row["year"] is not None and row["year"] < YEAR_THRESHOLD:
                    continue
                if not row["title"]:
                    continue
                rows.append(row)
    if not rows:
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(rows).drop_duplicates(subset=["source", "s2_id", "title", "year", "venue", "query"])
    return df


## 4) Stage 1 Run: define queries + pull + save raw

In [None]:
# ===== 4) Stage 1 run: define query buckets + retrieve =====

# Query buckets for this run: neuro-symbolic design/manufacturing, Type-VI style
# differentiable solvers, and safety shielding for reinforcement learning.
# The dict key is the bucket label that the retrieval functions store in the
# `topic_bucket` column; every string in the list is sent as a separate query.
query_buckets = {
    "NS+Design": [
        "neuro-symbolic product design",
        "neurosymbolic CAD generation constraints",
        "neuro-symbolic manufacturing optimization",
        "neuro-symbolic assembly planning"
    ],
    "TypeVI_differentiable_solvers": [
        "differentiable optimization layer OptNet",
        "CVXPYLayers differentiable convex optimization layer",
        "SATNet differentiable satisfiability"
    ],
    "Safety_Shielding": [
        "reinforcement learning shielding temporal logic",
        "safe reinforcement learning shield"
    ],
}

# Network-bound step: both sources are queried with the same buckets,
# at most 3 result pages per query.
df_dblp = retrieve_from_dblp(query_buckets, max_pages=3)
df_s2 = retrieve_from_s2(query_buckets, max_pages=3)

# Persist the untouched retrieval results, one CSV per source, under data/raw.
raw_path_dblp = DATA_RAW / "candidates_dblp_raw.csv"
raw_path_s2 = DATA_RAW / "candidates_s2_raw.csv"

df_dblp.to_csv(raw_path_dblp, index=False)
df_s2.to_csv(raw_path_s2, index=False)

# Record counts per source (before merging and deduplication).
print("DBLP raw:", len(df_dblp), "->", raw_path_dblp)
print("S2 raw:", len(df_s2), "->", raw_path_s2)



## 5) Normalize + merge

In [None]:
# ===== 5) Stage 2: normalize + merge =====

def unify_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` restricted to the common schema, with cleaned values.

    * Columns missing from ``df`` are added (all-missing); extra columns are dropped.
    * Text columns are whitespace-normalised, and every missing or blank value
      becomes ``None``. Missing values are detected *before* converting to text:
      a blanket ``astype(str)`` would turn ``None`` into the string ``"None"``,
      which downstream code would then treat as a real DOI / id / abstract.
    * ``year`` is converted to the nullable ``Int64`` dtype (unparseable -> ``<NA>``).
    * ``dblp_bibtex`` is passed through unchanged.

    The input frame is not modified.

    Args:
        df: Raw retrieval result from one source (may be empty or lack columns).

    Returns:
        A new DataFrame with exactly the schema columns, in schema order.
    """
    cols = ["source","title","year","venue","url","abstract","doi","arxiv_id","s2_id","dblp_bibtex","topic_bucket","query"]
    text_cols = ["source","title","venue","url","abstract","doi","arxiv_id","s2_id","topic_bucket","query"]

    def _clean_text(value):
        # Map one cell to a normalised non-empty string, or None when missing/blank.
        if value is None:
            return None
        if not isinstance(value, str):
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return None
            value = str(value)
        # Same normalisation as normalize_whitespace: collapse runs, trim the ends.
        value = re.sub(r"\s+", " ", value).strip()
        # The literal "nan" is kept as a missing marker for compatibility with frames
        # that were stringified earlier.
        if not value or value == "nan":
            return None
        return value

    # ensure all columns exist (reindex returns a new frame; the caller's df is untouched)
    out = df.reindex(columns=cols).copy()
    # clean strings
    for c in text_cols:
        out[c] = pd.Series([_clean_text(v) for v in out[c]], index=out.index, dtype=object)
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
    return out

# Bring both sources onto the same column set before stacking them.
df_dblp_u = unify_schema(df_dblp)
df_s2_u = unify_schema(df_s2)

# ignore_index=True gives the merged frame a fresh 0..n-1 index.
df_merged = pd.concat([df_dblp_u, df_s2_u], ignore_index=True)

# generate paper_id
# `year` is a nullable integer column, so it is converted to a plain int (or None when
# missing) before being handed to make_paper_id; a missing title is passed as "".
df_merged["paper_id"] = df_merged.apply(
    lambda r: make_paper_id(r["doi"], r["arxiv_id"], r["s2_id"], r["title"] or "", int(r["year"]) if pd.notna(r["year"]) else None),
    axis=1
)

# Persist the merged (not yet deduplicated) candidates under data/interim.
merged_path = DATA_INTERIM / "candidates_merged.csv"
df_merged.to_csv(merged_path, index=False)
print("Merged:", len(df_merged), "->", merged_path)


## 6) Dedup (PRISMA “duplicates removed”)

In [None]:
# ===== 6) Stage 3: dedup =====

def pick_best_row(group: pd.DataFrame) -> pd.Series:
    """
    Return the most complete row of ``group``.

    Prefer rows with:
    - DOI present        (+5)
    - abstract present   (+3)
    - venue present      (+2)
    - url present        (+1)
    Tie-breaker: SemanticScholar > DBLP (often richer metadata)  (+1)

    If several rows still share the best score, the first one in ``group`` wins, so
    the choice is deterministic. The row is selected by position, which also works
    when index labels repeat.

    Args:
        group: Non-empty frame of candidate rows describing the same paper.

    Returns:
        The selected row as a Series (its ``name`` is the row's index label).
    """
    def score(row) -> int:
        s = 0
        s += 5 if pd.notna(row.get("doi")) else 0
        s += 3 if pd.notna(row.get("abstract")) else 0
        s += 2 if pd.notna(row.get("venue")) else 0
        s += 1 if pd.notna(row.get("url")) else 0
        s += 1 if row.get("source") == "SemanticScholar" else 0
        return s

    scores = group.apply(score, axis=1)
    # argmax returns the first position holding the maximum -> stable tie-breaking.
    best_pos = int(scores.to_numpy().argmax())
    return group.iloc[best_pos]

# primary: keep the best-scoring row for every paper_id.
# The groups are iterated explicitly instead of using groupby(...).apply(...): the selected
# rows are then taken straight from df_merged, which keeps the original column dtypes and
# does not depend on how the installed pandas version treats grouping columns inside apply.
# Iterating a groupby yields the groups in sorted key order, so rows end up ordered by paper_id.
primary_idx = [pick_best_row(_grp).name for _, _grp in df_merged.groupby("paper_id")]
dedup_primary = df_merged.loc[primary_idx].reset_index(drop=True)

# secondary fallback for missing identifiers: title+year collisions
dedup_primary["title_norm"] = dedup_primary["title"].fillna("").apply(normalize_title)
# Rows whose normalised title is empty (missing title, or a title without any ASCII
# letters/digits) must not be merged with each other just because they all normalise to "";
# they fall back to their own paper_id as key.
dedup_primary["ty_key"] = [
    f"{title_norm}::{int(year) if pd.notna(year) else ''}" if title_norm else f"id::{paper_id}"
    for title_norm, year, paper_id in zip(
        dedup_primary["title_norm"], dedup_primary["year"], dedup_primary["paper_id"]
    )
]

final_idx = [pick_best_row(_grp).name for _, _grp in dedup_primary.groupby("ty_key")]
dedup_final = (
    dedup_primary.loc[final_idx]
    .drop(columns=["title_norm", "ty_key"])
    .reset_index(drop=True)
)

dedup_path = DATA_PROCESSED / "candidates_dedup.csv"
dedup_final.to_csv(dedup_path, index=False)
print("Deduped:", len(dedup_final), "->", dedup_path)


## 7) Screening + taxonomy templates (manual step, but generated automatically)

In [None]:
# ===== 7) Stage 4: screening + taxonomy templates =====

def _export_annotation_template(df: pd.DataFrame, path: Path, id_cols: List[str],
                                annotation_cols: List[str]) -> None:
    """Write ``df[id_cols]`` plus empty annotation columns to ``path`` as CSV.

    These files are filled in by hand. If ``path`` already exists, the annotations it
    holds are carried over for every ``paper_id`` that is still present in ``df``, so
    re-running the notebook does not wipe manual work. Annotations of papers that are
    no longer in ``df`` are not kept.
    """
    out = df[id_cols].copy()
    for col in annotation_cols:
        out[col] = ""

    path = Path(path)
    if path.exists():
        try:
            # Read everything as text and keep empty cells as "" (not NaN).
            previous = pd.read_csv(path, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            previous = pd.DataFrame()
        if "paper_id" in previous.columns:
            previous = previous.drop_duplicates(subset="paper_id", keep="first").set_index("paper_id")
            for col in annotation_cols:
                if col in previous.columns:
                    out[col] = out["paper_id"].map(previous[col]).fillna("")

    out.to_csv(path, index=False)

def export_screening_template(df: pd.DataFrame, path: Path) -> None:
    """Write the screening log template (one row per candidate paper).

    Columns: ``paper_id, title, year, venue, url, source`` followed by the manual
    fields ``decision`` (include / exclude / maybe), ``reason`` (e.g., not NS, not
    design/manufacturing, no method detail, etc.) and ``notes``. Existing manual
    entries in ``path`` are preserved for papers that are still in ``df``.
    """
    _export_annotation_template(
        df, path,
        id_cols=["paper_id","title","year","venue","url","source"],
        annotation_cols=["decision", "reason", "notes"],
    )

def export_taxonomy_template(df: pd.DataFrame, path: Path) -> None:
    """Write the taxonomy labelling template (one row per candidate paper).

    Columns: ``paper_id, title, year, venue, url`` followed by the manual fields
    ``kautz_type`` (I..VI), ``Dom`` (CAD/Mfg/Asm/Mat/Topo/G/Dsgn), ``NF`` (P/Gen/Opt/S),
    ``SS`` (O/K/L/C/R), ``constraint_prior``, ``dataset_benchmark`` and ``key_metrics``.
    Existing manual entries in ``path`` are preserved for papers that are still in ``df``.
    """
    _export_annotation_template(
        df, path,
        id_cols=["paper_id","title","year","venue","url"],
        annotation_cols=["kautz_type", "Dom", "NF", "SS",
                         "constraint_prior", "dataset_benchmark", "key_metrics"],
    )

# Both manual-annotation files live in the screening/ folder.
screening_path = SCREENING_DIR / "screening_log.csv"
taxonomy_path = SCREENING_DIR / "taxonomy_labels.csv"

# Generate the two templates from the deduplicated candidate list.
export_screening_template(dedup_final, screening_path)
export_taxonomy_template(dedup_final, taxonomy_path)

print("Wrote:", screening_path)
print("Wrote:", taxonomy_path)


## 8) Plots (years + venues) → outputs/figures/

In [None]:
# ===== 8) Plots =====

def plot_year_hist(df: pd.DataFrame, out_path: Path) -> None:
    """Save a histogram of publication years as an SVG file.

    One bin is used per calendar year, with edges at ``year - 0.5`` and ``year + 0.5``.
    Deriving the bin count from the number of distinct years instead gives fractional
    bin edges, which can lump neighbouring years into one bar.

    Args:
        df: Frame with a ``year`` column; missing years are ignored.
        out_path: Destination of the SVG figure.

    Returns:
        None. Prints a message and writes nothing when there are no years.
    """
    years = df["year"].dropna().astype(int)
    if years.empty:
        print("No years to plot.")
        return
    first_year, last_year = int(years.min()), int(years.max())
    # Half-integer edges centre each bar on its year.
    bin_edges = [y - 0.5 for y in range(first_year, last_year + 2)]
    plt.figure(figsize=(10,6))
    plt.hist(years, bins=bin_edges)
    plt.xlabel("Year")
    plt.ylabel("Count")
    plt.title("Candidate papers by year (deduped)")
    plt.tight_layout()
    plt.savefig(out_path, format="svg")
    plt.close()

def plot_top_venues(df: pd.DataFrame, out_path: Path, top_n: int = 25) -> None:
    """Save a bar chart of the ``top_n`` most frequent venues as an SVG file.

    Rows without a venue are ignored. When no venue is left, a message is printed
    and no figure is written.
    """
    venues = df["venue"].dropna()
    if venues.empty:
        print("No venues to plot.")
        return

    top_counts = venues.value_counts().head(top_n)
    labels = top_counts.index.astype(str)
    heights = top_counts.values

    plt.figure(figsize=(12,6))
    plt.bar(labels, heights)
    plt.xticks(rotation=75, ha="right")
    plt.xlabel("Venue")
    plt.ylabel("Count")
    plt.title(f"Top {top_n} venues (deduped candidates)")
    plt.tight_layout()
    plt.savefig(out_path, format="svg")
    plt.close()

# Render both overview figures from the deduplicated candidates; each is saved as SVG
# in the figures output folder.
plot_year_hist(dedup_final, OUTPUTS_FIG / "candidates_by_year.svg")
plot_top_venues(dedup_final, OUTPUTS_FIG / "top_venues.svg", top_n=25)

print("Saved figures to", OUTPUTS_FIG)


## 9) Export BibTeX bundle (only from DBLP rows that have it)

In [None]:
# ===== 9) Optional: BibTeX export =====

def export_bibtex_from_dblp(df: pd.DataFrame, out_path: Path) -> None:
    """Write the distinct BibTeX records found in ``df["dblp_bibtex"]`` to ``out_path``.

    Missing values and texts that do not start with ``@`` (after trimming) are skipped.
    Records are de-duplicated by the SHA-1 of their trimmed text, keeping first-seen
    order, and written separated by blank lines. A summary line is printed afterwards.
    """
    raw_texts = df["dblp_bibtex"].dropna().astype(str)

    # digest -> record; setdefault keeps the first occurrence and dict order keeps the sequence
    by_digest = {}
    for raw in raw_texts:
        entry = raw.strip()
        if not entry.startswith("@"):
            continue
        digest = hashlib.sha1(entry.encode("utf-8")).hexdigest()
        by_digest.setdefault(digest, entry)

    uniq = list(by_digest.values())
    out_path.write_text("\n\n".join(uniq) + "\n", encoding="utf-8")
    print(f"BibTeX entries written: {len(uniq)} -> {out_path}")

# The merged (pre-dedup) frame is used so that every row carrying a DBLP BibTeX record is
# considered; identical records are written only once by the export function.
export_bibtex_from_dblp(df_merged, BIB_DIR / "candidates_from_dblp.bib")
