In [6]:
import sys
import platform
import pytrec_eval

# Keyword list for evaluating search engine capabilities
# Compatible with Google Scholar and AstA Paper Finder
# includes terms from numerical methods, econometrics, optimization, and dynamic systems

keywords = [
    "numerical methods",
    "finite element method",
    "stochastic processes",
    "optimization theory",
    "statistical learning",
    "panel data estimation",
    "instrumental variables estimation",
    "causal inference econometrics",
    "nonparametric econometric methods",
    "dynamic optimization in economics",
    "general equilibrium theory",
    "dynamic stochastic general equilibrium models",
    "Hamilton–Jacobi–Bellman equation economics",
    "viscosity solutions economic models",
    "nonlinear dynamical systems in economics",
    "Monte Carlo methods econometrics",
    "numerical solution of DSGE models",
    "semiparametric efficiency bound"
]

############### apply 

# Google scholar

In [4]:
!pip install google-search-results



In [2]:
"""
Google Scholar search -> pytrec_eval-ready JSON
via SerpApi

Prereqs:
  pip install google-search-results requests

Env:
  export SERPAPI_API_KEY="YOUR_KEY"

Outputs:
  - run.json   (pytrec_eval run: qid -> docid -> score)
  - qrels_template.json (empty template to fill relevance judgments)
  - docs.json  (doc metadata by docid)
"""

import os
import json
import time
import hashlib
from typing import Dict, Any, List
from serpapi import GoogleSearch

In [3]:
# -----------------------
# 2) Config
# -----------------------
import os
os.environ["SERPAPI_API_KEY"] = "04b8e4f44d6cbb9fce4af8f818407abf71876911cc0a52074b4a6dc65221063f"

API_KEY = os.getenv("SERPAPI_API_KEY")
if not API_KEY:
    raise RuntimeError("Missing SERPAPI_API_KEY environment variable.")




NUM_RESULTS_PER_QUERY = 20      # how many scholar results per query
SLEEP_SECONDS = 2               # polite delay between requests
LANG = "en"
# If you want CN interface language, change to "zh-CN" but scholar content may vary.

In [4]:


# -----------------------
# 3) Helpers
# -----------------------
def stable_docid(title: str, link: str) -> str:
    """
    Make a stable docid for pytrec_eval.
    Prefer link+title hashing to avoid collisions.
    """
    raw = (title or "").strip() + "||" + (link or "").strip()
    h = hashlib.sha1(raw.encode("utf-8")).hexdigest()
    return f"D{h[:16]}"  # short but stable

def rank_score(rank: int) -> float:
    # Higher rank => higher score. Works for pytrec_eval ranking metrics.
    return 1.0 / float(rank)

def scholar_search(query: str, num: int) -> List[Dict[str, Any]]:
    """
    Fetch results from SerpApi Google Scholar engine.
    """
    params = {
        "engine": "google_scholar",
        "q": query,
        "api_key": API_KEY,
        "hl": LANG,
        "num": num,
    }
    search = GoogleSearch(params)
    data = search.get_dict()
    return data.get("organic_results", [])

In [9]:
import pandas as pd

In [10]:

# -----------------------
# 4) Main: build run + docs + empty qrels template
# -----------------------
run: Dict[str, Dict[str, float]] = {}
qrels_template: Dict[str, Dict[str, int]] = {}
docs: Dict[str, Dict[str, Any]] = {}
rows = []

for i, q in enumerate(keywords, start=1):
    qid = f"Q{i:03d}"
    print(f"[{qid}] Searching: {q}")

    results = scholar_search(q, NUM_RESULTS_PER_QUERY)

    run[qid] = {}
    qrels_template[qid] = {}

    for r_idx, r in enumerate(results, start=1):
        title = r.get("title", "")
        link = r.get("link", "") or r.get("result_id", "")  # fallback
        snippet = r.get("snippet", "")
        publication_info = r.get("publication_info", {})
        cited_by = (r.get("inline_links", {}).get("cited_by", {}) or {}).get("total", None)

        docid = stable_docid(title, link)

        # Store run score (rank-based)
        run[qid][docid] = rank_score(r_idx)

        # Store doc metadata for later inspection / judging
        if docid not in docs:
            docs[docid] = {
                "title": title,
                "link": link,
                "snippet": snippet,
                "publication_info": publication_info,
                "cited_by": cited_by,
            }
        rows.append({
        "qid": qid,
        "query": q,
        "docid": docid,
        "rank": r_idx,
        "score": rank_score(r_idx),
        "title": title,
        "link": link,
        "snippet": snippet,
        "cited_by": cited_by,
        "relevance": 0   # 人工之后再改
        })

        # Empty qrels slot for human judgment (fill later with 0/1/2...)
        # Example: qrels_template[qid][docid] = 1
        qrels_template[qid][docid] = 0

    time.sleep(SLEEP_SECONDS)

# -----------------------
# 5) Save JSON files
# -----------------------
with open("run.json", "w", encoding="utf-8") as f:
    json.dump(run, f, ensure_ascii=False, indent=2)

with open("qrels_template.json", "w", encoding="utf-8") as f:
    json.dump(qrels_template, f, ensure_ascii=False, indent=2)

with open("docs.json", "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)

print("\nDone.")
print("Wrote: run.json, qrels_template.json, docs.json")
print("Next: fill qrels_template.json with relevance labels, then evaluate with pytrec_eval.")

import pandas as pd
df = pd.DataFrame(rows)

df.to_csv(
    "judgments.csv",
    index=False,
    encoding="utf-8"
)

print("Wrote: judgments.csv")


[Q001] Searching: numerical methods
[Q002] Searching: finite element method
[Q003] Searching: stochastic processes
[Q004] Searching: optimization theory
[Q005] Searching: statistical learning
[Q006] Searching: panel data estimation
[Q007] Searching: instrumental variables estimation
[Q008] Searching: causal inference econometrics
[Q009] Searching: nonparametric econometric methods
[Q010] Searching: dynamic optimization in economics
[Q011] Searching: general equilibrium theory
[Q012] Searching: dynamic stochastic general equilibrium models
[Q013] Searching: Hamilton–Jacobi–Bellman equation economics
[Q014] Searching: viscosity solutions economic models
[Q015] Searching: nonlinear dynamical systems in economics
[Q016] Searching: Monte Carlo methods econometrics
[Q017] Searching: numerical solution of DSGE models
[Q018] Searching: semiparametric efficiency bound

Done.
Wrote: run.json, qrels_template.json, docs.json
Next: fill qrels_template.json with relevance labels, then evaluate with 

In [7]:
import json, os

for fn in ["run.json", "qrels_template.json", "docs.json"]:
    print(fn, "exists?" , os.path.exists(fn))

run = json.load(open("run.json","r",encoding="utf-8"))
qrels = json.load(open("qrels_template.json","r",encoding="utf-8"))
docs = json.load(open("docs.json","r",encoding="utf-8"))

print("queries in run:", len(run))
print("queries in qrels:", len(qrels))
print("docs:", len(docs))


run.json exists? True
qrels_template.json exists? True
docs.json exists? True
queries in run: 18
queries in qrels: 18
docs: 360


In [8]:
import json

run = json.load(open("run.json","r",encoding="utf-8"))

counts = {qid: len(d) for qid, d in run.items()}
print("min/avg/max results per query:",
      min(counts.values()),
      sum(counts.values())/len(counts),
      max(counts.values()))
print("empty qids:", [qid for qid,n in counts.items() if n==0])


min/avg/max results per query: 20 20.0 20
empty qids: []


In [9]:
import os, requests, json

API_KEY = os.getenv("SERPAPI_API_KEY")
print("SERPAPI_API_KEY loaded?", API_KEY is not None, "len=", 0 if API_KEY is None else len(API_KEY))

url = "https://serpapi.com/search"
params = {
    "engine": "google_scholar",
    "q": "numerical methods",
    "num": 5,
    "hl": "en",
    "api_key": API_KEY,
}

r = requests.get(url, params=params, timeout=30)
print("HTTP status:", r.status_code)

try:
    data = r.json()
except Exception as e:
    print("JSON decode error:", e)
    print("Raw text (first 500 chars):", r.text[:500])
    raise

print("Top-level keys:", list(data.keys()))
print("search_metadata:", data.get("search_metadata"))
print("search_metadata.status:", (data.get("search_metadata") or {}).get("status"))
print("error:", data.get("error"))
print("error_message:", data.get("error_message"))
print("organic_results length:", len(data.get("organic_results", [])))

# 保存完整响应，便于你检查/发我
with open("serpapi_debug.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Saved full response to serpapi_debug.json")


SERPAPI_API_KEY loaded? True len= 64
HTTP status: 200
Top-level keys: ['search_metadata', 'search_parameters', 'search_information', 'organic_results', 'related_searches', 'pagination', 'serpapi_pagination']
search_metadata: {'id': '6950cb3866bc780b189b0fa7', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/a19b7f07460ca09e/6950cb3866bc780b189b0fa7.json', 'created_at': '2025-12-28 06:16:24 UTC', 'processed_at': '2025-12-28 06:16:24 UTC', 'google_scholar_url': 'https://scholar.google.com/scholar?q=numerical+methods&hl=en&num=5', 'raw_html_file': 'https://serpapi.com/searches/a19b7f07460ca09e/6950cb3866bc780b189b0fa7.html', 'total_time_taken': 0.93}
search_metadata.status: Success
error: None
error_message: None
organic_results length: 5
Saved full response to serpapi_debug.json


In [None]:
import os
os.environ["SERPAPI_API_KEY"] = "PASTE_YOUR_FULL_SERPAPI_KEY_HERE"

k = os.getenv("SERPAPI_API_KEY")
print("len=", len(k))
print("prefix=", k[:4], "suffix=", k[-4:])


######## need to change api key ###############

len= 32
prefix= PAST suffix= HERE


In [12]:
import os, json, hashlib, time
from serpapi import GoogleSearch

API_KEY = os.getenv("SERPAPI_API_KEY")
assert API_KEY, "Missing SERPAPI_API_KEY"

K = 10          # 与 AstA 对齐：每个 query 10 条
HL = "en"       # Scholar 界面语言
SLEEP_S = 1.0   # 请求间隔（可调）

def stable_docid(title: str, link: str) -> str:
    raw = (title or "").strip() + "||" + (link or "").strip()
    h = hashlib.sha1(raw.encode("utf-8")).hexdigest()
    return f"D{h[:16]}"

def rank_score(rank: int) -> float:
    # pytrec_eval 只需要可排序的分数；Scholar 没有原生 score，用 rank-based 很标准
    return 1.0 / rank

def scholar_topk(query: str, k: int = K, hl: str = HL):
    params = {
        "engine": "google_scholar",
        "q": query,
        "api_key": API_KEY,
        "num": k,      # 关键：直接取 10 条
        "start": 0,    # 不翻页
        "hl": hl,
    }
    data = GoogleSearch(params).get_dict()
    return (data.get("organic_results") or [])[:k]

def build_pytrec_eval_json(queries, k: int = K):
    run = {}
    qrels_template = {}
    docs = {}

    for i, q in enumerate(queries, start=1):
        qid = f"Q{i:03d}"
        results = scholar_topk(q, k=k)

        run[qid] = {}
        qrels_template[qid] = {}

        for rank, r in enumerate(results, start=1):
            title = r.get("title", "")
            link = r.get("link", "") or r.get("result_id", "")
            snippet = r.get("snippet", "")
            pub = r.get("publication_info", {}) or {}
            cited_by = (r.get("inline_links", {}).get("cited_by", {}) or {}).get("total", None)

            docid = stable_docid(title, link)

            run[qid][docid] = rank_score(rank)

            if docid not in docs:
                docs[docid] = {
                    "title": title,
                    "link": link,
                    "snippet": snippet,
                    "publication_info": pub,
                    "cited_by": cited_by,
                }

            # 默认 0；你后面人工改成 1/2/3...
            qrels_template[qid][docid] = 1

        print(f"[{qid}] {q} -> got {len(results)} results (target {k})")
        time.sleep(SLEEP_S)
    with open("run_scholar_k10.json", "w", encoding="utf-8") as f:
        json.dump(run, f, ensure_ascii=False, indent=2)
    with open("qrels_template_k10.json", "w", encoding="utf-8") as f:
        json.dump(qrels_template, f, ensure_ascii=False, indent=2)
    with open("docs_scholar_k10.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, ensure_ascii=False, indent=2)

    print("Saved: run_scholar_k10.json, qrels_template_k10.json, docs_scholar_k10.json")

# ---- 用你的 query list 替换这里（示例先放 1 个）----
queries = ["numerical methods"]

build_pytrec_eval_json(queries, k=10)

[Q001] numerical methods -> got 10 results (target 10)
Saved: run_scholar_k10.json, qrels_template_k10.json, docs_scholar_k10.json


# Asta paper finder

In [14]:
import os, json, hashlib
from typing import Dict, Any, List

keywords = [
    "numerical methods",
    "finite element method",
    "stochastic processes",
    "optimization theory",
    "statistical learning",
    "panel data estimation",
    "instrumental variables estimation",
    "causal inference econometrics",
    "nonparametric econometric methods",
    "dynamic optimization in economics",
    "general equilibrium theory",
    "dynamic stochastic general equilibrium models",
    "Hamilton–Jacobi–Bellman equation economics",
    "viscosity solutions economic models",
    "nonlinear dynamical systems in economics",
    "Monte Carlo methods econometrics",
    "numerical solution of DSGE models",
    "semiparametric efficiency bound",
]

ASTA_MCP_URL = "https://asta-tools.allen.ai/mcp/v1"
NUM_RESULTS_PER_QUERY = 50
FIELDS = "paperId,title,url,year,venue,authors,abstract,tldr,publicationDate,fieldsOfStudy,isOpenAccess,openAccessPdf"


In [15]:
def rank_score(rank: int) -> float:
    return 1.0 / float(rank)

def stable_docid(paper_id: str, title: str = "", url: str = "") -> str:
    if paper_id:
        return "A_" + paper_id.replace(":", "_")
    raw = (title or "").strip() + "||" + (url or "").strip()
    h = hashlib.sha1(raw.encode("utf-8")).hexdigest()
    return f"A_{h[:16]}"

def papers_from_calltool_result(resp) -> List[Dict[str, Any]]:
    """
    Asta MCP: resp.content is a list of TextContent, each one is a JSON string for a paper.
    """
    out = []
    content = getattr(resp, "content", None) or []
    for item in content:
        if getattr(item, "type", None) == "text":
            txt = getattr(item, "text", "")
            if not txt:
                continue
            try:
                obj = json.loads(txt)
                if isinstance(obj, dict) and ("paperId" in obj or "title" in obj):
                    out.append(obj)
            except Exception:
                # 如果偶尔不是 JSON，就跳过
                continue
        elif getattr(item, "type", None) == "json":
            obj = getattr(item, "json", None)
            if isinstance(obj, dict):
                out.append(obj)
            elif isinstance(obj, list):
                out.extend([x for x in obj if isinstance(x, dict)])
    return out

In [16]:
import sys
print(sys.version)
import mcp
print("mcp imported OK")

3.11.14 (main, Oct 21 2025, 18:27:30) [Clang 20.1.8 ]
mcp imported OK


In [18]:
async def run_asta():
    key = os.getenv("ASTA_TOOL_KEY")
    if not key:
        raise RuntimeError("Missing ASTA_TOOL_KEY in this kernel.")

    from mcp import ClientSession
    from mcp.client.streamable_http import streamablehttp_client

    headers = {"x-api-key": key}

    run: Dict[str, Dict[str, float]] = {}
    qrels_template: Dict[str, Dict[str, int]] = {}
    docs: Dict[str, Dict[str, Any]] = {}

    async with streamablehttp_client(ASTA_MCP_URL, headers=headers) as streams:
        read, write = streams[0], streams[1]
        async with ClientSession(read, write) as session:
            tools = await session.list_tools()
            tool_names = [t.name for t in tools.tools]
            print("MCP tools available:", tool_names)

            if "search_papers_by_relevance" not in tool_names:
                raise RuntimeError(f"search_papers_by_relevance not found. Available: {tool_names}")

            for i, q in enumerate(keywords, start=1):
                qid = f"Q{i:03d}"
                print(f"[{qid}] searching: {q}")

                resp = await session.call_tool(
                    "search_papers_by_relevance",
                    {"keyword": q, "fields": FIELDS, "limit": NUM_RESULTS_PER_QUERY},
                )

                papers = papers_from_calltool_result(resp)

                run[qid] = {}
                qrels_template[qid] = {}

                for r_idx, p in enumerate(papers, start=1):
                    paper_id = p.get("paperId") or p.get("paper_id") or p.get("id") or ""
                    title = p.get("title", "")
                    url = p.get("url", "")

                    docid = stable_docid(paper_id, title=title, url=url)
                    run[qid][docid] = rank_score(r_idx)
                    qrels_template[qid][docid] = 0

                    if docid not in docs:
                        docs[docid] = {
                            "paper_id": paper_id,
                            "title": title,
                            "url": url,
                            "year": p.get("year"),
                            "venue": p.get("venue"),
                            "publicationDate": p.get("publicationDate"),
                            "authors": p.get("authors"),
                            "abstract": p.get("abstract"),
                            "tldr": p.get("tldr"),
                            "fieldsOfStudy": p.get("fieldsOfStudy"),
                            "isOpenAccess": p.get("isOpenAccess"),
                            "openAccessPdf": p.get("openAccessPdf"),
                        }

    with open("asta_run.json", "w", encoding="utf-8") as f:
        json.dump(run, f, ensure_ascii=False, indent=2)
    with open("asta_qrels_template.json", "w", encoding="utf-8") as f:
        json.dump(qrels_template, f, ensure_ascii=False, indent=2)
    with open("asta_docs.json", "w", encoding="utf-8") as f:
        json.dump(docs, f, ensure_ascii=False, indent=2)

    print("\n✅ Saved files: asta_run.json, asta_qrels_template.json, asta_docs.json")
    print("docs count:", len(docs))
    print("min results/query:", min(len(v) for v in run.values()) if run else 0)

# 在 Jupyter 里运行：
await run_asta()

MCP tools available: ['get_paper', 'get_paper_batch', 'get_citations', 'search_authors_by_name', 'get_author_papers', 'search_papers_by_relevance', 'search_paper_by_title', 'snippet_search']
[Q001] searching: numerical methods
[Q002] searching: finite element method
[Q003] searching: stochastic processes
[Q004] searching: optimization theory
[Q005] searching: statistical learning
[Q006] searching: panel data estimation
[Q007] searching: instrumental variables estimation
[Q008] searching: causal inference econometrics
[Q009] searching: nonparametric econometric methods
[Q010] searching: dynamic optimization in economics
[Q011] searching: general equilibrium theory
[Q012] searching: dynamic stochastic general equilibrium models
[Q013] searching: Hamilton–Jacobi–Bellman equation economics
[Q014] searching: viscosity solutions economic models
[Q015] searching: nonlinear dynamical systems in economics
[Q016] searching: Monte Carlo methods econometrics
[Q017] searching: numerical solution of

In [13]:
import os, json

ASTA_MCP_URL = "https://asta-tools.allen.ai/mcp/v1"

async def debug_one():
    from mcp import ClientSession
    from mcp.client.streamable_http import streamablehttp_client

    key = os.getenv("ASTA_TOOL_KEY")
    headers = {"x-api-key": key}

    async with streamablehttp_client(ASTA_MCP_URL, headers=headers) as streams:
        read, write = streams[0], streams[1]
        async with ClientSession(read, write) as session:
            resp = await session.call_tool(
                "search_papers_by_relevance",
                {
                    "keyword": "numerical methods",
                    "fields": "paperId,title,url,year,abstract",
                    "limit": 5
                },
            )

            print("resp type:", type(resp))
            content = getattr(resp, "content", None)
            print("has content?", content is not None, "content len:", (len(content) if content else None))

            if content:
                for i, item in enumerate(content):
                    print(f"\n--- content[{i}] type={getattr(item,'type',None)} ---")
                    if getattr(item, "type", None) == "json":
                        print("json keys:", list(item.json.keys())[:30])
                        with open("asta_debug_payload.json", "w", encoding="utf-8") as f:
                            json.dump(item.json, f, ensure_ascii=False, indent=2)
                        print("saved asta_debug_payload.json")
                    elif getattr(item, "type", None) == "text":
                        print("text head:", item.text[:500])
                        # 如果 text 里是 JSON，也保存下来
                        try:
                            j = json.loads(item.text)
                            with open("asta_debug_payload.json", "w", encoding="utf-8") as f:
                                json.dump(j, f, ensure_ascii=False, indent=2)
                            print("saved asta_debug_payload.json (from text)")
                        except Exception:
                            pass

await debug_one()


resp type: <class 'mcp.types.CallToolResult'>
has content? True content len: 5

--- content[0] type=text ---
text head: {
  "paperId": "82482585e94192b4e9913727e461f89cd08e9725",
  "url": "https://www.semanticscholar.org/paper/82482585e94192b4e9913727e461f89cd08e9725",
  "title": "Pseudo Numerical Methods for Diffusion Models on Manifolds",
  "year": 2022,
  "openAccessPdf": {
    "url": "",
    "status": null,
    "license": null,
    "disclaimer": "Notice: Paper or abstract available at https://arxiv.org/abs/2202.09778, which is subject to the license by the author or copyright owner provided with this content.
saved asta_debug_payload.json (from text)

--- content[1] type=text ---
text head: {
  "paperId": "8705e8fe0632bd11f200455a5125692a2547a018",
  "url": "https://www.semanticscholar.org/paper/8705e8fe0632bd11f200455a5125692a2547a018",
  "title": "Riemann Solvers and Numerical Methods for Fluid Dynamics",
  "year": 1997,
  "openAccessPdf": {
    "url": "",
    "status": "CLOSED",

In [19]:
import json
run = json.load(open("asta_run.json","r",encoding="utf-8"))
docs = json.load(open("asta_docs.json","r",encoding="utf-8"))

print("queries:", len(run))
print("docs:", len(docs))
print("min results/query:", min(len(v) for v in run.values()))
print("sample Q001 top5:", list(run["Q001"].items())[:5])


queries: 18
docs: 899
min results/query: 50
sample Q001 top5: [('A_82482585e94192b4e9913727e461f89cd08e9725', 1.0), ('A_8705e8fe0632bd11f200455a5125692a2547a018', 0.5), ('A_e1053197256c6c3c0631377ec23a3f7dc1cb4781', 0.3333333333333333), ('A_ca46c28ef834d57af1a0ce1d6393cca1c8395acb', 0.25), ('A_04471528ea465e7396588756db53af14b880837a', 0.2)]


In [20]:
import json, csv

TOPK = 10  # 每个query标注前10条，省时且足够比较排序能力

run = json.load(open("asta_run.json","r",encoding="utf-8"))
docs = json.load(open("asta_docs.json","r",encoding="utf-8"))

rows = []
for qid, ranking in run.items():
    top = sorted(ranking.items(), key=lambda x: x[1], reverse=True)[:TOPK]
    for rank, (docid, score) in enumerate(top, start=1):
        d = docs.get(docid, {})
        rows.append({
            "qid": qid,
            "rank": rank,
            "docid": docid,
            "score": score,
            "title": d.get("title",""),
            "year": d.get("year",""),
            "venue": d.get("venue",""),
            "url": d.get("url",""),
            "abstract": (d.get("abstract","") or "")[:500],
            "relevance": ""  # 你填 0/1/2
        })

with open("asta_judgments_top10.csv", "w", encoding="utf-8", newline="") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader()
    w.writerows(rows)

print("Saved: asta_judgments_top10.csv")


Saved: asta_judgments_top10.csv


In [None]:
import csv, json
from collections import defaultdict

qrels = defaultdict(dict)

with open("/Users/terreturn/Desktop/python /Project_paper_finder/asta_judgments_top10_graded.csv", "r", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        rel = row["relevance"].strip()
        if rel == "":
            continue
        qrels[row["qid"]][row["docid"]] = int(rel)

with open("asta_qrels.json", "w", encoding="utf-8") as f:
    json.dump(qrels, f, ensure_ascii=False, indent=2)

print("Saved: asta_qrels.json (for pytrec_eval)")


### Asta json file done


Saved: asta_qrels.json (for pytrec_eval)


In [18]:
import json, csv

RUN_SCHOLAR = "run_scholar_k10.json"
RUN_ASTA    = "asta_run.json"
DOCS_SCHOLAR = "docs_scholar_k10.json"
DOCS_ASTA    = "asta_docs.json"
ASTA_QRELS   = "asta_qrels.json"

OUT_POOL_CSV = "pooled_judgments_top10.csv"

def load(p):
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

run_s = load(RUN_SCHOLAR)
run_a = load(RUN_ASTA)
docs_s = load(DOCS_SCHOLAR)
docs_a = load(DOCS_ASTA)
asta_qrels = load(ASTA_QRELS)

common_qids = sorted(set(run_s) & set(run_a))
rows = []
seen = set()

for qid in common_qids:
    pool = set(run_s[qid]) | set(run_a[qid])  # ≤ 20
    for docid in pool:
        if (qid, docid) in seen:
            continue
        seen.add((qid, docid))

        meta = docs_s.get(docid) or docs_a.get(docid) or {}
        source = []
        if docid in run_s[qid]: source.append("scholar")
        if docid in run_a[qid]: source.append("asta")

        rel = ""
        if qid in asta_qrels and docid in asta_qrels[qid]:
            rel = asta_qrels[qid][docid]   # 自动填 AstA 的标注

        rows.append({
            "qid": qid,
            "docid": docid,
            "source": "+".join(source),
            "title": meta.get("title",""),
            "url": meta.get("url", meta.get("link","")),
            "snippet": meta.get("abstract", meta.get("snippet","")),
            "relevance": rel
        })

with open(OUT_POOL_CSV, "w", encoding="utf-8", newline="") as f:
    w = csv.DictWriter(
        f,
        fieldnames=["qid","docid","source","title","url","snippet","relevance"]
    )
    w.writeheader()
    w.writerows(rows)

print("Saved:", OUT_POOL_CSV)
print("Next: open it and fill missing relevance (mostly scholar-only docs).")


Saved: pooled_judgments_top10.csv
Next: open it and fill missing relevance (mostly scholar-only docs).


# Comparison

In [11]:
import json
import pytrec_eval

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def mean_metric(per_query_scores, metric):
    vals = [v[metric] for v in per_query_scores.values() if metric in v]
    return sum(vals) / len(vals) if vals else float("nan")

In [21]:
import json, pandas as pd

asta = json.load(open("useful/asta_run_from_judgments.json"))
scholar = json.load(open("useful/scholar_run.json"))

rows = []

TOPK = 10

for qid in asta.keys():
    # Asta
    for rank, (docid, score) in enumerate(
        sorted(asta[qid].items(), key=lambda x: -x[1])[:TOPK], start=1
    ):
        rows.append({
            "qid": qid,
            "docid": docid,
            "source": "asta",
            "rank": rank,
            "score": score,
            "relevance": ""
        })

    # Scholar
    for rank, (docid, score) in enumerate(
        sorted(scholar[qid].items(), key=lambda x: -x[1])[:TOPK], start=1
    ):
        rows.append({
            "qid": qid,
            "docid": docid,
            "source": "scholar",
            "rank": rank,
            "score": score,
            "relevance": ""
        })

df = pd.DataFrame(rows).drop_duplicates(subset=["qid","docid"])
df.to_csv("qrels_pooling.csv", index=False)


In [None]:
import pandas as pd, json

df = pd.read_csv("useful/qrels_pooling 2.csv")

qrels = {}
for qid, grp in df.groupby("qid"):
    qrels[qid] = {
        row.docid: int(row.relevance)
        for row in grp.itertuples()
        if int(row.relevance) > 0
    }

json.dump(qrels, open("qrels_pooled.json","w"), indent=2)


In [29]:
# 1) Load
qrels   = load_json("useful/qrels_pooled.json")
asta    = load_json("useful/asta_run_from_judgments.json")
scholar = load_json("useful/scholar_run.json")

# 2) Evaluate
metrics = {"map", "ndcg_cut_10", "recall_10", "P_10"}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, metrics)

asta_per_q = evaluator.evaluate(asta)
scholar_per_q = evaluator.evaluate(scholar)

# 3) Print summary
print("=== AVERAGE METRICS ===")
for m in ["map", "ndcg_cut_10", "recall_10", "P_10"]:
    print(f"{m:12s}  ASTA={mean_metric(asta_per_q, m):.4f}   SCHOLAR={mean_metric(scholar_per_q, m):.4f}")

# 4) (Optional) per-query diffs
print("\n=== PER-QUERY NDCG@10 DIFF (ASTA - SCHOLAR) ===")
for qid in sorted(qrels.keys()):
    a = asta_per_q.get(qid, {}).get("ndcg_cut_10", float("nan"))
    s = scholar_per_q.get(qid, {}).get("ndcg_cut_10", float("nan"))
    print(f"{qid}: {a - s:+.4f}")

=== AVERAGE METRICS ===
map           ASTA=0.4891   SCHOLAR=0.4944
ndcg_cut_10   ASTA=0.8911   SCHOLAR=0.9323
recall_10     ASTA=0.4955   SCHOLAR=0.5045
P_10          ASTA=0.9444   SCHOLAR=0.9611

=== PER-QUERY NDCG@10 DIFF (ASTA - SCHOLAR) ===
Q001: -0.0392
Q002: +0.0000
Q003: +0.0000
Q004: -0.0331
Q005: +0.0000
Q006: +0.0000
Q007: +0.0694
Q008: +0.1184
Q009: +0.2393
Q010: +0.0000
Q011: +0.0000
Q012: -0.0318
Q013: -0.4485
Q014: -0.2755
Q015: -0.0298
Q016: -0.1420
Q017: +0.0464
Q018: -0.2151
