In [1]:
# Cell 0 — 安装依赖（稳妥版：装到当前内核用的 Python）
import sys, subprocess

print("Using Python:", sys.executable)  # 确认是在你想要的环境里

pkgs = [
    "arxiv",
    "duckduckgo-search",
    "pandas",
    "python-dateutil",
    "tqdm",
    "scikit-learn",
    "langchain",
    "langchain-groq",
]

subprocess.check_call([sys.executable, "-m", "pip", "install", "-U"] + pkgs)
print("✅ Done. If imports still fail, click 'Kernel -> Restart' and run again.")


Using Python: C:\Users\tangb\Desktop\找工作\生成AIプロダクト開発インターン(学生インターン)\project\arxiv-paper-agent\.venv\Scripts\python.exe
✅ Done. If imports still fail, click 'Kernel -> Restart' and run again.


In [2]:
# 你的其他参数保持不变
QUERY = "thermal image"
ARXIV_MAX = 120
DAYS_BACK = 365
TOPK = 20
SLEEP_SEC = 0

# 只导出/展示“相关”的结果
KEEP_ONLY_RELEVANT = True
# 相关性门槛：至少命中多少个关键词才算相关（1=宽松，2=更严格）
RELEVANCE_MIN_HITS = 1

import re, pandas as pd
from datetime import datetime
def sanitize_filename(s: str):
    s = re.sub(r"[^\w\s\-]+", "", (s or "").strip())
    s = re.sub(r"\s+", "_", s)
    return s[:60] or "query"

STAMP = datetime.now().strftime("%Y%m%d_%H%M")
BASE = sanitize_filename(QUERY)


In [3]:
# 🧩 Cell 2 — 从自由文本自动构造“字段检索”（只搜标题/摘要），避免噪声
import arxiv, re
from datetime import datetime, timedelta, timezone
import pandas as pd

ARXIV_MAX = ARXIV_MAX if "ARXIV_MAX" in globals() else 120
DAYS_BACK = DAYS_BACK if "DAYS_BACK" in globals() else 365

def build_query_from_free_text(user_q: str) -> str:
    """
    用户只填自由文本，例如: 'thermal image'
    自动生成 arXiv 字段化查询（仅 ti/abs）：
      (ti:"thermal image" OR abs:"thermal image")
      OR ((ti:thermal OR abs:thermal) AND (ti:image OR abs:image))
    """
    q_raw = (user_q or "").strip()
    q = q_raw.replace('"', '')  # 去掉引号，避免语法冲突
    tokens = [t for t in re.split(r"\s+", q) if t]

    # 1) 短语匹配（标题/摘要）
    phrase_clause = f'(ti:"{q}" OR abs:"{q}")'

    # 2) 逐词 AND（标题/摘要）
    and_terms = []
    for t in tokens:
        if len(t) <= 2:
            continue  # 忽略太短的停用词
        and_terms.append(f"(ti:{t} OR abs:{t})")
    and_clause = " AND ".join(and_terms)

    # 3) 组合
    final = f"({phrase_clause} OR ({and_clause}))" if and_clause else phrase_clause
    return final

def fetch_arxiv(query: str, max_results: int = 80, days_back: int = 365) -> pd.DataFrame:
    search = arxiv.Search(query=query, sort_by=arxiv.SortCriterion.SubmittedDate, max_results=max_results)
    rows, seen = [], set()
    cutoff = datetime.now(timezone.utc) - timedelta(days=days_back)

    for r in search.results():
        pub = r.published if r.published.tzinfo else r.published.replace(tzinfo=timezone.utc)
        if pub < cutoff:
            continue
        key = (r.entry_id or "").strip().lower()
        if key in seen:
            continue
        seen.add(key)
        rows.append({
            "title": (r.title or "").strip(),
            "authors": ", ".join(a.name for a in r.authors),
            "summary": (r.summary or "").strip(),
            "published_utc": pub,
            "categories": ", ".join(r.categories or []),
            "pdf_url": r.pdf_url,
            "arxiv_url": r.entry_id,
            "doi": getattr(r, "doi", None),
            "comment": getattr(r, "comment", None),
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values("published_utc", ascending=False).reset_index(drop=True)
    return df

# —— 实际检索（用户仍然只填 QUERY，我们自动字段化）
ADV_QUERY = build_query_from_free_text(QUERY)
print("构造的检索式：", ADV_QUERY)

df_arxiv = fetch_arxiv(ADV_QUERY, max_results=ARXIV_MAX, days_back=DAYS_BACK)

# 如果极端情况下为空，兜底再用 all:"短语"
if df_arxiv.empty:
    safe_q = (QUERY or "").strip().replace('"', '')
    fallback = f'all:"{safe_q}"'   # 先算 safe_q，再放进 f-string（避免反斜杠）
    print("字段检索为空，使用兜底：", fallback)
    df_arxiv = fetch_arxiv(fallback, max_results=ARXIV_MAX, days_back=DAYS_BACK)

print(f"[Auto-fielded] 命中：{len(df_arxiv)} 条；最新一条日期：",
      df_arxiv["published_utc"].max().strftime("%Y-%m-%d") if len(df_arxiv) else "N/A")
df_arxiv.head(5)


构造的检索式： ((ti:"thermal image" OR abs:"thermal image") OR ((ti:thermal OR abs:thermal) AND (ti:image OR abs:image)))


  for r in search.results():


[Auto-fielded] 命中：120 条；最新一条日期： 2025-09-16


Unnamed: 0,title,authors,summary,published_utc,categories,pdf_url,arxiv_url,doi,comment
0,Long-lived coronal loops in solar active regions,"N. Vasantharaju, H. Peter, L. P. Chitta, S. Ma...",Coronal loops are plasma structures in the sol...,2025-09-16 14:13:26+00:00,astro-ph.SR,http://arxiv.org/pdf/2509.13111v1,http://arxiv.org/abs/2509.13111v1,,"17 pages, 10 figures, accepted for publication..."
1,Coupled Infrared Imaging and Multiphysics Mode...,"Vijay Kumar, Kaitlyn M. Mullin, Hyunggon Park,...",Laser heating during additive manufacturing (A...,2025-09-16 00:54:23+00:00,cond-mat.mtrl-sci,http://arxiv.org/pdf/2509.12545v1,http://arxiv.org/abs/2509.12545v1,,"28 pages, 7 Figures"
2,Liquid Helium Cryogenic TEM below 1 Å,"Suk Hyun Sung, Maya Gates, Nishkarsh Agarwal, ...",Next-generation cryogenic transmission electro...,2025-09-15 21:45:51+00:00,"physics.ins-det, cond-mat.mtrl-sci",http://arxiv.org/pdf/2509.12475v1,http://arxiv.org/abs/2509.12475v1,,
3,"Compression, Impact and Hot Rebound Flows from...","Jamal Wachira, Patrick Antolin",Understanding the processes associated with co...,2025-09-15 06:44:40+00:00,astro-ph.SR,http://arxiv.org/pdf/2509.11627v1,http://arxiv.org/abs/2509.11627v1,,"15 pages, 21 figures, submitted to MNRAS"
4,Dual Band Video Thermography Near Ambient Cond...,"Sriram Narayanan, Mani Ramanagopal, Srinivasa ...",Long-wave infrared radiation captured by a the...,2025-09-14 16:21:29+00:00,cs.CV,http://arxiv.org/pdf/2509.11334v1,http://arxiv.org/abs/2509.11334v1,,


In [4]:
# 相关性词表（覆盖 thermal/infrared 的常见写法，不要求用户改词）
TERMS = {
    "thermal", "thermal image", "thermal imaging", "thermography", "thermographic", "thermogram",
    "infrared", "infrared imaging", "ir", "ir imaging", "flir",
    "lwir", "mwir", "swir", "long-wave infrared", "longwave infrared", "radiometric", "thermovision"
}

def relevance_hits(title: str, abstract: str) -> int:
    text = f"{title or ''} {abstract or ''}".lower()
    return sum(1 for t in TERMS if t in text)

# 只“标注”分数和是否相关，不删除任何行
df_arxiv["relevance_hits"] = df_arxiv.apply(lambda r: relevance_hits(r.get("title",""), r.get("summary","")), axis=1)
df_arxiv["relevant"] = df_arxiv["relevance_hits"] >= RELEVANCE_MIN_HITS

# 方便后续处理：把相关的排在前面
df_arxiv = df_arxiv.sort_values(["relevant", "published_utc"], ascending=[False, False]).reset_index(drop=True)
print("总数:", len(df_arxiv), "| 相关:", int(df_arxiv["relevant"].sum()))
df_arxiv.head(5)[["title","relevance_hits","relevant"]]


总数: 120 | 相关: 120


Unnamed: 0,title,relevance_hits,relevant
0,Long-lived coronal loops in solar active regions,2,True
1,Coupled Infrared Imaging and Multiphysics Mode...,3,True
2,Liquid Helium Cryogenic TEM below 1 Å,2,True
3,"Compression, Impact and Hot Rebound Flows from...",2,True
4,Dual Band Video Thermography Near Ambient Cond...,6,True


In [5]:
# 🧩 Cell 4 — 读取密钥 + 初始化 Groq（完整可运行版）

import os
from getpass import getpass

# 方式一：如果你已在 PowerShell 设置了环境变量，这里会直接读到
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# 方式二：如果上面没读到，这里交互式粘贴（输入不会显示）
if not GROQ_API_KEY:
    GROQ_API_KEY = getpass("Paste your GROQ_API_KEY (input hidden): ")
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# 初始化 Groq + LangChain
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage

MODEL_NAME = "llama-3.3-70b-versatile"   # 更快可用 "llama-3.1-8b-instant"
llm = ChatGroq(model=MODEL_NAME, temperature=0.2, max_tokens=512)

def summarize_one_en(title: str, abstract: str, keyword: str) -> str:
    """严格三条英文要点；不允许硬扯相关性。"""
    prompt = f"""Summarize in English using EXACTLY three one-sentence bullets.
Use your own words; do not copy phrases from the abstract.

- What it does:
- Novelty:
- Relevance to "{keyword}": (If not clearly about the topic, output exactly: Not relevant to "{keyword}".)

Title: {title}
Abstract: {abstract}
"""
    return llm.invoke([HumanMessage(content=prompt)]).content

def split_three_lines(text: str):
    lines = [l.strip() for l in (text or "").splitlines() if l.strip()]
    out = []
    import re
    for l in lines:
        l2 = l.lstrip("-*•").strip()
        l2 = re.sub(r"^(What it does|Novelty|Relevance.*?):\s*", "", l2, flags=re.I)
        if l2:
            out.append(l2)
    while len(out) < 3:
        out.append("N/A")
    return out[0], out[1], out[2]

print("Groq ready:", MODEL_NAME, "| Key loaded:", "***" + (GROQ_API_KEY[-4:] if GROQ_API_KEY else "NONE"))


Groq ready: llama-3.3-70b-versatile | Key loaded: ***jXdn


In [6]:
# 只对“相关”的做英文三要点总结；若一个也没有，就提示并跳过
from tqdm import tqdm
import time

if "df_arxiv" not in globals():
    raise RuntimeError("df_arxiv 未找到，请先运行 Cell 2。")
if "summarize_one_en" not in globals():
    raise RuntimeError("LLM 函数未定义，请先运行 Cell 4。")

# 确保三列存在
for col in ["summary_en_what","summary_en_novelty","summary_en_relevance"]:
    if col not in df_arxiv.columns:
        df_arxiv[col] = pd.NA

# 只取相关的；再按 TOPK 截断
rows_rel = df_arxiv[df_arxiv.get("relevant", False)].head(min(TOPK, len(df_arxiv)))
if rows_rel.empty:
    print(f"⚠️ 没有达到相关性门槛（RELEVANCE_MIN_HITS={RELEVANCE_MIN_HITS}）的论文。")
    print("建议：把 RELEVANCE_MIN_HITS 设为 1 或放宽关键词；本次不进行 LLM 总结。")
else:
    ok = fail = 0
    for idx, r in tqdm(rows_rel.iterrows(), total=len(rows_rel), desc="Summarizing relevant"):
        try:
            text = summarize_one_en(r.get("title",""), r.get("summary",""), QUERY)
            w, n, rel = split_three_lines(text)
            # 相关性已判定为 True，就不要 Not relevant 的字样
        except Exception as e:
            w, n = "N/A", "N/A"
            rel = f'Not summarized (error: {e.__class__.__name__})'
            fail += 1
        else:
            ok += 1
        df_arxiv.at[idx, "summary_en_what"] = w
        df_arxiv.at[idx, "summary_en_novelty"] = n
        df_arxiv.at[idx, "summary_en_relevance"] = rel
        if SLEEP_SEC:
            time.sleep(SLEEP_SEC)
    print(f"Done. 成功 {ok} 条，失败 {fail} 条。")


Summarizing relevant: 100%|████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.30it/s]

Done. 成功 20 条，失败 0 条。





In [7]:
# 🧩 Cell 6 — 稳健版预览：只显示已写入摘要，兼容不同相关性列
import pandas as pd

# 想展示的列，存在才显示
want_cols = [
    "title","authors","published_utc",
    "relevant","maybe_relevant","relevance_hits",
    "summary_en_what","summary_en_novelty","summary_en_relevance",
    "pdf_url","arxiv_url"
]
cols = [c for c in want_cols if c in df_arxiv.columns]

# 只看已经写入三要点的行
if "summary_en_what" in df_arxiv.columns:
    filled = df_arxiv.dropna(subset=["summary_en_what"]).copy()
else:
    filled = df_arxiv.iloc[0:0].copy()

if filled.empty:
    print("还没有任何摘要被写入。请依次运行：Cell 1→2→3→4→5，然后再运行本单元。")
else:
    # 排序：优先按“相关性”与时间
    sort_keys, ascending = [], []
    if "relevant" in filled.columns:
        sort_keys.append("relevant"); ascending.append(False)
    elif "maybe_relevant" in filled.columns:
        sort_keys.append("maybe_relevant"); ascending.append(False)
    if "relevance_hits" in filled.columns:
        sort_keys.append("relevance_hits"); ascending.append(False)
    if "published_utc" in filled.columns:
        sort_keys.append("published_utc"); ascending.append(False)
    if sort_keys:
        filled = filled.sort_values(sort_keys, ascending=ascending)

    display(filled[cols].head(10))


Unnamed: 0,title,authors,published_utc,relevant,relevance_hits,summary_en_what,summary_en_novelty,summary_en_relevance,pdf_url,arxiv_url
4,Dual Band Video Thermography Near Ambient Cond...,"Sriram Narayanan, Mani Ramanagopal, Srinivasa ...",2025-09-14 16:21:29+00:00,True,6,The method separates the reflected and emitted...,The novelty of this approach lies in its abili...,"This research is highly relevant to ""thermal i...",http://arxiv.org/pdf/2509.11334v1,http://arxiv.org/abs/2509.11334v1
6,A novel IR-SRGAN assisted super-resolution eva...,"Pengfei Zhu, Hai Zhang, Stefano Sfarra, Fabriz...",2025-09-13 16:53:14+00:00,True,5,The study evaluates the use of infrared thermo...,The novelty of this research lies in the devel...,"The research is highly relevant to ""thermal im...",http://arxiv.org/pdf/2509.10894v1,http://arxiv.org/abs/2509.10894v1
7,Viewing heat through ice: an infrared camera m...,"Gennadiy O. Kovalov, Mykola O. Chyzh, Vyachesl...",2025-09-12 17:42:27+00:00,True,5,The study utilizes an infrared camera to monit...,The novelty of this research lies in the devel...,"The research is highly relevant to ""thermal im...",http://arxiv.org/pdf/2509.10434v1,http://arxiv.org/abs/2509.10434v1
5,Real-Time Super-Resolution Imaging System Base...,"Pengfei Zhu, Ziang Wei, Ahmad Osman, Clemente ...",2025-09-13 17:00:06+00:00,True,4,The system enhances the resolution of infrared...,The novelty of this approach lies in its use o...,"The system is highly relevant to ""thermal imag...",http://arxiv.org/pdf/2509.10902v1,http://arxiv.org/abs/2509.10902v1
1,Coupled Infrared Imaging and Multiphysics Mode...,"Vijay Kumar, Kaitlyn M. Mullin, Hyunggon Park,...",2025-09-16 00:54:23+00:00,True,3,The approach combines infrared imaging and mul...,The novelty of this method lies in its ability...,"This research is highly relevant to ""thermal i...",http://arxiv.org/pdf/2509.12545v1,http://arxiv.org/abs/2509.12545v1
8,Ordinality of Visible-Thermal Image Intensitie...,"Zeqing Leo Yuan, Mani Ramanagopal, Aswin C. Sa...",2025-09-12 16:29:02+00:00,True,3,The method decomposes an image into its intrin...,The novelty of this approach lies in its abili...,"The relevance of this method to ""thermal image...",http://arxiv.org/pdf/2509.10388v1,http://arxiv.org/abs/2509.10388v1
10,TUNI: Real-time RGB-T Semantic Segmentation wi...,"Xiaodong Guo, Tong Liu, Yike Li, Zi'ang Lin, Z...",2025-09-12 07:02:45+00:00,True,3,The proposed TUNI system enables real-time sem...,The novelty of TUNI lies in its ability to int...,"The TUNI system is highly relevant to ""thermal...",http://arxiv.org/pdf/2509.10005v1,http://arxiv.org/abs/2509.10005v1
14,On the Detection of Exorings in Reflected Ligh...,"Rachel Bowens-Rubin, Mary Anne Limbach, Sam Ho...",2025-09-08 18:15:14+00:00,True,3,The study explores the possibility of detectin...,The novelty of this research lies in its inves...,"The relevance to ""thermal image"" is Not releva...",http://arxiv.org/pdf/2509.07118v1,http://arxiv.org/abs/2509.07118v1
15,Error Signals for Overcoming the Laser Power L...,"Liu Tao, Pooyan Goodarzi, Jonathan W. Richardson",2025-09-08 16:09:42+00:00,True,3,The method involves using thermal imaging to c...,The novelty of this approach lies in its abili...,The relevance of this method to thermal images...,http://arxiv.org/pdf/2509.06840v1,http://arxiv.org/abs/2509.06840v1
0,Long-lived coronal loops in solar active regions,"N. Vasantharaju, H. Peter, L. P. Chitta, S. Ma...",2025-09-16 14:13:26+00:00,True,2,The study examines the properties of coronal l...,The research presents novel findings on the st...,The investigation of coronal loops is highly r...,http://arxiv.org/pdf/2509.13111v1,http://arxiv.org/abs/2509.13111v1


In [8]:
# 只导出“相关 & 已有三要点”的结果
export_df = df_arxiv.copy()
if KEEP_ONLY_RELEVANT:
    export_df = export_df[export_df.get("relevant", False)]
export_df = export_df.dropna(subset=["summary_en_what"])

csv_out = f"arxiv_{BASE}_{STAMP}_summary_en.csv"
md_out  = f"arxiv_{BASE}_{STAMP}_summary_en.md"

export_df.to_csv(csv_out, index=False, encoding="utf-8-sig")

lines = [f"# arXiv Search — {QUERY} (generated {STAMP})\n"]
for i, r in export_df.iterrows():
    lines.append(f"## {r.get('title','N/A')}\n")
    lines.append(f"- Authors: {r.get('authors','N/A')}")
    if isinstance(r.get("published_utc"), pd.Timestamp):
        lines.append(f"- Date (UTC): {r['published_utc'].strftime('%Y-%m-%d')}")
    lines.append(f"- Links: [PDF]({r.get('pdf_url','')}) | [arXiv]({r.get('arxiv_url','')})")
    lines.append(f"- **What**: {r.get('summary_en_what','N/A')}")
    lines.append(f"- **Novelty**: {r.get('summary_en_novelty','N/A')}")
    lines.append(f"- **Relevance to “{QUERY}”**: {r.get('summary_en_relevance','N/A')}")
    lines.append("")
with open(md_out, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("Saved:")
print(" -", csv_out)
print(" -", md_out)
print("导出条数:", len(export_df))


Saved:
 - arxiv_thermal_image_20250917_0322_summary_en.csv
 - arxiv_thermal_image_20250917_0322_summary_en.md
导出条数: 20
