In [46]:
import arxiv
from datetime import date

def retrieve_papers_from_arxiv(keywords, start_date, end_date, max_results=10):
    """
    Search arXiv for papers matching any keyword and published in a date window.

    The keywords are OR-ed into a single query. Results are requested newest
    first (by submission date) and then filtered locally on publication date.

    :param keywords: list of keyword strings; an empty list yields ``[]``
        without contacting arXiv
    :param start_date: first accepted publication date (``datetime.date``,
        inclusive)
    :param end_date: last accepted publication date (``datetime.date``,
        inclusive)
    :param max_results: maximum number of entries requested from arXiv
        *before* the date filter is applied (default 10)
    :return: list of dicts with the keys ``"keywords"`` (the keywords found in
        the title or summary), ``"published"``, ``"html_link"``, ``"summary"``,
        ``"authors"``, ``"title"`` and ``"article_id"``
    """
    # An empty keyword list would produce an empty query string; there is
    # nothing meaningful to search for, so skip the network round trip.
    if not keywords:
        return []

    query = " OR ".join(keywords)

    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )

    results = []
    for result in client.results(search):
        published_on = result.published.date()
        if published_on > end_date:
            # Too recent; older entries further down may still qualify.
            continue
        if published_on < start_date:
            # Results arrive newest first, so everything after this entry is
            # older still and cannot fall inside the window.
            break

        # Lowercase once per paper. The newline separator prevents a keyword
        # from matching across the title/summary boundary.
        searchable_text = f"{result.title}\n{result.summary}".lower()
        matched_keywords = [kw for kw in keywords if kw.lower() in searchable_text]

        results.append({
            "keywords": matched_keywords,
            "published": result.published,
            "html_link": result.entry_id,
            "summary": result.summary,
            "authors": [author.name for author in result.authors],
            "title": result.title,
            # Fall back to the last URL segment when the result object does
            # not offer get_short_id().
            "article_id": (
                result.get_short_id()
                if hasattr(result, "get_short_id")
                else result.entry_id.split('/')[-1]
            ),
        })

    return results



In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

def translate_content(abstracts):
    """
    Translate the English abstract of each paper into Chinese with an LLM.

    The API key is read from the ``DASHSCOPE_API_KEY`` environment variable so
    that no credential is stored in the source.

    :param abstracts: list of dicts, each providing ``"summary"`` (the English
        abstract) and ``"article_id"``
    :return: list of dicts ``{"summary_cn": <translation>, "article_id": <id>}``
        in the same order as the input; ``[]`` for empty input
    :raises RuntimeError: if ``DASHSCOPE_API_KEY`` is not set
    """
    import os  # local import: only this function needs it

    # Nothing to translate: avoid building a client (and needing a key).
    if not abstracts:
        return []

    api_key = os.environ.get("DASHSCOPE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "DASHSCOPE_API_KEY environment variable is not set; "
            "it is required to call the translation model."
        )

    llm = ChatOpenAI(
        api_key=api_key,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        model_name="qwen3-235b-a22b",
        streaming=True,
        extra_body={"enable_thinking": False},
        temperature=0,
    )

    # The system prompt is identical for every abstract, so build it once.
    system_message = SystemMessage(
        content="你是一个英译中的翻译助手，负责将英文内容翻译成中文。请确保翻译准确，并保持原文的语气和风格。"
    )

    translated_list = []
    for abstract in abstracts:
        abstract_text = abstract["summary"].strip()
        messages = [
            system_message,
            HumanMessage(content=f"请将以下英文内容翻译成中文：\n\n{abstract_text}"),
        ]
        feedback = llm.invoke(messages)
        translated_list.append({
            "summary_cn": feedback.content,
            "article_id": abstract["article_id"],
        })
    return translated_list




In [48]:
import pymysql

def save_to_mysql():
    """Placeholder for a MySQL writer; for now it only prints "null"."""
    placeholder = "null"
    print(placeholder)

ModuleNotFoundError: No module named 'pymysql'

In [49]:
import sqlite3

def save_to_sqlite(results, translates, db_path="arxiv_papers.db"):
    """
    Store paper records and their translations in a SQLite database.

    The ``papers`` table is created if it does not exist. Rows are written
    with ``INSERT OR REPLACE``, so a paper whose ``article_id`` is already
    stored is overwritten. All rows are written in one transaction: either
    every row is saved or none is.

    :param results: list of paper dicts with the keys ``"article_id"``,
        ``"title"``, ``"authors"`` (list of str), ``"keywords"`` (list of str),
        ``"published"``, ``"html_link"`` and ``"summary"``
    :param translates: list of dicts with ``"article_id"`` and ``"summary_cn"``;
        papers without a matching entry get an empty ``summary_cn``
    :param db_path: path of the SQLite database file
    :return: ``"Succeeded!"`` on success, otherwise ``("Failure", message)``
    """
    conn = None
    try:
        summary_cn_map = {item["article_id"]: item["summary_cn"] for item in translates}

        # Build every row before opening the database, so malformed input
        # fails before anything is written.
        rows = []
        for item in results:
            article_id = item["article_id"]
            rows.append((
                article_id,
                item["title"],
                ", ".join(item["authors"]),
                ", ".join(item["keywords"]),
                str(item["published"]),
                item["html_link"],
                item["summary"],
                summary_cn_map.get(article_id, ""),
            ))

        conn = sqlite3.connect(db_path)
        # The connection as a context manager commits on success and rolls
        # back on error. It does NOT close the connection; see finally below.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS papers (
                    article_id TEXT PRIMARY KEY,
                    title TEXT,
                    authors TEXT,
                    keywords TEXT,
                    published TEXT,
                    html_link TEXT,
                    summary TEXT,
                    summary_cn TEXT
                )
            """)
            conn.executemany("""
                INSERT OR REPLACE INTO papers
                (article_id, title, authors, keywords, published, html_link, summary, summary_cn)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, rows)
        return "Succeeded!"
    except Exception as e:
        # Callers expect a status value rather than an exception.
        return ("Failure", str(e))
    finally:
        # Release the file handle on the failure path as well.
        if conn is not None:
            conn.close()

In [52]:
from datetime import date

# Keywords to search for; they are OR-ed together, so a paper matching any one
# of them is returned.
keywords = [
    "RAG", "agent", "LLM", "multimodal", "embedding", "rerank"
]

# Publication date window, both ends inclusive (here 2025-07-03 to 2025-07-07).
start_date = date(2025, 7, 3)
end_date = date(2025, 7, 7)
# print(start_date)
# print(end_date)

# Step 1: fetch the matching papers from arXiv.
results_list = retrieve_papers_from_arxiv(keywords, start_date, end_date)

# Step 2: translate each paper's abstract into Chinese.
translates_list = translate_content(results_list)

# Step 3: persist papers and translations; the call returns "Succeeded!" or
# ("Failure", message).
result = save_to_sqlite(results_list,translates_list)

print(result)
# print(results_list)
# print(translates_list)

AttributeError: 'dict' object has no attribute 'summary'