# 包下载

In [None]:
!pip install pandas -q -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install arxiv -q -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install pydantic -q -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install langgraph langchain-community langchain-ollama -q -i https://pypi.tuna.tsinghua.edu.cn/simple

In [8]:
from pydantic import BaseModel,Field
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from typing import Annotated
from typing_extensions import TypedDict
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI
from pydantic.fields import FieldInfo
from langchain_ollama import ChatOllama
from langchain_community.llms import VLLMOpenAI
from typing import Optional,List


# LLM定义
此 Demo 会消耗大量 token，建议使用本地部署的 LLM。

In [2]:

import os

# Point the Ollama client at the local server. Presumably this must be set
# before the ChatOllama instances below are created, since the client resolves
# its host at construction time -- confirm against the installed ollama version.
os.environ["OLLAMA_HOST"] = "http://localhost:11450"

# Text-only model used by both the QA-generation and the structured-output chains.
llm = ChatOllama(
    model="qwen2.5",
    temperature=0,
    # `num_predict` is ChatOllama's cap on generated tokens. The previous
    # `max_token` keyword is not a ChatOllama field, so the limit was silently
    # ignored and the server default applied instead.
    num_predict=300000,
    # other params...
)

# Vision-capable model; defined here but not referenced by the cells shown in this notebook.
llm_multimodal = ChatOllama(
    model="llama3.2-vision:90b",
    temperature=0,
    # other params...
)

# 文档读取

In [3]:
# ArxivLoader is provided by langchain_community (installed above); the
# `langchain.document_loaders` path is only a deprecated re-export of it.
from langchain_community.document_loaders import ArxivLoader

# Look the paper up by its arXiv identifier; load_max_docs=1 limits the result
# list to a single document.
doc = ArxivLoader(query="1706.03762", load_max_docs=1).load()

In [4]:
# An empty result (unknown id, network failure handled upstream, ...) would
# otherwise surface as a bare "list index out of range"; keep the exception
# type but say what actually went wrong.
if not doc:
    raise IndexError("ArxivLoader returned no documents; cannot extract page_content")

# Full text of the first (and only requested) document.
doc_content = doc[0].page_content

In [None]:
from IPython.display import Markdown
# Wrap the paper text in a Markdown display object; as the last expression of
# the cell it becomes the cell's output, so the text can be inspected visually.
Markdown(doc_content)

# QA对构建

In [6]:
# Import the prompt classes from langchain_core directly: the top-level
# `from langchain import PromptTemplate` re-export is deprecated.
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate

# System prompt for the first pass: ask the model to read one text chunk and
# emit free-text "question: ... / answer: ..." lines.
template = """
As a professional reader, carefully read the provided text and complete the following tasks:
1.Generate as many questions as possible, covering different aspects such as facts, reasoning, word meaning, and structure to fully explore the understanding of the text.
2.Use direct quotations from the original text whenever possible in your answers (you may simplify or omit parts appropriately, but do not change the meaning).
3.Strictly follow the output format below, with one question-answer pair per line and no extra numbering or spacing:

question: <your question>
answer: <your answer>

Example:
question: How does the author describe the significance of the study?
answer: “This study is of critical importance to...”

Please follow the above instructions to complete the task.

"""

# Built with from_messages, the same factory the structured-output prompt in
# this notebook uses; `{paper}` is filled with one chunk of the document.
paper_segment_prompt_finally = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("user", "{paper}"),
    ]
)

# prompt -> chat model -> plain string containing the question/answer lines.
rag_chain = paper_segment_prompt_finally | llm | StrOutputParser()

# 结构输出

In [9]:
from langchain_core.output_parsers import PydanticOutputParser

# One extracted question/answer pair. Both fields are optional and default to
# None, so a pair the model filled in only partially still validates.
# (Kept as `#` comments rather than a docstring so the JSON schema embedded in
# the format instructions gains no extra model description.)
class QA(BaseModel):
    # No trailing comma after Field(...): a trailing comma turns the right-hand
    # side into a one-element tuple, so the default becomes `(FieldInfo,)`
    # instead of None and the description never reaches the schema.
    question: Optional[str] = Field(default=None, description="question in a QA pair")
    answer: Optional[str] = Field(default=None, description="answer in a QA pair")
        
# Top-level schema given to PydanticOutputParser: the model's JSON output is
# expected to be an object with a single `QA_Pair` list.
class QA_LIST(BaseModel):
    # Every question/answer pair parsed from one model response.
    QA_Pair : List[QA]

# Parser that validates the model's JSON output against the QA_LIST schema.
parse = PydanticOutputParser(pydantic_object=QA_LIST)

# Second-pass prompt: the system message carries the parser's format
# instructions, the human message carries the text to convert.
paper_segment_prompt_finally = ChatPromptTemplate.from_messages(
    [
        ("system", "Wrap the output in `json` tags\n{format_instructions}"),
        ("human", "text：{paper}"),
    ]
)
# Bind the format instructions once so callers only have to supply `paper`.
paper_segment_prompt_finally = paper_segment_prompt_finally.partial(
    format_instructions=parse.get_format_instructions()
)

# prompt -> chat model -> QA_LIST instance.
QA_segment_llm = paper_segment_prompt_finally | llm | parse



# 开始构建

In [10]:
# Number of characters of paper text sent to the model per request.
CHUNK_SIZE = 2000

# Collected [question, answer] rows across all chunks of the paper.
qa_list = []
for start in range(0, len(doc_content), CHUNK_SIZE):
    chunk = doc_content[start:start + CHUNK_SIZE]
    try:
        # Pass 1 produces free-text "question:/answer:" lines for the chunk;
        # pass 2 converts that text into a validated QA_LIST object.
        qa_text = rag_chain.invoke({"paper": chunk})
        qa_pair = QA_segment_llm.invoke(qa_text)
    except Exception as exc:
        # Best effort: one failed chunk (model error, unparsable output) must
        # not abort the whole run, but it should be visible. Catching Exception
        # rather than using a bare `except` keeps Ctrl-C / kernel interrupt
        # working during this long loop.
        print(f"Skipping chunk at offset {start}: {exc!r}")
        continue
    qa_list.extend([d.question, d.answer] for d in qa_pair.QA_Pair)

In [12]:
# Keep only rows where both the question and the answer are real strings.
# When the model omits a field, the parsed value is the field's non-string
# default (None or a tuple placeholder); checking only the question would let
# rows with a missing answer through to the CSV.
qa_list_fina = [
    qa
    for qa in qa_list
    if isinstance(qa[0], str) and isinstance(qa[1], str)
]

# 保存为CSV

In [15]:
import pandas as pd

# Tabulate the filtered [question, answer] rows under named columns.
df_qa = pd.DataFrame(
    data=qa_list_fina,
    columns=["question", "answer"],
)
# Write the table to qa_data.csv in the working directory (pandas defaults,
# so the row index is written as the first column).
df_qa.to_csv(path_or_buf="qa_data.csv")