In [1]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PDFToTextConverter, PreProcessor, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

  from .autonotebook import tqdm as notebook_tqdm


# 提取PDF文本 & 预处理文档

In [17]:
import fitz

# 提取 PDF 文本的函数
def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# 提取PDF文本
pdf_path = "CSST科学白皮书_v1.2.pdf"
text = extract_text_from_pdf(pdf_path)
documents = [{"content": text, "meta": {"name": "CSST科学白皮书_v1.2.pdf"}}]

# 初始化 InMemoryDocumentStore
document_store = InMemoryDocumentStore(use_bm25=True)

#预处理文档
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    split_by="word",
    split_length=500,
    split_respect_sentence_boundary=True,
    split_overlap=50
)
documents = preprocessor.process(documents)



Preprocessing:   0%|                                                                           | 0/1 [00:00<?, ?docs/s][AWe found one or more sentences whose split count is higher than the split length.

Preprocessing: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.47docs/s][A


# 文档写入，创建pipeline

In [19]:
from haystack.nodes import BM25Retriever
# 将文档写入 DocumentStore
document_store.write_documents(documents)
retriever = BM25Retriever(document_store=document_store)
# 初始化 FARMReader，使用支持中文的预训练模型
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# 创建 ExtractiveQAPipeline
pipeline = ExtractiveQAPipeline(reader, retriever)

Updating BM25 representation...: 100%|████████████████████████████████████████████| 52/52 [00:00<00:00, 2482.74 docs/s]


# 定义问题并生成答案

In [20]:
# 定义问题并生成答案
questions = [
    "积分视场光谱仪是什么？",
    "多通道成像仪的研制单位是哪个？",
    "这篇文章发表于什么时候？",
    "这篇文章是哪个团队发表的？"
]

# 生成问答对
for question in questions:
    prediction = pipeline.run(
        query=question,
        params={
            "Retriever": {"top_k": 10},
            "Reader": {"top_k": 5}
        })
    answers = prediction["answers"]
    if answers:
        answer = answers[0].answer
        print(f"Question: {question}\nAnswer: {answer}\n")
    else:
        print(f"Question: {question}\nAnswer: No answer found.\n")

Inferencing Samples: 100%|███████████████████████████████████████████████████████| 13/13 [05:13<00:00, 24.10s/ Batches]


Question: 积分视场光谱仪是什么？
Answer: 如



Inferencing Samples: 100%|███████████████████████████████████████████████████████| 13/13 [05:08<00:00, 23.76s/ Batches]


Question: 多通道成像仪的研制单位是哪个？
Answer: 其中



Inferencing Samples: 100%|███████████████████████████████████████████████████████| 13/13 [05:13<00:00, 24.13s/ Batches]

Question: 这篇文章发表于什么时候？这篇文章是哪个团队发表的？
Answer: CSST






In [1]:
import torch
import torch.nn.functional as F

x1 = torch.randn(2, 3, 4)  #形状(batch_size, seq_len1, feature_dim)
x2 = torch.randn(2, 5, 4)
print("x1:", x1)
print("x2:", x2)

x1: tensor([[[-0.9459,  0.5139, -0.8711,  0.2664],
         [-0.2224,  0.0059,  1.3308, -0.1072],
         [-1.5125, -0.0313, -0.6194,  0.7078]],

        [[ 1.1933,  0.5164,  1.4111,  1.2338],
         [-0.3656, -1.7279, -1.1973,  0.1795],
         [-0.4460, -0.0504,  0.8660,  1.1754]]])
x2: tensor([[[ 0.9100,  0.5204, -0.5425,  1.0211],
         [-0.8083,  1.4383,  2.5470,  1.6260],
         [ 0.2957, -0.9842,  0.1589,  0.6096],
         [ 0.3994,  1.3025, -0.2180,  0.2563],
         [ 0.5113, -0.6748, -0.4893,  0.3817]],

        [[ 0.0955, -3.2458,  0.3186,  1.1472],
         [ 0.7497, -0.5240,  0.6566,  0.5983],
         [ 0.8276,  0.6803,  0.3474, -0.9442],
         [ 1.2801,  0.1947,  0.2949, -2.2902],
         [ 1.4130,  0.1630,  0.4990, -2.4176]]])


In [2]:
raw_weights = torch.bmm(x1, x2.transpose(1,2))
print("原始权重：", raw_weights)

原始权重： tensor([[[ 0.1513, -0.2819, -0.7615,  0.5497, -0.3026],
         [-1.0308,  3.4035,  0.0744, -0.3987, -0.8098],
         [-0.3338,  0.7507, -0.0834, -0.3285, -0.1790]],

        [[ 0.3030,  2.2887,  0.6642, -0.7813, -0.5085],
         [ 5.3980, -0.0473, -2.0636, -1.5688, -1.8297],
         [ 1.7452,  0.9638, -1.2123, -3.0172, -3.0479]]])


In [3]:
attn_weights = F.softmax(raw_weights, dim=-1)
print("归一化后的注意力权重：", attn_weights)

归一化后的注意力权重： tensor([[[2.3955e-01, 1.5534e-01, 9.6157e-02, 3.5679e-01, 1.5216e-01],
         [1.0936e-02, 9.2182e-01, 3.3026e-02, 2.0577e-02, 1.3641e-02],
         [1.3486e-01, 3.9891e-01, 1.7323e-01, 1.3558e-01, 1.5743e-01]],

        [[9.5230e-02, 6.9361e-01, 1.3666e-01, 3.2200e-02, 4.2299e-02],
         [9.9348e-01, 4.2882e-03, 5.7098e-04, 9.3647e-04, 7.2147e-04],
         [6.5506e-01, 2.9988e-01, 3.4030e-02, 5.5975e-03, 5.4282e-03]]])


In [4]:
attn_output = torch.bmm(attn_weights, x2)
print("注意力输出：", attn_output)

注意力输出： tensor([[[ 0.3412,  0.6155,  0.1287,  0.7053],
         [-0.7102,  1.3166,  2.3360,  1.5406],
         [-0.0138,  0.5438,  0.8638,  0.9868]],

        [[ 0.7432, -0.5664,  0.5638,  0.2192],
         [ 0.1008, -3.2262,  0.3201,  1.1379],
         [ 0.3304, -2.2582,  0.4218,  0.8728]]])
