In [1]:
import datetime
import time
import pymupdf as fitz  # PyMuPDF
import requests
import json
import numpy as np
from transformers import BertTokenizer
import os

  from .autonotebook import tqdm as notebook_tqdm


# 文本分割函数

In [3]:
# Initialise the BERT tokenizer that split_text uses to count tokens per line.
# NOTE(review): "bert-base-uncased" ships an English vocabulary while the PDFs
# processed below have Chinese file names — presumably the counts are only a
# rough proxy for the downstream model's token limit; confirm this choice.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def split_text(text, max_length, max_tokens, count_tokens=None):
    """Split ``text`` into chunks bounded by a character and a token budget.

    The text is consumed line by line (split on ``"\\n"``). Lines are
    accumulated into the current chunk while both budgets still hold; when
    adding the next line would exceed either budget, the chunk is closed and
    the line starts a new one. Every line is charged ``len(line) + 1``
    characters and ``tokens(line) + 1`` tokens, the ``+ 1`` accounting for
    the newline.

    Chunks that are empty after stripping whitespace are dropped, so an
    oversized first line, blank input or runs of blank lines never yield
    ``""`` entries.

    A single line that is larger than a budget on its own is still emitted as
    one chunk; lines are never cut in the middle.

    Args:
        text: The full text to split.
        max_length: Maximum number of characters per chunk.
        max_tokens: Maximum number of tokens per chunk.
        count_tokens: Optional callable ``line -> int`` returning the token
            count of one line. Defaults to the notebook-level BERT
            ``tokenizer`` (``encode`` without special tokens).

    Returns:
        list[str]: The non-empty, whitespace-stripped chunks in input order.
    """
    if count_tokens is None:
        def count_tokens(line):
            return len(tokenizer.encode(line, add_special_tokens=False))

    paragraphs = []
    current_lines = []
    current_chars = 0
    current_tokens = 0

    for line in text.split("\n"):
        line_chars = len(line) + 1    # +1 for the newline character
        line_tokens = count_tokens(line) + 1  # +1 for the newline token
        fits = (current_chars + line_chars <= max_length
                and current_tokens + line_tokens <= max_tokens)
        if fits:
            current_lines.append(line)
            current_chars += line_chars
            current_tokens += line_tokens
            continue

        # Close the current chunk, but never emit an empty one.
        finished = "\n".join(current_lines).strip()
        if finished:
            paragraphs.append(finished)
        current_lines = [line]
        current_chars = line_chars
        current_tokens = line_tokens

    last = "\n".join(current_lines).strip()
    if last:
        paragraphs.append(last)
    return paragraphs


# 从PDF中提取文本

In [8]:
import os

# Extract the plain text of a PDF file.
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    The document is opened as a context manager so the underlying file handle
    is released even if reading a page fails; previously it was never closed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages, in page order, joined without separator.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        ]
    return "".join(page_texts)


# Extract the text of each PDF and split it by character and token budgets.
pdf_files = ["csst0级数据定义.pdf", "csst1级数据定义.pdf", "CSST科学数据处理系统数据处理软件系统设计20231108.pdf"]
max_length = 18000
max_tokens = 4220
documents = []
for pdf in pdf_files:
    text = extract_text_from_pdf(pdf)
    paragraphs = split_text(text, max_length, max_tokens)
    documents.extend(paragraphs)

# Store every chunk in its own txt file and remember the file names.
txt_dir = 'txt_files'
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing the first chunk.
os.makedirs(txt_dir, exist_ok=True)
file_names = []
for idx, doc in enumerate(documents, start=1):
    file_name = os.path.join(txt_dir, f"text_file{idx}.txt")
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(doc)
    file_names.append(file_name)

print("文本文件已成功生成。")
print("生成的文件名列表：", file_names)

文本文件已成功生成。
生成的文件名列表： ['txt_files/text_file1.txt', 'txt_files/text_file2.txt', 'txt_files/text_file3.txt', 'txt_files/text_file4.txt', 'txt_files/text_file5.txt', 'txt_files/text_file6.txt', 'txt_files/text_file7.txt', 'txt_files/text_file8.txt', 'txt_files/text_file9.txt', 'txt_files/text_file10.txt', 'txt_files/text_file11.txt', 'txt_files/text_file12.txt', 'txt_files/text_file13.txt', 'txt_files/text_file14.txt', 'txt_files/text_file15.txt', 'txt_files/text_file16.txt', 'txt_files/text_file17.txt', 'txt_files/text_file18.txt', 'txt_files/text_file19.txt', 'txt_files/text_file20.txt', 'txt_files/text_file21.txt', 'txt_files/text_file22.txt', 'txt_files/text_file23.txt', 'txt_files/text_file24.txt', 'txt_files/text_file25.txt', 'txt_files/text_file26.txt', 'txt_files/text_file27.txt', 'txt_files/text_file28.txt', 'txt_files/text_file29.txt', 'txt_files/text_file30.txt', 'txt_files/text_file31.txt', 'txt_files/text_file32.txt', 'txt_files/text_file33.txt', 'txt_files/text_file34.txt', '

# 问题生成prompt

In [2]:
# System prompt for question generation. generate_question() substitutes the
# "{{此处替换成你的内容}}" placeholder with the text chunk before sending it.
# NOTE(review): the section labels jump from #05 to #07 and "#07" is used
# twice — presumably a numbering slip; confirm before renumbering, because the
# text is sent to the model verbatim.
prompt1 = '''
#01 你是一个问答对数据集处理专家。

#02 你的任务是根据我给出的内容，生成适合作为问答对数据集的问题。

#03 问题要尽量短，不要太长。

#04 一句话中只能有一个问题。

#05 最多生成15个问题。

#07 生成问题示例：

"""

"积分视场光谱仪是什么？"
"多通道成像仪的研制单位是哪个？"
介绍一下暗能量。

"""

#07 以下是我给出的内容：

"""

{{此处替换成你的内容}}

"""
'''

# 问答对生成prompt

In [3]:
# System prompt for question/answer pair generation. generate_qa() substitutes
# "{{此处替换成你上一步生成的问题}}" with the generated questions and
# "{{此处替换成你的内容}}" with the source text chunk.
# The example block under #04 is closed with its own """ fence so that every
# section of the prompt is delimited by a matching pair of fences.
prompt2 = '''
#01 你是一个问答对数据集处理专家。

#02 你的任务是根据我的问题和我给出的内容，生成对应的问答对。

#03 答案要全面，只使用我的信息，如果找不到答案，就回复从文档中找不到答案。

#04 你必须根据我的问答对示例格式来生成：

"""

{"content": "星冕仪模块三个主要观测目标是什么？", "summary": "星冕仪模块是三个主要观测目标是：1.近邻恒星高对比度成像普查。2.视向速度探测已知系外行星后随观测。3.恒星星周盘高对比度成像监测，并对恒星外星黄道尘强度分布进行定量分析。"}

{"content": "空间站多功能光学设施的任务是什么？", "summary": "空间站多功能光学设施2013年立项之初即明确以大规模天文巡天为主任务。空间站多功能光学设施是我国载人航天工程规划建设的大型空间天文望远镜，口径 2 米，兼具大视场和高像质的优异性能，并具备在轨维护升级的能力。"}

"""

#05 我的问题如下：

"""

{{此处替换成你上一步生成的问题}}

"""

#06 我的内容如下：

"""

{{此处替换成你的内容}}

"""
'''

# 文心一言配置

In [4]:
# Baidu ERNIE Bot (Wenxin) credentials.
# Secrets must not be committed with the notebook: they are read from the
# QIANFAN_AK / QIANFAN_SK environment variables instead. Any key that was ever
# stored in this file should be considered leaked and rotated.
API_KEY = os.environ.get("QIANFAN_AK", "")
SECRET_KEY = os.environ.get("QIANFAN_SK", "")

def get_access_token(api_key=None, secret_key=None):
    """Exchange the API key / secret key for an OAuth access token.

    Args:
        api_key: Optional API key; defaults to the module-level ``API_KEY``.
        secret_key: Optional secret key; defaults to the module-level
            ``SECRET_KEY``.

    Returns:
        str: The access token returned by the token endpoint.

    Raises:
        RuntimeError: If a credential is missing or the endpoint's response
            carries no ``access_token``. The previous implementation returned
            the literal string ``"None"`` in that case, which was then
            appended to the request URLs.
    """
    api_key = API_KEY if api_key is None else api_key
    secret_key = SECRET_KEY if secret_key is None else secret_key
    if not api_key or not secret_key:
        raise RuntimeError(
            "Missing credentials: set the QIANFAN_AK and QIANFAN_SK environment variables."
        )

    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {"grant_type": "client_credentials", "client_id": api_key, "client_secret": secret_key}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.post(url, params=params, timeout=30)
    try:
        payload = response.json()
    except ValueError:
        payload = {}
    token = payload.get("access_token") if isinstance(payload, dict) else None
    if not token:
        raise RuntimeError(
            f"Could not obtain access token (HTTP {response.status_code}): {response.text}"
        )
    return str(token)

# 问题生成函数

In [5]:
def generate_question(text_content, more=False):
    """Ask the chat-completions endpoint to generate questions about a text chunk.

    ``text_content`` is substituted into ``prompt1`` and sent as the system
    prompt; the user message asks for questions (or for as many questions as
    possible when ``more`` is true).

    Args:
        text_content: The source text the questions should be about.
        more: When true, request as many questions as possible.

    Returns:
        The ``result`` field of the response, or ``None`` when
        ``text_content`` is empty/``None``, the HTTP status is not 200, the
        body is not valid JSON, or the response has no ``result`` field.
    """
    # Nothing to ask about; also avoids str.replace() raising on None.
    if not text_content:
        print("generate_question: empty text_content, request skipped.")
        return None

    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + get_access_token()
    content = "生成适合作为问答对的问题"
    if more:
        content = "尽可能多生成适合作为问答对的问题"
    prompt = prompt1.replace("{{此处替换成你的内容}}", text_content)
    payload = json.dumps({
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "temperature": 0.95,
        "top_p": 0.8,
        "system": prompt
    })
    headers = {
        'Content-Type': 'application/json'
    }
    start_time = time.time()
    # Generous timeout: generation is slow, but the call must not hang forever.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    print("耗时", time.time() - start_time)

    # Check the status before parsing: error bodies are not guaranteed to be JSON.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.content)
        return None
    try:
        x = json.loads(response.text)
    except ValueError:
        print("Error: response body is not valid JSON")
        print(response.content)
        return None
    print(x)

    # A 200 response without a 'result' key is treated as a failure instead of
    # raising KeyError.
    result = x.get('result') if isinstance(x, dict) else None
    if result is None:
        print("Error: response contains no 'result' field")
    return result


# 问答对生成函数

In [6]:
def generate_qa(text_content, question_text=None):
    """Ask the chat-completions endpoint to answer questions from a text chunk.

    ``question_text`` and ``text_content`` are substituted into ``prompt2``
    and sent as the system prompt.

    Args:
        text_content: The source text the answers must be taken from.
        question_text: The questions to answer (typically the output of
            ``generate_question``).

    Returns:
        The ``result`` field of the response, or ``None`` when either input is
        empty/``None``, the HTTP status is not 200, the body is not valid
        JSON, or the response has no ``result`` field.
    """
    # The default question_text=None used to crash inside str.replace();
    # without both inputs there is nothing meaningful to request.
    if not text_content or not question_text:
        print("generate_qa: missing text_content or question_text, request skipped.")
        return None

    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + get_access_token()
    content = "拼成问答对"
    prompt = (
        prompt2
        .replace("{{此处替换成你上一步生成的问题}}", question_text)
        .replace("{{此处替换成你的内容}}", text_content)
    )
    payload = json.dumps({
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "temperature": 0.95,
        "top_p": 0.8,
        "system": prompt
    })
    headers = {
        'Content-Type': 'application/json'
    }
    start_time = time.time()
    # Generous timeout: generation is slow, but the call must not hang forever.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    print("耗时", time.time() - start_time)

    # Check the status before parsing: error bodies are not guaranteed to be JSON.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.content)
        return None
    try:
        x = json.loads(response.text)
    except ValueError:
        print("Error: response body is not valid JSON")
        print(response.content)
        return None
    print(x)

    # A 200 response without a 'result' key is treated as a failure instead of
    # raising KeyError.
    result = x.get('result') if isinstance(x, dict) else None
    if result is None:
        print("Error: response contains no 'result' field")
    return result


# 将生成的问答对写入.txt文件

In [7]:
# Directory that receives the generated question/answer files.
txt_dir = 'txt_files'
def write_to_file(content):
    """Write ``content`` to a new timestamped UTF-8 file inside ``txt_dir``.

    The file is named ``new_file_<YYYYmmddHHMMSS>.txt``. If a file with that
    name already exists (two calls within the same second), a numeric suffix
    ``_1``, ``_2``, ... is appended so earlier results are never overwritten.
    The target directory is created when missing.

    Args:
        content: The text to write. ``None`` is skipped with a message.

    Returns:
        The path of the file that was written, or ``None`` if nothing was
        written.
    """
    if content is None:
        print("write_to_file: content is None, nothing written.")
        return None

    os.makedirs(txt_dir, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    base_name = f"new_file_{timestamp}"
    file_name = os.path.join(txt_dir, f"{base_name}.txt")
    # The timestamp only has second resolution; avoid clobbering a file that
    # was written earlier in the same second.
    suffix = 1
    while os.path.exists(file_name):
        file_name = os.path.join(txt_dir, f"{base_name}_{suffix}.txt")
        suffix += 1

    with open(file_name, "w", encoding="utf-8") as file:
        file.write(content)
    # Report the real file name rather than a fixed placeholder.
    print(f"File '{file_name}' has been created and written.")
    return file_name

# 读取 pdf生成的txt文件

In [9]:
def read_file(file_name):
    """Return the UTF-8 text stored in ``file_name``.

    When the file does not exist, a message is printed and ``None`` is
    returned instead of raising.
    """
    try:
        handle = open(file_name, "r", encoding='utf-8')
    except FileNotFoundError:
        print(f"File '{file_name}' not found.")
        return None
    with handle:
        return handle.read()

In [9]:
# Build the chunk file names 'txt_files/text_file1.txt' through
# 'txt_files/text_file69.txt'.
file_names = [
    f"txt_files/text_file{chunk_no}.txt"
    for chunk_no in range(1, 69 + 1)
]

# Print the list so it can be checked by eye.
print(file_names)

['txt_files/text_file1.txt', 'txt_files/text_file2.txt', 'txt_files/text_file3.txt', 'txt_files/text_file4.txt', 'txt_files/text_file5.txt', 'txt_files/text_file6.txt', 'txt_files/text_file7.txt', 'txt_files/text_file8.txt', 'txt_files/text_file9.txt', 'txt_files/text_file10.txt', 'txt_files/text_file11.txt', 'txt_files/text_file12.txt', 'txt_files/text_file13.txt', 'txt_files/text_file14.txt', 'txt_files/text_file15.txt', 'txt_files/text_file16.txt', 'txt_files/text_file17.txt', 'txt_files/text_file18.txt', 'txt_files/text_file19.txt', 'txt_files/text_file20.txt', 'txt_files/text_file21.txt', 'txt_files/text_file22.txt', 'txt_files/text_file23.txt', 'txt_files/text_file24.txt', 'txt_files/text_file25.txt', 'txt_files/text_file26.txt', 'txt_files/text_file27.txt', 'txt_files/text_file28.txt', 'txt_files/text_file29.txt', 'txt_files/text_file30.txt', 'txt_files/text_file31.txt', 'txt_files/text_file32.txt', 'txt_files/text_file33.txt', 'txt_files/text_file34.txt', 'txt_files/text_file35

# 主程序

In [10]:
txt_dir = 'txt_files'
# Chunks 18..69 (same entries as the former literal list).
# NOTE(review): presumably this resumes a run whose earlier chunks were already
# processed — confirm, or widen the range to reprocess everything.
file_names = [f"txt_files/text_file{i}.txt" for i in range(18, 70)]
for file in file_names:
    text_content = read_file(file)
    print(file)
    print ('text_content\n', text_content)
    # read_file returns None for a missing file; skip instead of crashing.
    if not text_content:
        print(f"Skipping '{file}': no text content.")
        continue

    question_text = generate_question(text_content=text_content, more=False)
    print('question_text\n', question_text)
    # The API helpers return None on failure; one bad chunk must not abort the run.
    if not question_text:
        print(f"Skipping '{file}': question generation failed.")
        continue

    qa_text = generate_qa(text_content=text_content, question_text=question_text)
    print('qa_text\n', qa_text)
    if not qa_text:
        print(f"Skipping '{file}': question/answer generation failed.")
        continue

    write_to_file(qa_text)

txt_files/text_file18.txt
text_content
 DEC_PNT0
47.19525833333330
float DEC of the pointing
(degrees) at
CABSTART
EXPEND
24597671.00001160
float exposure end time
(MJD)
CABEND
24597671.00001160
float nearst absolute time
after exposure end
(MJD)
SUNANGL1
50.0
float angle between sun and
optical axis at
CABEND
MOONANG1 30.0
float angle between moon
and optical axis at
CABEND
TEL_ALT1
20.0
float angle between
groud_plane and
optical axis at C
POS_ANG1
20.0
float angle bwt y axis and
the North Pole at
88
CABEND
POSI1_X
5152.43457735
float the orbital position in X
at CABEND
POSI1_Y
-850.64604263
float the orbital position in Y
at CABEND
POSI1_Z
4297.77083803
float the orbital position in Z
at CABEND
VELO1_X
2.66632246
float The orbital velocity in
X at CABEND
VELO1_Y
6.97307976
float The orbital velocity in
Y at CABEND
VELO1_Z
-1.80175713
float The orbital velocity in
Z at CABEND
EULER1_1
float Euler angle 1
at CABEND
EULER1_2
float Euler angle 2
at CABEND
EULER1_3
float Euler angle 3
at