# 将不同文件夹下的pdf文件合并到一个文件夹下

In [1]:
import os
import shutil

def gather_pdfs(root_folder, destination_folder):
    """Copy every PDF found under ``root_folder`` into one flat folder.

    The tree is walked recursively and each file whose name ends with
    ``.pdf`` (case-insensitive) is copied, with metadata, into
    ``destination_folder``. One ``Copied: <source> to <destination>`` line is
    printed per file.

    Args:
        root_folder: Directory whose tree is searched for PDF files.
        destination_folder: Directory that receives the copies. It is created
            if it does not exist. It may live inside ``root_folder``; it is
            then excluded from the search.

    Notes:
        * PDFs from different subfolders that share a file name no longer
          overwrite each other: the second and later ones are stored as
          ``<stem>_1<ext>``, ``<stem>_2<ext>``, ...
        * Directories and files are visited in sorted order, so the suffix a
          colliding file receives is the same on every run.
        * Files already present in the destination from an earlier run are
          overwritten, which keeps re-runs idempotent.
    """
    os.makedirs(destination_folder, exist_ok=True)
    destination_real = os.path.realpath(destination_folder)

    # Destination file names handed out during this run (normalised for
    # case-insensitive platforms) so that collisions can be detected.
    used_names = set()

    for dirpath, dirnames, filenames in os.walk(root_folder):
        # Prune the destination from the walk and fix the traversal order.
        # Without pruning, a destination nested in the root would be visited
        # and copying a file onto itself raises shutil.SameFileError.
        dirnames[:] = sorted(
            d for d in dirnames
            if os.path.realpath(os.path.join(dirpath, d)) != destination_real
        )
        if os.path.realpath(dirpath) == destination_real:
            # The root itself is the destination: its own files are already
            # in place, but its subfolders still need to be searched.
            continue

        for filename in sorted(filenames):
            if not filename.lower().endswith('.pdf'):
                continue

            source_file = os.path.join(dirpath, filename)

            # Pick a destination name that was not used earlier in this run.
            stem, extension = os.path.splitext(filename)
            target_name = filename
            counter = 0
            while os.path.normcase(target_name) in used_names:
                counter += 1
                target_name = f'{stem}_{counter}{extension}'
            used_names.add(os.path.normcase(target_name))

            destination_file = os.path.join(destination_folder, target_name)
            shutil.copy2(source_file, destination_file)
            print(f'Copied: {source_file} to {destination_file}')

# Usage: choose the tree to scan and the folder that collects the PDFs.
root_folder = r'/home/sunjinf/github_projet/nature_data/data_origin/origin_standrad'  # replace with your root folder path
destination_folder = r'/home/sunjinf/github_projet/nature_data/data_origin/origin_standard'  # replace with your destination folder path

gather_pdfs(root_folder=root_folder, destination_folder=destination_folder)


Copied: /home/sunjinf/github_projet/nature_data/data_origin/origin_standrad/(GBT 30600-2022)高标准农田建设 通则.pdf to /home/sunjinf/github_projet/nature_data/data_origin/origin_standard/(GBT 30600-2022)高标准农田建设 通则.pdf
Copied: /home/sunjinf/github_projet/nature_data/data_origin/origin_standrad/2000国家大地坐标系转换成果质量检查与验收(报批稿).pdf to /home/sunjinf/github_projet/nature_data/data_origin/origin_standard/2000国家大地坐标系转换成果质量检查与验收(报批稿).pdf
Copied: /home/sunjinf/github_projet/nature_data/data_origin/origin_standrad/2019海洋生态资产评估技术导则（征求意见稿）.pdf to /home/sunjinf/github_projet/nature_data/data_origin/origin_standard/2019海洋生态资产评估技术导则（征求意见稿）.pdf
Copied: /home/sunjinf/github_projet/nature_data/data_origin/origin_standrad/2021大地天文测量规范（征求意见稿）.pdf to /home/sunjinf/github_projet/nature_data/data_origin/origin_standard/2021大地天文测量规范（征求意见稿）.pdf
Copied: /home/sunjinf/github_projet/nature_data/data_origin/origin_standrad/2022地球物理勘查图图式图例及用色标准（报批稿）.pdf to /home/sunjinf/github_projet/nature_data/data_origin/origin_standard/2022地

# 将不同文件夹下的md文件合并成一个jsonl文件

In [1]:
import os
import json
def process_markdown_files(input_folder: str, output_file: str) -> None:
    """Append every Markdown file under ``input_folder`` to a JSON Lines file.

    The tree is walked recursively. Each file whose name ends with ``.md``
    (case-insensitive) becomes one line of ``output_file``, holding a JSON
    object with two keys:

    * ``"title"``   - name of the folder that directly contains the file
    * ``"content"`` - full text of the file, read as UTF-8

    Args:
        input_folder: Directory whose tree is searched for Markdown files.
        output_file: Path of the JSON Lines file. Records are appended, so an
            existing file keeps its previous lines. Nothing is created when
            no Markdown file is found.
    """
    # First collect (title, path) pairs in a deterministic order so that the
    # output file is opened at most once and only when there is work to do.
    markdown_files = []
    for subdir, dirnames, files in os.walk(input_folder):
        dirnames.sort()  # in-place sort fixes the order os.walk descends in
        # normpath drops a trailing separator; without it the title of files
        # located directly in "some/folder/" would be the empty string.
        folder_name = os.path.basename(os.path.normpath(subdir))
        for file in sorted(files):
            if file.lower().endswith(".md"):
                markdown_files.append((folder_name, os.path.join(subdir, file)))

    if not markdown_files:
        return

    with open(output_file, 'a', encoding='utf-8') as out:
        for folder_name, file_path in markdown_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            result = {"title": folder_name, "content": content}
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            out.write(json.dumps(result, ensure_ascii=False) + '\n')

In [2]:
# Folder tree holding the Markdown files, and the JSON Lines file that
# receives one record per Markdown file.
input_folder = r"/share_data/data/nature_data/out_papers_1"
output_file = r"/share_data/data/nature_data/out_papers_1_text_ori.jsonl"

process_markdown_files(input_folder=input_folder, output_file=output_file)