In [11]:
import os
import re

import pytesseract
from pdf2image import convert_from_path
from PIL import Image


In [1]:
pip install pymupdf


Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [12]:
import json
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os

def pdf_to_text_and_images(pdf_path, output_dir, poppler_path=None, dpi=300, lang="eng"):
    """
    Render each page of a PDF to a PNG image, OCR it, and save the results.

    The page images are written to ``<output_dir>/<pdf name>/`` and the OCR
    text of all pages to ``<output_dir>/<pdf name>.json`` in the form
    ``{"pages": [{"page_number": ..., "text": ..., "image_path": ...}]}``.

    :param pdf_path: path of the input PDF file
    :param output_dir: directory that receives the JSON file and the image sub-folder
    :param poppler_path: Poppler location, forwarded to ``convert_from_path``
    :param dpi: resolution forwarded to ``convert_from_path`` (default 300)
    :param lang: language code forwarded to ``pytesseract.image_to_string`` (default "eng")
    """
    # Name the image sub-folder and the JSON file after the PDF (extension dropped).
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    images_folder = os.path.join(output_dir, base_name)
    json_output_path = os.path.join(output_dir, f"{base_name}.json")

    # makedirs also creates output_dir as an intermediate directory; exist_ok
    # makes re-running the cell safe without a separate existence check.
    os.makedirs(images_folder, exist_ok=True)

    # Rasterise the PDF into one image per page.
    images = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    pdf_data = {"pages": []}
    for page_number, image in enumerate(images, start=1):
        # Save the rendered page inside the per-PDF image folder.
        image_path = os.path.join(images_folder, f"{base_name}_page_{page_number}.png")
        image.save(image_path, "PNG")

        # OCR the in-memory page image with Tesseract.
        text = pytesseract.image_to_string(image, lang=lang)

        pdf_data["pages"].append({
            "page_number": page_number,
            "text": text,
            "image_path": image_path
        })

    # ensure_ascii=False keeps non-ASCII OCR output readable in the JSON file.
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(pdf_data, json_file, ensure_ascii=False, indent=4)

    print(f"JSON 数据已保存到: {json_output_path}")
    print(f"图像已保存到文件夹: {images_folder}")


In [13]:
def remove_word_from_json_file(original_json_path, word_to_remove):
    """
    Delete every occurrence of a word from the page texts of a JSON file.

    The result is written next to the input as ``<original name>-<word>.json``.
    A word is removed only when it forms a complete whitespace-delimited token
    (compared case-insensitively), so ``"financial."`` is left alone when
    removing ``"financial"``. Line breaks and the remaining spacing of the text
    are preserved; only the word and one adjacent run of spaces/tabs are dropped.

    :param original_json_path: path of the source JSON file (must contain a "pages" list)
    :param word_to_remove: the word to delete; an empty/blank word leaves the text unchanged
    """
    # Derive the output path from the input file name.
    base_name = os.path.basename(original_json_path).rsplit(".", 1)[0]
    output_dir = os.path.dirname(original_json_path)
    new_file_name = f"{base_name}-{word_to_remove}.json"
    new_file_path = os.path.join(output_dir, new_file_name)

    with open(original_json_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # A blank word would turn the pattern below into "strip whitespace", so skip it.
    if word_to_remove.strip():
        word = re.escape(word_to_remove)
        # First alternative: the word together with the spaces/tabs in front of it
        # (keeps a following newline intact). Second alternative: the word at the
        # start of the text or of a line, together with the spaces/tabs after it.
        pattern = re.compile(
            rf"[ \t]+{word}(?=\s|$)|(?<!\S){word}(?:[ \t]+|(?=\s|$))",
            re.IGNORECASE,
        )
        for page in json_data["pages"]:
            page["text"] = pattern.sub("", page["text"])

    with open(new_file_path, "w", encoding="utf-8") as new_file:
        json.dump(json_data, new_file, ensure_ascii=False, indent=4)

    print(f"新 JSON 数据已保存到: {new_file_path}")


In [14]:
# Run the OCR pipeline on the sample PDF; the JSON file and the page images
# are written under ./output.
pdf_path = "2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.pdf"
output_folder = "./output"
pdf_to_text_and_images(pdf_path, output_folder)


JSON 数据已保存到: ./output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json
图像已保存到文件夹: ./output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2


In [15]:
# Create a variant of the OCR JSON with the chosen word removed from every page.
json_path = "./output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"
word_to_remove = "financial"  # compared case-insensitively against each token

remove_word_from_json_file(json_path, word_to_remove)


新 JSON 数据已保存到: ./output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-financial.json


In [18]:
def arabic_to_roman(num):
    """
    Spell an Arabic number as a Roman numeral.

    :param num: number to convert; only values in 1-3999 are convertible
    :return: the Roman numeral string, or ``str(num)`` when num is outside 1-3999
    """
    if not 0 < num < 4000:
        # Out of the representable range: hand the number back as text.
        return str(num)

    # Symbols ordered from largest to smallest, subtractive pairs included.
    symbols = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
    )
    pieces = []
    remaining = num
    for magnitude, glyph in symbols:
        # Greedily emit the current symbol for as long as it still fits.
        while remaining >= magnitude:
            pieces.append(glyph)
            remaining -= magnitude
    return "".join(pieces)

import re
import json

def replace_numbers_with_roman(json_path):
    """
    Replace the Arabic numbers in a JSON file's page texts with Roman numerals.

    Every run of digits delimited by word boundaries is passed to
    ``arabic_to_roman``. Numbers that function cannot express are left exactly
    as they appear in the text, so digit groups such as the "000" in
    "1,000,000" or zero-padded values keep their original spelling. The result
    is saved next to the input as ``<original name>-roman.json``.

    :param json_path: path of the source JSON file (must contain a "pages" list)
    """
    with open(json_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    number_pattern = re.compile(r'\b\d+\b')

    def convert_match_to_roman(match):
        token = match.group()
        number = int(token)
        converted = arabic_to_roman(number)
        # arabic_to_roman signals "not convertible" by returning str(number);
        # keep the original token then instead of its re-formatted integer,
        # which would drop leading zeros ("000" -> "0").
        return token if converted == str(number) else converted

    # Rebuild each page with the converted text; other fields are copied as-is.
    updated_data = {"pages": []}
    for page in data["pages"]:
        updated_data["pages"].append({
            "page_number": page["page_number"],
            "text": number_pattern.sub(convert_match_to_roman, page["text"]),
            "image_path": page["image_path"]
        })

    # Write the result beside the input file.
    base_name = os.path.basename(json_path).rsplit(".", 1)[0]
    output_dir = os.path.dirname(json_path)
    new_file_name = f"{base_name}-roman.json"
    new_file_path = os.path.join(output_dir, new_file_name)

    with open(new_file_path, "w", encoding="utf-8") as new_json_file:
        json.dump(updated_data, new_json_file, ensure_ascii=False, indent=4)

    print(f"已将数字替换为罗马数字，新文件保存到: {new_file_path}")


In [19]:
# Create a variant of the OCR JSON in which numbers are written as Roman numerals.
json_file_path = "output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"
replace_numbers_with_roman(json_file_path)


已将数字替换为罗马数字，新文件保存到: output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-roman.json


In [20]:
import random
import string
import json

def remove_random_punctuation_from_text(text):
    """
    Drop one randomly chosen punctuation character from a string.

    Punctuation means any character contained in ``string.punctuation``.

    :param text: the input string
    :return: the string with one punctuation character removed, or the
             unchanged string when it contains no punctuation
    """
    # Positions of every punctuation character in the text.
    candidates = [pos for pos, ch in enumerate(text) if ch in string.punctuation]

    if len(candidates) == 0:
        return text

    # Pick one position uniformly at random and cut that character out.
    drop_at = random.choice(candidates)
    return text[:drop_at] + text[drop_at + 1:]

def remove_random_punctuation_from_json(json_path):
    """
    Remove one random punctuation mark from the text of every page of a JSON
    file and save the result as ``<original name>-random-punctuation-removed.json``
    in the same directory.

    :param json_path: path of the source JSON file (must contain a "pages" list)
    """
    with open(json_path, "r", encoding="utf-8") as source:
        document = json.load(source)

    # Build the perturbed pages; page number and image path are carried over.
    rewritten = {"pages": []}
    for entry in document["pages"]:
        perturbed = remove_random_punctuation_from_text(entry["text"])
        rewritten["pages"].append({
            "page_number": entry["page_number"],
            "text": perturbed,
            "image_path": entry["image_path"],
        })

    # The output file lives next to the input file.
    base_name = os.path.basename(json_path).rsplit(".", 1)[0]
    new_file_path = os.path.join(
        os.path.dirname(json_path),
        f"{base_name}-random-punctuation-removed.json",
    )

    with open(new_file_path, "w", encoding="utf-8") as sink:
        json.dump(rewritten, sink, ensure_ascii=False, indent=4)

    print(f"已随机删除标点符号，新文件保存到: {new_file_path}")


In [21]:
# Seed the RNG so that re-running the notebook removes the same punctuation
# marks and reproduces the same output file.
random.seed(42)
json_file_path = "output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"
remove_random_punctuation_from_json(json_file_path)


已随机删除标点符号，新文件保存到: output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-random-punctuation-removed.json


In [22]:
import json

def change_case_in_text(text, target_word, to_upper=True):
    """
    Convert every occurrence of a word in the text to upper or lower case.

    A word matches when a complete whitespace-delimited token equals
    ``target_word`` ignoring case. All whitespace of the input (line breaks,
    repeated spaces, leading/trailing blanks) is preserved, so only the
    matched tokens differ from the original text.

    :param text: the input string
    :param target_word: the word whose case is changed
    :param to_upper: True converts to upper case, False to lower case
    :return: the modified text
    """
    new_word = target_word.upper() if to_upper else target_word.lower()
    wanted = target_word.lower()

    # Splitting on a captured whitespace group keeps the separators in the
    # result: tokens sit at even indices, whitespace runs at odd indices.
    parts = re.split(r"(\s+)", text)
    for index in range(0, len(parts), 2):
        if parts[index].lower() == wanted:
            parts[index] = new_word

    return "".join(parts)

def change_case_in_json(json_path, target_word, to_upper=True):
    """
    Change the case of a word on every page of a JSON file and save the result
    as ``<original name>-<word>-upper.json`` or ``<original name>-<word>-lower.json``
    in the same directory.

    :param json_path: path of the source JSON file (must contain a "pages" list)
    :param target_word: the word whose case is changed
    :param to_upper: True converts to upper case, False to lower case
    """
    with open(json_path, "r", encoding="utf-8") as source:
        document = json.load(source)

    # Build the modified pages; page number and image path are carried over.
    rewritten = {"pages": []}
    for entry in document["pages"]:
        recased = change_case_in_text(entry["text"], target_word, to_upper)
        rewritten["pages"].append({
            "page_number": entry["page_number"],
            "text": recased,
            "image_path": entry["image_path"],
        })

    # The output file lives next to the input file.
    base_name = os.path.basename(json_path).rsplit(".", 1)[0]
    new_file_path = os.path.join(
        os.path.dirname(json_path),
        f"{base_name}-{target_word}-{'upper' if to_upper else 'lower'}.json",
    )

    with open(new_file_path, "w", encoding="utf-8") as sink:
        json.dump(rewritten, sink, ensure_ascii=False, indent=4)

    print(f"已将单词 '{target_word}' 变为 {'全大写' if to_upper else '全小写'}，新文件保存到: {new_file_path}")


In [23]:
# Create a variant of the OCR JSON in which the chosen word is re-cased.
json_file_path = "output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"
target_word = "NBA"  # the word whose letter case is changed
to_upper = False  # True converts to upper case; False converts to lower case

change_case_in_json(json_file_path, target_word, to_upper)


已将单词 'NBA' 变为 全小写，新文件保存到: output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-NBA-lower.json


In [24]:
import json
import random

def shuffle_word(word):
    """
    Return the word with its characters in random order.

    :param word: the input word
    :return: a string made of the same characters, shuffled with ``random.shuffle``
    """
    letters = [*word]
    # Shuffle in place, then glue the characters back together.
    random.shuffle(letters)
    return "".join(letters)

def shuffle_word_in_text(text, target_word):
    """
    Scramble the characters of every occurrence of a word in the text.

    A word matches when a complete whitespace-delimited token equals
    ``target_word`` ignoring case; each match is shuffled independently with
    ``shuffle_word``. All whitespace of the input (line breaks, repeated
    spaces, leading/trailing blanks) is preserved, so only the matched tokens
    differ from the original text.

    :param text: the input text
    :param target_word: the word whose characters are shuffled
    :return: the modified text
    """
    wanted = target_word.lower()

    # Splitting on a captured whitespace group keeps the separators in the
    # result: tokens sit at even indices, whitespace runs at odd indices.
    parts = re.split(r"(\s+)", text)
    for index in range(0, len(parts), 2):
        if parts[index].lower() == wanted:
            parts[index] = shuffle_word(parts[index])

    return "".join(parts)

def shuffle_word_in_json(json_path, target_word):
    """
    Scramble the characters of a word on every page of a JSON file and save
    the result as ``<original name>-<word>-shuffled.json`` in the same directory.

    :param json_path: path of the source JSON file (must contain a "pages" list)
    :param target_word: the word whose characters are shuffled
    """
    with open(json_path, "r", encoding="utf-8") as source:
        document = json.load(source)

    # Build the modified pages; page number and image path are carried over.
    rewritten = {"pages": []}
    for entry in document["pages"]:
        scrambled = shuffle_word_in_text(entry["text"], target_word)
        rewritten["pages"].append({
            "page_number": entry["page_number"],
            "text": scrambled,
            "image_path": entry["image_path"],
        })

    # The output file lives next to the input file.
    base_name = os.path.basename(json_path).rsplit(".", 1)[0]
    new_file_path = os.path.join(
        os.path.dirname(json_path),
        f"{base_name}-{target_word}-shuffled.json",
    )

    with open(new_file_path, "w", encoding="utf-8") as sink:
        json.dump(rewritten, sink, ensure_ascii=False, indent=4)

    print(f"已将单词 '{target_word}' 字符顺序打乱，新文件保存到: {new_file_path}")


In [25]:
# Seed the RNG so that re-running the notebook produces the same shuffled
# spelling and therefore the same output file.
random.seed(42)
json_file_path = "output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"
target_word = "example"  # the word whose characters are shuffled

shuffle_word_in_json(json_file_path, target_word)


已将单词 'example' 字符顺序打乱，新文件保存到: output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-example-shuffled.json


In [26]:
import json
import random

def remove_spaces_in_sentence(text):
    """
    Pick one sentence of the text at random and run its words together.

    Sentences are the pieces obtained by splitting at runs of spaces that
    follow ``.``, ``!`` or ``?``; each piece is stripped and empty pieces are
    discarded. All space characters of the chosen sentence are removed and the
    sentences are joined back with single spaces.

    :param text: the input text
    :return: the modified text, or the unchanged input when it has no sentences
    """
    sentences = []
    for piece in re.split(r'(?<=[.!?]) +', text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)

    # Nothing to modify: hand the input back untouched.
    if len(sentences) == 0:
        return text

    # Choose the sentence whose spaces are squeezed out.
    chosen = random.randint(0, len(sentences) - 1)
    sentences[chosen] = "".join(sentences[chosen].split(" "))

    return " ".join(sentences)

def remove_spaces_in_sentence_in_json(json_path):
    """
    On every page of a JSON file, run the words of one randomly chosen sentence
    together and save the result as ``<original name>-sentence-no-spaces.json``
    in the same directory.

    :param json_path: path of the source JSON file (must contain a "pages" list)
    """
    with open(json_path, "r", encoding="utf-8") as source:
        document = json.load(source)

    # Build the modified pages; page number and image path are carried over.
    rewritten = {"pages": []}
    for entry in document["pages"]:
        squeezed = remove_spaces_in_sentence(entry["text"])
        rewritten["pages"].append({
            "page_number": entry["page_number"],
            "text": squeezed,
            "image_path": entry["image_path"],
        })

    # The output file lives next to the input file.
    base_name = os.path.basename(json_path).rsplit(".", 1)[0]
    new_file_path = os.path.join(
        os.path.dirname(json_path),
        f"{base_name}-sentence-no-spaces.json",
    )

    with open(new_file_path, "w", encoding="utf-8") as sink:
        json.dump(rewritten, sink, ensure_ascii=False, indent=4)

    print(f"已随机删除句子中单词间距，新文件保存到: {new_file_path}")


In [27]:
# Seed the RNG so that re-running the notebook picks the same sentences and
# reproduces the same output file.
random.seed(42)
json_file_path = "output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2.json"

remove_spaces_in_sentence_in_json(json_file_path)


已随机删除句子中单词间距，新文件保存到: output/2021_NMBHOF_Financial_Statements_Parent_-_Signed_2-sentence-no-spaces.json
