我如何从这种PDF文件中提取这类图片，以及样本名、CN等数据呢？文件一共有几百页，人工提取非常耗时。我需要从这类PDF文件中提取训练模型所需的图片和信息，如何自动化地完成？

# 提取SV断点图
四个SV断点图的坐标，x坐标以染色体区间的两端为准，y坐标以上下两条界线为准
* 此前我只用了几页原PDF文件来确定SV图像的坐标，但当应用于整个pdf文件之后，SV图像的坐标却发生了变化
* 原因：PDF子集中某一页的坐标与原PDF文件中同一页的坐标并不一致。改为直接使用原PDF文件转换出的每一页图像来确定每个待切割SV图的坐标后，运行结果符合预期
* 建议：根据原PDF文件转换后的图片，确定SV坐标，更改coordinates以及w, h

In [10]:
from pdf2image import convert_from_path
import cv2
import os

# Render every page of a PDF to a PNG file
def convert_pdf_to_images(pdf_path, output_dir, dpi=300):
    """Rasterise each page of a PDF and save it as ``page_<N>.png``.

    Pages are converted in small batches instead of all at once, so that
    only a handful of rendered pages are held in memory at any time.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory the PNG files are written to; created if it
            does not exist yet.
        dpi: Rendering resolution passed to ``convert_from_path``.
    """
    # Local import keeps this block self-contained; pdf2image is already a
    # dependency of this file (convert_from_path comes from it).
    from pdf2image import pdfinfo_from_path

    os.makedirs(output_dir, exist_ok=True)

    total_pages = int(pdfinfo_from_path(pdf_path)["Pages"])
    batch_size = 10

    # first_page / last_page are 1-based and inclusive.
    for first in range(1, total_pages + 1, batch_size):
        last = min(first + batch_size - 1, total_pages)
        pages = convert_from_path(
            pdf_path, dpi=dpi, first_page=first, last_page=last
        )

        for page_number, page in enumerate(pages, start=first):
            page_filename = f"{output_dir}/page_{page_number}.png"
            page.save(page_filename, "PNG")
            print(f"Saved page as image: {page_filename}")

# Crop the SV breakpoint plots out of one page using hand-measured coordinates
def extract_sv_image_from_page(image_path, output_dir, coordinates, w=1050, h=320):
    """Cut fixed-size SV breakpoint plots out of a rendered page image.

    One crop is written per entry of ``coordinates`` as
    ``<page stem>_part_<k>.png`` (``k`` starting at 1) inside ``output_dir``.

    Args:
        image_path: Path of the page image, read with ``cv2.imread``.
        output_dir: Directory the crops are written to. It is not created
            here.
        coordinates: Iterable of ``(x, y)`` top-left corners in pixels, one
            per crop.
        w: Crop width in pixels.
        h: Crop height in pixels.

    Raises:
        OSError: If the page image cannot be read or decoded.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # without this check the slice below fails with an opaque TypeError.
        raise OSError(f"Could not read or decode image: {image_path}")

    page_stem = os.path.splitext(os.path.basename(image_path))[0]

    for idx, (x, y) in enumerate(coordinates):
        # Array indexing is [row, column], i.e. y first, then x.
        cropped_img = img[y:y+h, x:x+w]
        output_filename = f"{output_dir}/{page_stem}_part_{idx+1}.png"
        cv2.imwrite(output_filename, cropped_img)
        print(f"Saved cropped SV image: {output_filename}")


# Input PDF and output directories
pdf_path = "/Users/xurui/back_up_unit/天津大学文件/本科毕设相关/Article/high-confidence.pdf"
page_output_dir = "/Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert"
image_output_dir = "/Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/test-picture"

# Render every PDF page to a PNG (default resolution of convert_pdf_to_images)
convert_pdf_to_images(pdf_path, page_output_dir)

# Top-left (x, y) pixel corners of the four SV plots on a rendered page:
# upper-left, upper-right, lower-left, lower-right.
coordinates = [
    (71, 150),
    (1290, 150),
    (71, 2005),
    (1290, 2005)
]

os.makedirs(image_output_dir, exist_ok=True)

# sorted() makes the processing order deterministic across runs.
for page_file in sorted(os.listdir(page_output_dir)):
    # Skip hidden entries (such as the "._*.png" sidecar files macOS may
    # create on external volumes); they are not decodable page images.
    if page_file.startswith(".") or not page_file.endswith(".png"):
        continue
    page_path = os.path.join(page_output_dir, page_file)
    extract_sv_image_from_page(page_path, image_output_dir, coordinates)

Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_1.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_2.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_3.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_4.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_5.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_6.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_7.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_8.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_9.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert/page_10.png
Saved page as image: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_conver

# 提取文本信息
使用 `page_{num}_part_{1..4}` 作为关联键，提取样本名、Position、Oscillating CN（2 and 3 states）和 CN segments

In [7]:
import pytesseract
import cv2
import os
import pandas as pd

def extract_text_from_region(image, x, y, w, h):
    """Crop a rectangle out of ``image`` and run Tesseract OCR on it.

    Args:
        image: Image array; it is converted with ``cv2.COLOR_BGR2GRAY``.
        x: Left edge of the rectangle in pixels.
        y: Top edge of the rectangle in pixels.
        w: Width of the rectangle in pixels.
        h: Height of the rectangle in pixels.

    Returns:
        A ``(text, region)`` tuple: the recognised text with surrounding
        whitespace stripped, and the unprocessed crop.
    """
    crop = image[y:y + h, x:x + w]

    # Binarise before OCR: inverted threshold at 150 on the grayscale crop.
    # NOTE(review): THRESH_BINARY_INV turns dark-on-light text into
    # light-on-dark; confirm this polarity suits the Tesseract version in use.
    binarized = cv2.threshold(
        cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY),
        150,
        255,
        cv2.THRESH_BINARY_INV,
    )[1]

    recognized = pytesseract.image_to_string(binarized, lang='eng')
    return recognized.strip(), crop

# Text regions to OCR for each of the four events on a page.
# Every event has 4 regions (16 in total), stored as (x, y, width, height).
def _box(left, top, right, bottom):
    """Convert corner coordinates into an (x, y, width, height) tuple."""
    return (left, top, right - left, bottom - top)


def _event_boxes(left, right, rows):
    """Build the region boxes of one event from its column and row bounds."""
    return [_box(left, top, right, bottom) for top, bottom in rows]


# (top, bottom) of the four text rows, in this order:
# donor_idx, POS, Oscillating CN (2 and 3 states), CN segments.
_UPPER_ROWS = [(1150, 1186), (1225, 1258), (1469, 1504), (1507, 1542)]
# NOTE(review): the donor_idx and POS rows below overlap by one pixel
# (2905 < 2906) - confirm against a rendered page.
_LOWER_ROWS = [(2870, 2906), (2905, 2946), (3148, 3184), (3186, 3225)]

event_regions = {
    # Event 1 (upper left).
    # NOTE(review): right edge 1390 makes this box about twice as wide as
    # the other three events - confirm it is intentional.
    1: _event_boxes(684, 1390, _UPPER_ROWS),
    # Event 2 (upper right)
    2: _event_boxes(1800, 2160, _UPPER_ROWS),
    # Event 3 (lower left)
    3: _event_boxes(700, 1070, _LOWER_ROWS),
    # Event 4 (lower right)
    4: _event_boxes(1801, 2160, _LOWER_ROWS),
}

# Column names for the four regions of an event, in the same order as the
# region tuples inside each event_regions list
region_labels = ["donor_idx", "POS", "Oscillating_CN_2_and_3_states", "CN_segments"]

# Input and output locations
page_dir = "/Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/pdf_convert"
output_file = "/Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/extracted_text_info.tsv"
regions_output_dir = "/Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images"

# Make sure the directory for the cropped region images exists;
# exist_ok avoids the separate check-then-create step.
os.makedirs(regions_output_dir, exist_ok=True)

# One record per (page, event), collected across all pages
all_data = []


def _page_number(file_name):
    """Return the number part of a ``page_<N>.png`` file name, else None."""
    stem, ext = os.path.splitext(file_name)
    prefix, _, number = stem.partition("_")
    if ext == ".png" and prefix == "page" and number.isdecimal():
        return number
    return None


# Keep only well-formed page files and order them numerically; a plain
# lexicographic sort would yield page_1, page_10, page_100, page_2, ...
page_files = [name for name in os.listdir(page_dir) if _page_number(name) is not None]
page_files.sort(key=lambda name: int(_page_number(name)))

for page_file in page_files:
    page_path = os.path.join(page_dir, page_file)
    page_num = _page_number(page_file)
    print(f"处理页面 {page_num}...")

    # cv2.imread returns None when the file cannot be read or decoded
    page_img = cv2.imread(page_path)
    if page_img is None:
        # Message: "skipping unreadable page"
        print(f"跳过无法读取的页面: {page_path}")
        continue

    # OCR the labelled regions of every event on this page
    for event_id, regions in event_regions.items():
        # Key shared with the cropped SV images: page_{page}_part_{event_id}
        image_idx = f"page_{page_num}_part_{event_id}"
        event_data = {"image_idx": image_idx}

        for region_label, (x, y, w, h) in zip(region_labels, regions):
            # Recognised text goes into the record; the raw crop is saved
            # so the OCR result can be checked by eye.
            text, region_img = extract_text_from_region(page_img, x, y, w, h)
            event_data[region_label] = text

            region_filename = f"{regions_output_dir}/page_{page_num}_event_{event_id}_{region_label}.png"
            cv2.imwrite(region_filename, region_img)
            print(f"保存区域图像: {region_filename}")

        all_data.append(event_data)

# Build the table with a fixed column order: the image key first, then one
# column per region label. Passing columns= to the constructor also yields a
# header-only table when all_data is empty, instead of a KeyError on column
# selection.
columns_order = ["image_idx"] + region_labels
df = pd.DataFrame(all_data, columns=columns_order)

# Write tab-separated output without the index column
df.to_csv(output_file, sep='\t', index=False)
print(f"成功提取文本信息并保存到: {output_file}")

处理页面 1...
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_1_donor_idx.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_1_POS.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_1_Oscillating_CN_2_and_3_states.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_1_CN_segments.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_2_donor_idx.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_2_POS.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_2_Oscillating_CN_2_and_3_states.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_2_CN_segments.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_images/page_1_event_3_donor_idx.png
保存区域图像: /Volumes/T7-shield/CS-Bachelor-Thesis/CNN_model/region_imag