# CFST MinerU 批量解析

使用 MinerU v2.7+ 批量解析 CFST 论文 PDF (300+ 篇)。

**流程**:
1. PDF 从 Google Drive 读取
2. 解析结果保存到 Colab 本地 `/content/parsed_output/`
3. 完成后上传到 Google Drive 或下载 zip

**前置条件**:
- Runtime → Change runtime type → T4 GPU
- 将所有 PDF 放入 Google Drive 的 `MyDrive/cfst-extractor/pdfs/` 目录 (挂载后对应 `/content/drive/MyDrive/cfst-extractor/pdfs/`; 仅扫描该目录顶层, 不递归子目录)

In [None]:
# Cell 1: 安装 MinerU + 挂载 Drive
!pip install "mineru[pipeline] @ git+https://github.com/opendatalab/MinerU.git" -q 2>&1 | tail -5

import importlib.metadata
print(f'mineru: {importlib.metadata.version("mineru")}')

from google.colab import drive
drive.mount('/content/drive')

import os, json, glob, subprocess, time, shutil

# 路径配置
PDF_DIR = '/content/drive/MyDrive/cfst-extractor/pdfs'
OUTPUT_DIR = '/content/parsed_output'  # 本地输出
LOG_FILE = '/content/parse_log.jsonl'
os.makedirs(OUTPUT_DIR, exist_ok=True)

pdf_paths = sorted(glob.glob(f'{PDF_DIR}/*.pdf'))
print(f'\n找到 {len(pdf_paths)} 篇 PDF')

In [None]:
# Cell 2: check for a GPU, then configure the environment
import torch

# Guard clause: stop right away when no CUDA device is available.
if not torch.cuda.is_available():
    raise RuntimeError('No GPU! Runtime -> Change runtime type -> T4 GPU')

gpu_name = torch.cuda.get_device_name(0)
# Scale the raw total_memory value by 1024**3 for the GB figure printed below.
gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f'GPU: {gpu_name} ({gpu_mem:.1f} GB)')

# Set both environment variables in a single call.
os.environ.update({
    'HF_ENDPOINT': 'https://hf-mirror.com',
    'MINERU_DEVICE_MODE': 'cuda',
})
print('环境配置完成')

In [None]:
# Cell 3: 批量解析 (支持断点续传)
def safe_name(name):
    """Replace special characters in a file name with underscores.

    The code point U+F03A, the colon and the forward slash each become
    ``_``; every other character is returned unchanged.
    """
    replacements = str.maketrans({'\uf03a': '_', ':': '_', '/': '_'})
    return name.translate(replacements)

def is_parsed(output_dir):
    """Return True when *output_dir* already holds a complete parse result.

    A result counts as complete when the directory exists and contains, at
    any depth, at least one ``*.md`` file and at least one
    ``*content_list*.json`` file.

    Args:
        output_dir: Path of the per-paper output directory.

    Returns:
        bool: True if both kinds of file are present, otherwise False.
    """
    if not os.path.isdir(output_dir):
        return False
    # The directory name may contain glob metacharacters ('[', ']', '*',
    # '?'). Unescaped, glob would read them as a pattern and match nothing,
    # so a finished paper would be reported as unparsed.
    root = glob.escape(os.fspath(output_dir))
    md_files = glob.glob(f'{root}/**/*.md', recursive=True)
    if not md_files:
        return False
    json_files = glob.glob(f'{root}/**/*content_list*.json', recursive=True)
    return bool(json_files)

# Batch loop: run MinerU on every PDF, skipping papers that are already done.
def _append_log(record):
    """Append *record* to LOG_FILE as one JSON line.

    I/O errors are reported but not raised, so a broken log file cannot
    abort the batch or change how a paper is counted.
    """
    try:
        with open(LOG_FILE, 'a', encoding='utf-8') as log:
            log.write(json.dumps(record) + '\n')
    except OSError as exc:
        print(f'  [WARN] 日志写入失败: {exc}')


total = len(pdf_paths)
parsed, skipped, failed = 0, 0, 0
failed_papers = []

print(f'=== 批量解析 {total} 篇 PDF ===')
print(f'输出目录: {OUTPUT_DIR}')
print()

for index, pdf_path in enumerate(pdf_paths, start=1):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f'{OUTPUT_DIR}/{safe_name(pdf_name)}'

    # Resume support: skip papers whose output is already complete
    if is_parsed(output_dir):
        skipped += 1
        continue

    print(f'[{index}/{total}] {pdf_name[:60]}...', end=' ', flush=True)
    start = time.time()
    record = {'paper': pdf_name}

    try:
        result = subprocess.run(
            ['mineru', '-p', pdf_path, '-o', output_dir, '-m', 'auto', '-b', 'pipeline',
             '--f_draw_layout_bbox', 'false',   # do not generate *_layout.pdf
             '--f_draw_span_bbox', 'false',     # do not generate *_span.pdf
             '--f_dump_orig_pdf', 'false',      # do not generate *_origin.pdf
             '--f_dump_middle_json', 'false',   # do not generate *_middle.json
             '--f_dump_model_output', 'false',  # do not generate *_model.json
            ],
            capture_output=True, text=True, timeout=600
        )
        # A zero exit code is not enough: the expected files must exist too.
        if result.returncode == 0 and is_parsed(output_dir):
            record['status'] = 'ok'
        else:
            record.update(status='fail', error=result.stderr[-500:])
    except subprocess.TimeoutExpired:
        record['status'] = 'timeout'
    except Exception as e:
        # Best-effort boundary: one bad paper must not stop the whole batch.
        record.update(status='error', error=str(e))

    elapsed = time.time() - start
    record['time_s'] = round(elapsed, 1)

    status = record['status']
    if status == 'ok':
        parsed += 1
        print(f'OK ({elapsed:.0f}s)')
    else:
        failed += 1
        failed_papers.append(pdf_name)
        if status == 'fail':
            print('FAIL')
        elif status == 'timeout':
            print('TIMEOUT')
        else:
            print(f"ERROR: {record['error']}")

    # Every outcome is logged, including timeouts and unexpected errors,
    # so the log is a complete record of the run.
    _append_log(record)

print('\n=== 完成 ===')
print(f'成功: {parsed} | 跳过: {skipped} | 失败: {failed}')
if failed_papers:
    print('\n失败列表:')
    shown_limit = 20
    for p in failed_papers[:shown_limit]:
        print(f'  - {p}')
    hidden = len(failed_papers) - shown_limit
    if hidden > 0:
        # State the truncation so the shortened list is not read as complete.
        print(f'  ... 另有 {hidden} 篇未显示 (完整记录见 {LOG_FILE})')

In [None]:
# Cell 4: verify the outputs and print statistics
print('=== 输出验证 ===')
ok_count = 0
incomplete_count = 0
missing_count = 0

for source_pdf in pdf_paths:
    stem, _ext = os.path.splitext(os.path.basename(source_pdf))
    result_dir = f'{OUTPUT_DIR}/{safe_name(stem)}'

    # Complete results need no report line; count them and move on.
    if is_parsed(result_dir):
        ok_count += 1
        continue

    # Not complete: either the directory exists without the expected files,
    # or it was never created.
    if os.path.exists(result_dir):
        incomplete_count += 1
        print(f'  [INCOMPLETE] {stem[:60]}')
    else:
        missing_count += 1
        print(f'  [MISSING] {stem[:60]}')

print(f'\n统计: OK={ok_count} | INCOMPLETE={incomplete_count} | MISSING={missing_count}')

# Size of the output directory: add up every file found under OUTPUT_DIR
total_size = 0
for folder, _subdirs, filenames in os.walk(OUTPUT_DIR):
    for filename in filenames:
        total_size += os.path.getsize(os.path.join(folder, filename))
print(f'输出目录大小: {total_size / 1024**3:.2f} GB')

In [None]:
# Cell 5: upload the parsed results to Google Drive
DRIVE_OUTPUT = '/content/drive/MyDrive/cfst-extractor/parsed'
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

print(f'上传到: {DRIVE_OUTPUT}')
print('复制中...')

# Each directory is copied under a staging name and moved to its final name
# only after the copy has finished. A final-named directory is therefore
# always complete, and the "skip if it exists" check below stays valid even
# when an earlier run was interrupted in the middle of a copy.
STAGING_SUFFIX = '.__partial__'

uploaded = 0
for item in os.listdir(OUTPUT_DIR):
    src = os.path.join(OUTPUT_DIR, item)
    dst = os.path.join(DRIVE_OUTPUT, item)
    if not os.path.isdir(src) or os.path.exists(dst):
        continue

    staging = dst + STAGING_SUFFIX
    if os.path.exists(staging):
        # Leftover from an interrupted run: discard it and copy afresh.
        shutil.rmtree(staging)
    shutil.copytree(src, staging)
    shutil.move(staging, dst)
    uploaded += 1

print(f'上传完成: {uploaded} 个目录')

# Copy the log next to the parsed directory (its parent folder on Drive)
if os.path.exists(LOG_FILE):
    log_target = os.path.join(os.path.dirname(DRIVE_OUTPUT), 'parse_log.jsonl')
    shutil.copy(LOG_FILE, log_target)
    print('日志已复制')

In [None]:
# Cell 6: download as a zip (optional)
from google.colab import files

ZIP_PATH = '/content/cfst_parsed.zip'

print('打包中...')
# Build the archive with the standard library instead of the `zip` shell
# command. `zip -r` updates an existing archive in place, so a re-run could
# keep entries for files that no longer exist locally. make_archive rewrites
# the file from scratch and needs no shell quoting of the paths.
# make_archive appends '.zip' itself, hence the extension is stripped here.
shutil.make_archive(os.path.splitext(ZIP_PATH)[0], 'zip', root_dir=OUTPUT_DIR)

zip_size = os.path.getsize(ZIP_PATH) / 1024**3
print(f'ZIP 大小: {zip_size:.2f} GB')

files.download(ZIP_PATH)
print('下载已开始')