Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 190 additions & 0 deletions modules/dataprocess/dataclean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import argparse
import json
import os
import sys
from functools import lru_cache
from pathlib import Path

from tqdm import tqdm

# 允许从仓库根目录直接运行脚本(无需安装为包)。
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))

from config import settings


@lru_cache(maxsize=1)
def _load_clean_prompt() -> str:
candidate_paths = [
Path(__file__).resolve().parent.parent / "prompts" / "Data_clean_prompt.txt",
PROJECT_ROOT / "prompts" / "Data_clean_prompt.txt",
]
for prompt_path in candidate_paths:
if prompt_path.exists():
with open(prompt_path, "r", encoding="utf-8") as f:
return f.read().strip()
raise FileNotFoundError(
"未找到 Data_clean_prompt.txt,请确认存在于 modules/prompts/ 或 prompts/ 目录。"
)


@lru_cache(maxsize=1)
def _get_cleaner_client():
try:
from openai import OpenAI
except ImportError as exc:
raise ImportError("缺少依赖 openai,请先安装:pip install openai") from exc
return OpenAI(api_key=settings.llm_api_key, base_url=settings.llm_api_base or None)


def _get_cleaner_model_name() -> str:
env_model = os.getenv("DATACLEAN_MODEL_NAME", "").strip()
if env_model:
return env_model
return settings.llm_model or "gpt-4o-mini"


def _strip_response_wrappers(text: str) -> str:
cleaned = text.strip()
if cleaned.startswith("```") and cleaned.endswith("```"):
lines = cleaned.splitlines()
if len(lines) >= 3:
cleaned = "\n".join(lines[1:-1]).strip()
return cleaned


def clean_text_with_llm(raw_text: str) -> str:
"""使用 DataMind 配置的 OpenAI 兼容接口清洗文本。"""
if not isinstance(raw_text, str) or raw_text.strip() == "":
return raw_text

if not (settings.llm_api_key or "").strip():
raise ValueError("llm_api_key 未配置,无法调用 LLM 清洗文本。")

system_prompt = _load_clean_prompt()
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": raw_text},
]

client = _get_cleaner_client()
response = client.chat.completions.create(
model=_get_cleaner_model_name(),
messages=messages,
temperature=0,
max_tokens=4096,
)
response_text = (response.choices[0].message.content or "").strip()
cleaned = _strip_response_wrappers(response_text)

# 防止模型偶发返回空串,避免丢失原始内容
return cleaned if cleaned else raw_text


def should_skip_existing(input_file: Path, output_file: Path) -> bool:
"""
判断是否应跳过已存在的输出文件。
条件:输出文件存在,且其内部 page 数量与输入文件相同。
"""
if not output_file.exists():
return False

try:
with open(input_file, "r", encoding="utf-8") as f_in:
input_data = json.load(f_in)
with open(output_file, "r", encoding="utf-8") as f_out:
output_data = json.load(f_out)
except (json.JSONDecodeError, IOError):
return False # 文件损坏则重新处理

# 确保都是列表,且长度一致
if isinstance(input_data, list) and isinstance(output_data, list):
return len(input_data) == len(output_data)
return False


def process_single_file(input_path: Path, output_path: Path) -> None:
"""
处理单个 JSON 文件:
1. 读取原始 JSON
2. 清洗所有 text 字段
3. 写入输出路径
"""
with open(input_path, "r", encoding="utf-8") as f:
data = json.load(f)

# 确保数据格式正确
if not isinstance(data, list):
print(f"警告:{input_path} 不是预期的列表格式,跳过")
return

cleaned_data = []
for item in data:
if not isinstance(item, dict) or "text" not in item:
# 保留不符合格式的项原样
cleaned_data.append(item)
continue

# 深拷贝一份,避免修改原数据
new_item = item.copy()
raw_text = item.get("text", "")
cleaned_text = clean_text_with_llm(raw_text)
new_item["text"] = cleaned_text
cleaned_data.append(new_item)

# 确保输出目录存在
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(cleaned_data, f, ensure_ascii=False, indent=2)


def process_wrapper(input_file: Path, output_root: Path, input_root: Path) -> str:
rel_path = input_file.relative_to(input_root)
output_file = output_root / rel_path
if should_skip_existing(input_file, output_file):
return "skipped"
process_single_file(input_file, output_file)
return "done"


def main() -> None:
parser = argparse.ArgumentParser(description="批量清洗 OCR JSON 中的 text 字段")
parser.add_argument(
"--input_root",
type=str,
required=True,
help="输入根目录,包含多个子文件夹的 JSON 文件",
)
parser.add_argument(
"--output_root",
type=str,
required=True,
help="输出根目录,保持相同的子目录结构",
)
args = parser.parse_args()

input_root = Path(args.input_root)
output_root = Path(args.output_root)

if not input_root.exists():
print(f"错误:输入目录 {input_root} 不存在")
return

# 收集所有 JSON 文件
json_files = list(input_root.rglob("*.json"))
print(f"找到 {len(json_files)} 个 JSON 文件")

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=3) as executor:
futures = {
executor.submit(process_wrapper, f, output_root, input_root): f
for f in json_files
}
for future in tqdm(as_completed(futures), total=len(futures), desc="处理进度"):
future.result()

if __name__ == "__main__":
main()
41 changes: 41 additions & 0 deletions modules/prompts/Data_clean_prompt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
You are an OCR text-cleaning assistant.

Your goal is to clean noisy OCR output while preserving factual content for downstream retrieval and QA.

Focus on high-impact fixes only:

1) Line breaks and spacing
- Merge broken lines when they split one sentence/word.
- Fix obvious glued words and wrongly split words.
- Collapse excessive blank lines and repeated spaces.

2) Common OCR confusions
- Fix obvious character confusions when context is clear, such as:
- o/O <-> 0, l/I <-> 1, rn <-> m, and similar high-confidence errors.
- Fix obvious transposition errors in words (e.g., "Traning" -> "Training").

3) Duplicates and noise
- Remove duplicated lines/phrases caused by OCR repetition.
- Remove obvious stray symbols/noise characters that do not carry meaning.
- Keep meaningful punctuation and separators.

4) Math and LaTeX preservation
- Preserve valid LaTeX/math structures if present:
- `\begin{...}`, `\end{...}`, `$...$`, `$$...$$`, `\[...\]`
- Only do conservative repairs for clearly broken math tokens.
- Do not rewrite formulas semantically.

5) Table-like content preservation
- Keep table content readable.
- If table borders/alignment are broken, output a clean plain-text table-like layout.
- Do not fabricate missing cells or values.

6) Fidelity constraints
- Do NOT summarize, paraphrase, or reorder content.
- Do NOT add new facts.
- Keep dates, numbers, names, URLs, units, and domain terms.
- If uncertain, keep the original text segment.

Output requirements:
- Return cleaned text only.
- No explanations, no comments, no markdown wrappers.