diff --git a/modules/dataprocess/dataclean.py b/modules/dataprocess/dataclean.py new file mode 100644 index 0000000..fa77343 --- /dev/null +++ b/modules/dataprocess/dataclean.py @@ -0,0 +1,190 @@ +import argparse +import json +import os +import sys +from functools import lru_cache +from pathlib import Path + +from tqdm import tqdm + +# 允许从仓库根目录直接运行脚本(无需安装为包)。 +PROJECT_ROOT = Path(__file__).resolve().parents[2] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from config import settings + + +@lru_cache(maxsize=1) +def _load_clean_prompt() -> str: + candidate_paths = [ + Path(__file__).resolve().parent.parent / "prompts" / "Data_clean_prompt.txt", + PROJECT_ROOT / "prompts" / "Data_clean_prompt.txt", + ] + for prompt_path in candidate_paths: + if prompt_path.exists(): + with open(prompt_path, "r", encoding="utf-8") as f: + return f.read().strip() + raise FileNotFoundError( + "未找到 Data_clean_prompt.txt,请确认存在于 modules/prompts/ 或 prompts/ 目录。" + ) + + +@lru_cache(maxsize=1) +def _get_cleaner_client(): + try: + from openai import OpenAI + except ImportError as exc: + raise ImportError("缺少依赖 openai,请先安装:pip install openai") from exc + return OpenAI(api_key=settings.llm_api_key, base_url=settings.llm_api_base or None) + + +def _get_cleaner_model_name() -> str: + env_model = os.getenv("DATACLEAN_MODEL_NAME", "").strip() + if env_model: + return env_model + return settings.llm_model or "gpt-4o-mini" + + +def _strip_response_wrappers(text: str) -> str: + cleaned = text.strip() + if cleaned.startswith("```") and cleaned.endswith("```"): + lines = cleaned.splitlines() + if len(lines) >= 3: + cleaned = "\n".join(lines[1:-1]).strip() + return cleaned + + +def clean_text_with_llm(raw_text: str) -> str: + """使用 DataMind 配置的 OpenAI 兼容接口清洗文本。""" + if not isinstance(raw_text, str) or raw_text.strip() == "": + return raw_text + + if not (settings.llm_api_key or "").strip(): + raise ValueError("llm_api_key 未配置,无法调用 LLM 清洗文本。") + + system_prompt = _load_clean_prompt() + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": raw_text}, + ] + + client = _get_cleaner_client() + response = client.chat.completions.create( + model=_get_cleaner_model_name(), + messages=messages, + temperature=0, + max_tokens=4096, + ) + response_text = (response.choices[0].message.content or "").strip() + cleaned = _strip_response_wrappers(response_text) + + # 防止模型偶发返回空串,避免丢失原始内容 + return cleaned if cleaned else raw_text + + +def should_skip_existing(input_file: Path, output_file: Path) -> bool: + """ + 判断是否应跳过已存在的输出文件。 + 条件:输出文件存在,且其内部 page 数量与输入文件相同。 + """ + if not output_file.exists(): + return False + + try: + with open(input_file, "r", encoding="utf-8") as f_in: + input_data = json.load(f_in) + with open(output_file, "r", encoding="utf-8") as f_out: + output_data = json.load(f_out) + except (json.JSONDecodeError, IOError): + return False # 文件损坏则重新处理 + + # 确保都是列表,且长度一致 + if isinstance(input_data, list) and isinstance(output_data, list): + return len(input_data) == len(output_data) + return False + + +def process_single_file(input_path: Path, output_path: Path) -> None: + """ + 处理单个 JSON 文件: + 1. 读取原始 JSON + 2. 清洗所有 text 字段 + 3. 写入输出路径 + """ + with open(input_path, "r", encoding="utf-8") as f: + data = json.load(f) + + # 确保数据格式正确 + if not isinstance(data, list): + print(f"警告:{input_path} 不是预期的列表格式,跳过") + return + + cleaned_data = [] + for item in data: + if not isinstance(item, dict) or "text" not in item: + # 保留不符合格式的项原样 + cleaned_data.append(item) + continue + + # 深拷贝一份,避免修改原数据 + new_item = item.copy() + raw_text = item.get("text", "") + cleaned_text = clean_text_with_llm(raw_text) + new_item["text"] = cleaned_text + cleaned_data.append(new_item) + + # 确保输出目录存在 + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(cleaned_data, f, ensure_ascii=False, indent=2) + + +def process_wrapper(input_file: Path, output_root: Path, input_root: Path) -> str: + rel_path = input_file.relative_to(input_root) + output_file = output_root / rel_path + if should_skip_existing(input_file, output_file): + return "skipped" + process_single_file(input_file, output_file) + return "done" + + +def main() -> None: + parser = argparse.ArgumentParser(description="批量清洗 OCR JSON 中的 text 字段") + parser.add_argument( + "--input_root", + type=str, + required=True, + help="输入根目录,包含多个子文件夹的 JSON 文件", + ) + parser.add_argument( + "--output_root", + type=str, + required=True, + help="输出根目录,保持相同的子目录结构", + ) + args = parser.parse_args() + + input_root = Path(args.input_root) + output_root = Path(args.output_root) + + if not input_root.exists(): + print(f"错误:输入目录 {input_root} 不存在") + return + + # 收集所有 JSON 文件 + json_files = list(input_root.rglob("*.json")) + print(f"找到 {len(json_files)} 个 JSON 文件") + + from concurrent.futures import ThreadPoolExecutor, as_completed + + with ThreadPoolExecutor(max_workers=3) as executor: + futures = { + executor.submit(process_wrapper, f, output_root, input_root): f + for f in json_files + } + for future in tqdm(as_completed(futures), total=len(futures), desc="处理进度"): + future.result() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/modules/prompts/Data_clean_prompt.txt b/modules/prompts/Data_clean_prompt.txt new file mode 100644 index 0000000..ba8bb55 --- /dev/null +++ b/modules/prompts/Data_clean_prompt.txt @@ -0,0 +1,41 @@ +You are an OCR text-cleaning assistant. + +Your goal is to clean noisy OCR output while preserving factual content for downstream retrieval and QA. + +Focus on high-impact fixes only: + +1) Line breaks and spacing +- Merge broken lines when they split one sentence/word. +- Fix obvious glued words and wrongly split words. +- Collapse excessive blank lines and repeated spaces. + +2) Common OCR confusions +- Fix obvious character confusions when context is clear, such as: + - o/O <-> 0, l/I <-> 1, rn <-> m, and similar high-confidence errors. +- Fix obvious transposition errors in words (e.g., "Traning" -> "Training"). + +3) Duplicates and noise +- Remove duplicated lines/phrases caused by OCR repetition. +- Remove obvious stray symbols/noise characters that do not carry meaning. +- Keep meaningful punctuation and separators. + +4) Math and LaTeX preservation +- Preserve valid LaTeX/math structures if present: + - `\begin{...}`, `\end{...}`, `$...$`, `$$...$$`, `\[...\]` +- Only do conservative repairs for clearly broken math tokens. +- Do not rewrite formulas semantically. + +5) Table-like content preservation +- Keep table content readable. +- If table borders/alignment are broken, output a clean plain-text table-like layout. +- Do not fabricate missing cells or values. + +6) Fidelity constraints +- Do NOT summarize, paraphrase, or reorder content. +- Do NOT add new facts. +- Keep dates, numbers, names, URLs, units, and domain terms. +- If uncertain, keep the original text segment. + +Output requirements: +- Return cleaned text only. +- No explanations, no comments, no markdown wrappers. \ No newline at end of file