Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions dataflow/operators/generate/KnowledgeCleaning/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .CorpusTextSplitter import CorpusTextSplitter
from .KnowledgeExtractor import KnowledgeExtractor
from .KnowledgeCleaner import KnowledgeCleaner
from .MultiHopQAGenerator import MultiHopQAGenerator
from .corpus_text_splitter import CorpusTextSplitter
from .knowledge_extractor import KnowledgeExtractor
from .knowledge_cleaner import KnowledgeCleaner
from .multihop_qa_generator import MultiHopQAGenerator

__all__ = [
"CorpusTextSplitter",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,105 @@
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.utils.kbcleaning import _parse_pdf_to_md,_parse_doc_to_md,_parse_xml_to_md
import os
from pathlib import Path
from trafilatura import fetch_url, extract

def _parse_pdf_to_md(
input_pdf_path: str,
output_dir: str,
lang: str = "ch",
parse_method: str = "auto" # 解析方法:auto/txt/ocr
):
"""
将PDF转换为Markdown(仅使用Pipeline后端)
"""
try:
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.utils.enum_class import MakeMode
except:
raise Exception(
"""
MinerU is not installed in this environment yet.
Please refer to https://github.com/opendatalab/mineru to install.
Or you can just execute 'pip install mineru[pipeline]' and 'mineru-models-download' to fix this error.
please make sure you have gpu on your machine.
"""
)

logger=get_logger()
# 读取PDF文件
pdf_bytes = Path(input_pdf_path).read_bytes()
pdf_name = Path(input_pdf_path).stem

# 解析PDF
infer_results, all_image_lists, all_pdf_docs, _, ocr_enabled_list = pipeline_doc_analyze(
[pdf_bytes], [lang], parse_method=parse_method
)

# 准备输出目录
image_dir = os.path.join(output_dir, f"{pdf_name}_images")
os.makedirs(image_dir, exist_ok=True)
image_writer = FileBasedDataWriter(image_dir)
md_writer = FileBasedDataWriter(output_dir)

# 生成中间结果和Markdown
middle_json = pipeline_result_to_middle_json(
infer_results[0], all_image_lists[0], all_pdf_docs[0],
image_writer, lang, ocr_enabled_list[0], True
)
md_content = pipeline_union_make(middle_json["pdf_info"], MakeMode.MM_MD, os.path.basename(image_dir))
# 保存Markdown
md_writer.write_string(f"{pdf_name}_pdf.md", md_content)
logger.info(f"Markdown saved to: {os.path.join(output_dir, f'{pdf_name}_pdf.md')}")

return os.path.join(output_dir,f"{pdf_name}_pdf.md")

def _parse_doc_to_md(input_file: str, output_file: str):
"""
support conversion of doc/ppt/pptx/pdf files to markdowns
"""
try:
from magic_doc.docconv import DocConverter
except:
raise Exception(
"""
Fairy-doc is not installed in this environment yet.
Please refer to https://github.com/opendatalab/magic-doc to install.
Or you can just execute 'apt-get/yum/brew install libreoffice' and 'pip install fairy-doc[gpu]' to fix this error.
please make sure you have gpu on your machine.
"""
)
logger=get_logger()
converter = DocConverter(s3_config=None)
markdown_content, time_cost = converter.convert(input_file, conv_timeout=300)
logger.info("time cost: ", time_cost)
with open(output_file, "w",encoding='utf-8') as f:
f.write(markdown_content)
return output_file

def _parse_xml_to_md(raw_file:str=None, url:str=None, output_file:str=None):
logger=get_logger()
if(url):
downloaded=fetch_url(url)
elif(raw_file):
with open(raw_file, "r", encoding='utf-8') as f:
downloaded=f.read()
else:
raise Exception("Please provide at least one of file path and url string.")

try:
result=extract(downloaded, output_format="markdown", with_metadata=True)
logger.info(f"Extracted content is written into {output_file}")
with open(output_file,"w", encoding="utf-8") as f:
f.write(result)
except Exception as e:
logger.error("Error during extract this file or link: ", e)

return output_file

@OPERATOR_REGISTRY.register()
class KnowledgeExtractor(OperatorABC):
Expand All @@ -14,6 +111,7 @@ class KnowledgeExtractor(OperatorABC):
def __init__(self, intermediate_dir: str = "intermediate", lang: str = "en"):
self.logger = get_logger()
self.intermediate_dir=intermediate_dir
os.makedirs(self.intermediate_dir, exist_ok=True)
self.lang=lang

@staticmethod
Expand Down Expand Up @@ -62,21 +160,6 @@ def run(self, storage:DataFlowStorage ,raw_file=None, url=None):
raw_file_suffix_no_dot=raw_file_suffix.replace(".","")
output_file=os.path.join(self.intermediate_dir,f"{raw_file_name}_{raw_file_suffix_no_dot}.md")
if(raw_file_suffix==".pdf"):
try:
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.utils.enum_class import MakeMode
except:
raise Exception(
"""
MinerU is not installed in this environment yet.
Please refer to https://github.com/opendatalab/mineru to install.
Or you can just execute 'pip install mineru[pipeline]' and 'mineru-models-download' to fix this error.
please make sure you have gpu on your machine.
"""
)
# optional: 是否从本地加载OCR模型
os.environ['MINERU_MODEL_SOURCE'] = "local"
output_file=_parse_pdf_to_md(
Expand All @@ -86,17 +169,6 @@ def run(self, storage:DataFlowStorage ,raw_file=None, url=None):
"txt"
)
elif(raw_file_suffix in [".doc", ".docx", ".pptx", ".ppt"]):
try:
from magic_doc.docconv import DocConverter
except:
raise Exception(
"""
Fairy-doc is not installed in this environment yet.
Please refer to https://github.com/opendatalab/magic-doc to install.
Or you can just execute 'apt-get/yum/brew install libreoffice' and 'pip install fairy-doc[gpu]' to fix this error.
please make sure you have gpu on your machine.
"""
)
if(raw_file_suffix==".docx"):
raise Exception("Function Under Maintaining...Please try .doc format file instead.")
output_file=_parse_doc_to_md(raw_file, output_file)
Expand Down
8 changes: 4 additions & 4 deletions dataflow/operators/generate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
"SQLFilter": (cur_path + "Text2SQL/SQLFilter.py", "SQLFilter"),
"Text2SQLDifficultyClassifier": (cur_path + "Text2SQL/Text2SQLDifficultyClassifier.py", "Text2SQLDifficultyClassifier"),
# KBC
"CorpusTextSplitter": (cur_path + "KnowledgeCleaning/CorpusTextSplitter.py", "CorpusTextSplitter"),
"KnowledgeExtractor": (cur_path + "KnowledgeCleaning/KnowledgeExtractor.py", "KnowledgeExtractor"),
"KnowledgeCleaner": (cur_path + "KnowledgeCleaning/KnowledgeCleaner.py", "KnowledgeCleaner"),
"MultiHopQAGenerator": (cur_path + "KnowledgeCleaning/MultiHopQAGenerator.py", "MultiHopQAGenerator"),
"CorpusTextSplitter": (cur_path + "KnowledgeCleaning/corpus_text_splitter.py", "CorpusTextSplitter"),
"KnowledgeExtractor": (cur_path + "KnowledgeCleaning/knowledge_extractor.py", "KnowledgeExtractor"),
"KnowledgeCleaner": (cur_path + "KnowledgeCleaning/knowledge_cleaner.py", "KnowledgeCleaner"),
"MultiHopQAGenerator": (cur_path + "KnowledgeCleaning/multihop_qa_generator.py", "MultiHopQAGenerator"),
"AutoPromptGenerator": (cur_path + "AgenticRAG/AutoPromptGenerator.py", "AutoPromptGenerator"),
"QAScorer": (cur_path + "AgenticRAG/QAScorer.py", "QAScorer"),
"QAGenerator": (cur_path + "AgenticRAG/QAGenerator.py", "QAGenerator"),
Expand Down
73 changes: 0 additions & 73 deletions dataflow/utils/kbcleaning.py

This file was deleted.