diff --git a/ragflows/api.py b/ragflows/api.py
index ce500f5..726565d 100644
--- a/ragflows/api.py
+++ b/ragflows/api.py
@@ -4,6 +4,8 @@
 # date:2024-08-23 16:46
 # describe:
 
+import json
+import os
 import time
 import requests
 from ragflows import configs, ragflowdb
@@ -177,3 +179,70 @@ def parse_chunks_with_check(filename, doc_id=None):
 def is_succeed(response):
     # 20250208:增加对code字段的判断,因为新版ragflow返回字段名由retcode改为code了,保留retcode兼容旧版ragflow
     return response.get("retcode") == 0 or response.get("code") == 0
+
+# @timeutils.monitor
+def set_document_metadata(doc_id, filepath) -> bool:
+    """Send the metadata file stored next to an uploaded file to the server.
+
+    The metadata path is ``filepath`` with its extension removed and
+    ``configs.METADATA_SUFFIX`` appended. The stripped file text is posted
+    as the ``meta`` string to ``{configs.API_URL}/document/set_meta``.
+
+    Args:
+        doc_id (str): ID of the document to update.
+        filepath (str): Path of the uploaded file; used only to locate
+            the metadata file.
+
+    Returns:
+        bool: True when ``is_succeed`` accepts the response; False when no
+        suffix is configured, an input is missing/invalid, or posting fails.
+    """
+    # The feature is opt-in: an empty suffix disables it.
+    if not configs.METADATA_SUFFIX:
+        return False
+
+    if not doc_id:
+        timeutils.print_log('设置文档元数据失败: doc_id为空,跳过')
+        return False
+
+    # Swap the original file extension for the metadata suffix.
+    metadata_filepath = os.path.splitext(filepath)[0] + configs.METADATA_SUFFIX
+
+    # Open directly instead of testing for existence first, so the file cannot
+    # vanish between the check and the read. 'utf-8-sig' also accepts files
+    # saved with a BOM, which json.loads would otherwise reject.
+    try:
+        with open(metadata_filepath, 'r', encoding='utf-8-sig') as f:
+            metadata = f.read().strip()
+    except (FileNotFoundError, NotADirectoryError):
+        timeutils.print_log(f'元数据文件不存在,跳过: {metadata_filepath}')
+        return False
+    except (OSError, ValueError) as e:
+        timeutils.print_log(f'设置文档元数据失败: 读取元数据文件出错,跳过: {e}')
+        return False
+
+    # Validation only: the parsed value is discarded, the raw text is sent.
+    try:
+        json.loads(metadata)
+    except ValueError:
+        timeutils.print_log('设置文档元数据失败: metadata不是json格式,跳过')
+        return False
+
+    url = f"{configs.API_URL}/document/set_meta"
+    data = {
+        "doc_id": doc_id,
+        "meta": metadata,
+    }
+
+    # Best effort: any request or response-decoding failure is logged and
+    # reported as False instead of propagating to the caller.
+    try:
+        r = requests.post(url, json=data, headers=configs.get_header())
+        if is_succeed(r.json()):
+            timeutils.print_log(f'设置文档元数据成功: {doc_id}')
+            return True
+        timeutils.print_log(f'设置文档元数据失败:{doc_id},{r.text}')
+        return False
+    except Exception as e:
+        timeutils.print_log(f'设置文档元数据失败: 请求异常,跳过: {e}')
+        return False
diff --git
a/ragflows/configs.demo.py b/ragflows/configs.demo.py index e07a9aa..12c46aa 100644 --- a/ragflows/configs.demo.py +++ b/ragflows/configs.demo.py @@ -37,5 +37,8 @@ # 首次上传后解析文件的等待时间 FIRST_PARSE_WAIT_TIME = 0 +# 元数据后缀,需要跟上传文件放在同一目录,json格式。只有当该配置不为空时才会自动添加/更新元数据信息 +METADATA_SUFFIX = '' # 例如:.meta.json + def get_header(): return {'authorization': AUTHORIZATION} \ No newline at end of file diff --git a/ragflows/main.py b/ragflows/main.py index 4147440..5f75394 100644 --- a/ragflows/main.py +++ b/ragflows/main.py @@ -108,6 +108,11 @@ def main(): continue file_path = doc_files[i] + + # 如果配置了元数据后缀,且文件是元数据后缀,则跳过 + if configs.METADATA_SUFFIX and file_path.endswith(configs.METADATA_SUFFIX): + continue + file_path = file_path.replace(os.sep, '/') filename = os.path.basename(file_path) @@ -126,6 +131,11 @@ def main(): # 如果文件已存在,则判断是否已经对文件进行了切片解析 if ragflowdb.exist_name(filename): doc_item = ragflowdb.get_doc_item_by_name(filename) + + # 检查配置并更新元数据 + doc_id = doc_item.get('id') + api.set_document_metadata(doc_id, file_path) + if configs.ONLY_UPLOAD: timeutils.print_log(f"{file_path} 已存在,跳过\n") elif doc_item.get('progress') == 1: @@ -149,6 +159,12 @@ def main(): timeutils.print_log(f'{file_path} 上传失败:{response.get("text")}') continue + # 解析doc_id + doc_id = response.get('data')[0].get('id') if response.get('data') else None + + # 检查配置并更新元数据 + api.set_document_metadata(doc_id, file_path) + # 仅上传,跳过切片解析 if configs.ONLY_UPLOAD: continue @@ -161,7 +177,6 @@ def main(): # 上传成功,开始切片 timeutils.print_log(f'{file_path},开始切片并等待解析完毕') - doc_id = response.get('data')[0].get('id') if response.get('data') else None status = api.parse_chunks_with_check(filename, doc_id) timeutils.print_log(file_path, "切片状态:", status, "\n") diff --git a/scripts/launcher.py b/scripts/launcher.py index e2ccae3..37e57a6 100644 --- a/scripts/launcher.py +++ b/scripts/launcher.py @@ -81,7 +81,7 @@ def __init__(self): self.geometry("800x750") # 版本和仓库信息 - self.version = "v1.0.4" # 版本号 + self.version = "v1.0.5-alpha" # 
版本号 self.github_repo = "https://github.com/Samge0/ragflow-upload" # GitHub仓库地址 # 自定义图标 @@ -112,6 +112,7 @@ def __init__(self): "ONLY_UPLOAD": {"type": bool, "label": "仅上传文件", "default": "False"}, "ENABLE_PROGRESS_LOG": {"type": bool, "label": "打印切片进度日志", "default": "True"}, "UI_START_INDEX": {"type": int, "label": "起始文件序号", "default": "1"}, # 从1开始计数,更符合非编程用户习惯 + "METADATA_SUFFIX": {"type": str, "label": "元数据文件后缀", "default": ""}, } self.create_ui() diff --git a/scripts/requirements.txt b/scripts/requirements.txt index b1cb6e4..66e3886 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,3 @@ customtkinter==5.2.2 pyinstaller==6.13.0 -Pillow==11.2.1 \ No newline at end of file +Pillow==11.3.0 \ No newline at end of file