Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions ragflows/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# date:2024-08-23 16:46
# describe:

import json
import os
import time
import requests
from ragflows import configs, ragflowdb
Expand Down Expand Up @@ -177,3 +179,70 @@ def parse_chunks_with_check(filename, doc_id=None):
def is_succeed(response):
    """Report whether a ragflow API response dict signals success.

    2025-02-08: the ``code`` field is checked as well, because newer ragflow
    versions renamed the status field from ``retcode`` to ``code``;
    ``retcode`` is still checked to stay compatible with older versions.

    Args:
        response (dict): Decoded JSON body of the API response.

    Returns:
        bool: True when either status field equals 0.
    """
    status_fields = ("retcode", "code")
    return any(response.get(field) == 0 for field in status_fields)

# @timeutils.monitor
def set_document_metadata(doc_id, filepath) -> bool:
    """Set a document's metadata from its sidecar JSON file.

    The sidecar file path is ``filepath`` with its extension removed and
    ``configs.METADATA_SUFFIX`` appended. The file content is validated as
    JSON locally and then posted, as raw text, in the ``meta`` field of the
    ``{configs.API_URL}/document/set_meta`` endpoint.

    Args:
        doc_id (str): ID of the document whose metadata is being set.
        filepath (str): Path of the uploaded file; used to locate the
            "<file name without extension> + metadata suffix" JSON file.

    Returns:
        bool: True only when the API response is accepted by ``is_succeed``.
        False when no metadata suffix is configured, ``doc_id`` is empty,
        the metadata file is missing, unreadable or not valid JSON, or the
        request fails.
    """
    # No metadata suffix configured: the feature is disabled, skip silently.
    if not configs.METADATA_SUFFIX:
        return False

    if not doc_id:
        timeutils.print_log('设置文档元数据失败: doc_id为空,跳过')
        return False

    # Build the metadata file path: drop the original extension, then
    # append the metadata suffix.
    filepath_without_ext = os.path.splitext(filepath)[0]
    metadata_filepath = filepath_without_ext + configs.METADATA_SUFFIX

    # A missing metadata file gets its own log message.
    if not os.path.exists(metadata_filepath):
        timeutils.print_log(f'元数据文件不存在,跳过: {metadata_filepath}')
        return False

    # Read the metadata file. OSError covers I/O failures; ValueError covers
    # decoding failures (UnicodeDecodeError is a ValueError subclass).
    try:
        with open(metadata_filepath, 'r', encoding='utf-8') as f:
            metadata = f.read().strip()
    except (OSError, ValueError) as e:
        timeutils.print_log(f'设置文档元数据失败: 读取元数据文件出错,跳过: {e}')
        return False

    # Validate that the content is JSON before sending it. Only parsing
    # failures are caught, so KeyboardInterrupt/SystemExit are no longer
    # swallowed as they were by the previous bare ``except:``.
    try:
        json.loads(metadata)
    except (ValueError, RecursionError):
        timeutils.print_log('设置文档元数据失败: metadata不是json格式,跳过')
        return False

    # Send the metadata; "meta" carries the raw JSON text, not a parsed object.
    url = f"{configs.API_URL}/document/set_meta"
    data = {
        "doc_id": doc_id,
        "meta": metadata,
    }

    # Best-effort boundary: any network or response-decoding error is logged
    # and reported as a failure instead of propagating to the caller.
    try:
        r = requests.post(url, json=data, headers=configs.get_header())

        if is_succeed(r.json()):
            timeutils.print_log(f'设置文档元数据成功: {doc_id}')
            return True

        timeutils.print_log(f'设置文档元数据失败:{doc_id},{r.text}')
        return False

    except Exception as e:
        timeutils.print_log(f'设置文档元数据失败: 请求异常,跳过: {e}')
        return False


3 changes: 3 additions & 0 deletions ragflows/configs.demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,8 @@
# 首次上传后解析文件的等待时间
FIRST_PARSE_WAIT_TIME = 0

# 元数据后缀,需要跟上传文件放在同一目录,json格式。只有当该配置不为空时才会自动添加/更新元数据信息
METADATA_SUFFIX = '' # 例如:.meta.json

def get_header():
    """Build the HTTP headers for API requests.

    Returns:
        dict: A single ``authorization`` entry holding the module-level
        ``AUTHORIZATION`` value.
    """
    headers = dict(authorization=AUTHORIZATION)
    return headers
17 changes: 16 additions & 1 deletion ragflows/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ def main():
continue

file_path = doc_files[i]

# 如果配置了元数据后缀,且文件是元数据后缀,则跳过
if configs.METADATA_SUFFIX and file_path.endswith(configs.METADATA_SUFFIX):
continue

file_path = file_path.replace(os.sep, '/')
filename = os.path.basename(file_path)

Expand All @@ -126,6 +131,11 @@ def main():
# 如果文件已存在,则判断是否已经对文件进行了切片解析
if ragflowdb.exist_name(filename):
doc_item = ragflowdb.get_doc_item_by_name(filename)

# 检查配置并更新元数据
doc_id = doc_item.get('id')
api.set_document_metadata(doc_id, file_path)

if configs.ONLY_UPLOAD:
timeutils.print_log(f"{file_path} 已存在,跳过\n")
elif doc_item.get('progress') == 1:
Expand All @@ -149,6 +159,12 @@ def main():
timeutils.print_log(f'{file_path} 上传失败:{response.get("text")}')
continue

# 解析doc_id
doc_id = response.get('data')[0].get('id') if response.get('data') else None

# 检查配置并更新元数据
api.set_document_metadata(doc_id, file_path)

# 仅上传,跳过切片解析
if configs.ONLY_UPLOAD:
continue
Expand All @@ -161,7 +177,6 @@ def main():

# 上传成功,开始切片
timeutils.print_log(f'{file_path},开始切片并等待解析完毕')
doc_id = response.get('data')[0].get('id') if response.get('data') else None
status = api.parse_chunks_with_check(filename, doc_id)
timeutils.print_log(file_path, "切片状态:", status, "\n")

Expand Down
3 changes: 2 additions & 1 deletion scripts/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def __init__(self):
self.geometry("800x750")

# 版本和仓库信息
self.version = "v1.0.4" # 版本号
self.version = "v1.0.5-alpha" # 版本号
self.github_repo = "https://github.com/Samge0/ragflow-upload" # GitHub仓库地址

# 自定义图标
Expand Down Expand Up @@ -112,6 +112,7 @@ def __init__(self):
"ONLY_UPLOAD": {"type": bool, "label": "仅上传文件", "default": "False"},
"ENABLE_PROGRESS_LOG": {"type": bool, "label": "打印切片进度日志", "default": "True"},
"UI_START_INDEX": {"type": int, "label": "起始文件序号", "default": "1"}, # 从1开始计数,更符合非编程用户习惯
"METADATA_SUFFIX": {"type": str, "label": "元数据文件后缀", "default": ""},
}

self.create_ui()
Expand Down
2 changes: 1 addition & 1 deletion scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
customtkinter==5.2.2
pyinstaller==6.13.0
Pillow==11.2.1
Pillow==11.3.0