Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 0 additions & 63 deletions .github/workflows/validate.yaml

This file was deleted.

20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# .pre-commit-config.yaml
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.8
hooks:
- id: ruff
args: [--fix]
- id: ruff-format

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.13.0
hooks:
- id: mypy
1 change: 1 addition & 0 deletions .woodpecker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This folder contains scripts for running tests on our own CI/CD platform, which is based on Woodpecker CI and hosted at the USTC Knowledge Computing Laboratory. These scripts may contain many hard-coded absolute URLs, so they should not be used elsewhere.
50 changes: 50 additions & 0 deletions .woodpecker/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
when:
- event: pull_request

variables:
- &pip_config
PIP_INDEX_URL: https://mirrors.ustc.edu.cn/pypi/simple
- &pre_commit_config
PRE_COMMIT_HOME: /woodpecker/cache/pre-commit
- &minio_config
MINIO_HOST: https://s3.kclab.cloud
MINIO_BUCKET: cache-53030

steps:
- name: open cache
image: minio/mc:latest
environment:
<<: [*pre_commit_config, *minio_config]
MINIO_ACCESS_KEY:
from_secret: minio_access_key
MINIO_SECRET_KEY:
from_secret: minio_secret_key
commands:
- mkdir --parents $${PRE_COMMIT_HOME}
- touch $${PRE_COMMIT_HOME}/.keep
- mc alias set minio $${MINIO_HOST} $${MINIO_ACCESS_KEY} $${MINIO_SECRET_KEY}
- mc cp --recursive minio/$${MINIO_BUCKET}/${CI_REPO}/pre-commit/ $${PRE_COMMIT_HOME}/
failure: ignore

- name: pre-commit
image: python:3.12
environment:
<<: [*pre_commit_config, *pip_config]
commands:
- pip install pre-commit
- pip install '.[dev]'
- pre-commit run --all-files

- name: save cache
image: minio/mc:latest
environment:
<<: [*pre_commit_config, *minio_config]
MINIO_ACCESS_KEY:
from_secret: minio_access_key
MINIO_SECRET_KEY:
from_secret: minio_secret_key
commands:
- mc alias set minio $${MINIO_HOST} $${MINIO_ACCESS_KEY} $${MINIO_SECRET_KEY}
- mc cp --recursive $${PRE_COMMIT_HOME}/ minio/$${MINIO_BUCKET}/${CI_REPO}/pre-commit/
when:
event: push
11 changes: 5 additions & 6 deletions enhancers/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from typing import Any

from openai import AsyncOpenAI
from openai.types.chat import ParsedChatCompletionMessage
from pydantic import BaseModel
from tenacity import retry, stop_after_attempt, wait_exponential

from parsers.base_models import ChunkData
from parsers.base_models import DataItem

MAX_RETRIES = 3
WAIT_TIME = 4
Expand All @@ -24,12 +25,12 @@ def __init__(self, model_name: str, base_url: str, api_key: str):
self.system_prompt = "You are a helpful assistant."

@abstractmethod
async def enhance(self, information: ChunkData) -> ChunkData:
async def enhance(self, information: DataItem) -> DataItem:
"""增强信息"""
pass

@retry(stop=stop_after_attempt(MAX_RETRIES), wait=wait_exponential(multiplier=MULTIPLIER, min=WAIT_TIME, max=WAIT_MAX_TIME))
async def get_structured_response(self, user_prompt: list[dict[str, Any]], response_format: JsonResponseFormat) -> str|None:
async def get_structured_response(self, user_prompt: list[dict[str, Any]], response_format: JsonResponseFormat) -> ParsedChatCompletionMessage:
"""获取结构化响应"""
response = await self.client.chat.completions.parse(
model=self.model_name,
Expand All @@ -39,6 +40,4 @@ async def get_structured_response(self, user_prompt: list[dict[str, Any]], respo
],
response_format=response_format # type: ignore
)
if response.choices[0].message.refusal:
raise ValueError(f"模型拒绝了请求: {response.choices[0].message.refusal}")
return response.choices[0].message.parsed
return response.choices[0].message
17 changes: 14 additions & 3 deletions parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
# Parsers package

from .base_models import ChunkData, ChunkType, DocumentData, DocumentParser
from .base_models import (
ChunkType,
DocumentData,
DocumentParser,
FormulaDataItem,
ImageDataItem,
TableDataItem,
TextDataItem,
)
from .parser_registry import (
PARSER_REGISTRY,
get_parser,
Expand All @@ -10,10 +18,13 @@
)

__all__ = [
'ChunkType',
'DocumentData',
'DocumentParser',
'ChunkData',
'ChunkType',
'TableDataItem',
'TextDataItem',
'ImageDataItem',
'FormulaDataItem',
'PARSER_REGISTRY',
'register_parser',
'get_parser',
Expand Down
33 changes: 17 additions & 16 deletions parsers/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@ class ChunkType(str, Enum):
TABLE = "table"
FORMULA = "formula"

class TableDataItem(BaseModel):
class DataItem(BaseModel):
"""数据项基类"""
type: str|None = None # 数据项类型
name: str|None = None # 数据项名称

class TableDataItem(DataItem):
"""表格数据类"""
rows: int # 行数
columns: int # 列数
Expand All @@ -26,39 +31,35 @@ class TableDataItem(BaseModel):
data: list[list[str]] = Field(default_factory=list) # 数据
caption: list[str] = Field(default_factory=list) # 表格标题
footnote: list[str] = Field(default_factory=list) # 表格注脚
description: str|None = None # 表格描述

class TextDataItem(BaseModel):
class TextDataItem(DataItem):
"""文本数据类"""
text: str # 文本
text_level: int|None = None # 文本级别

class ImageDataItem(BaseModel):
class ImageDataItem(DataItem):
"""图片数据类"""
uri: str|None = None # 图片 URI
caption: list[str] = Field(default_factory=list) # 图片标题
footnote: list[str] = Field(default_factory=list) # 图片注脚
description: str|None = None # 图片描述

class FormulaDataItem(BaseModel):
class FormulaDataItem(DataItem):
"""公式数据类"""
text: str # 公式
text_format: str|None = None # 公式格式

class ChunkData(BaseModel):
"""块数据类"""
type: ChunkType
name: str|None = None
content: TableDataItem|TextDataItem|ImageDataItem|FormulaDataItem
description: str|None = None
description: str|None = None # 公式描述

class DocumentData(BaseModel):
"""解析结果类"""
title: str|None = None
texts: list[ChunkData] = Field(default_factory=list)
tables: list[ChunkData] = Field(default_factory=list)
images: list[ChunkData] = Field(default_factory=list)
formulas: list[ChunkData] = Field(default_factory=list)
texts: list[TextDataItem] = Field(default_factory=list)
tables: list[TableDataItem] = Field(default_factory=list)
images: list[ImageDataItem] = Field(default_factory=list)
formulas: list[FormulaDataItem] = Field(default_factory=list)
processing_time: float = 0
success: bool
success: bool = False
error_message: str | None = None

class DocumentParser(ABC):
Expand Down
Loading