In [1]:
from langchain_ollama import ChatOllama

ollama_llm = ChatOllama(
    model="deepseek-v2",
    temperature=0.7,
    )


In [2]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv, find_dotenv
import os

_=load_dotenv(find_dotenv())
DEEPSEEK_API = os.getenv("DEEPSEEK_API")
BASE_URL = os.getenv("DEEPSEEK_URL")
MODEL_NAME = os.getenv("DEEPSEEK_MODEL")
llm = ChatOpenAI(api_key=DEEPSEEK_API, base_url=BASE_URL, model=MODEL_NAME)

In [3]:
llm.invoke("你好")

AIMessage(content='你好！欢迎使用聊天机器人。有什么我可以帮助你的吗？', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 4, 'total_tokens': 17, 'prompt_cache_hit_tokens': 0, 'prompt_cache_miss_tokens': 4}, 'model_name': 'deepseek-chat', 'system_fingerprint': 'fp_7e0991cad4', 'finish_reason': 'stop', 'logprobs': None}, id='run-21539e85-42fc-47d4-afe8-d410053ff9e1-0', usage_metadata={'input_tokens': 4, 'output_tokens': 13, 'total_tokens': 17})

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

tagging_prompt = ChatPromptTemplate.from_template(
    """
Extract the desired information from the following passage.

Only extract the properties mentioned in the 'Classification' function.

Passage:
{input}
"""
)


class Classification(BaseModel):
    sentiment: str = Field(description="The sentiment of the text")
    aggressiveness: int = Field(
        description="How aggressive the text is on a scale from 1 to 10"
    )
    language: str = Field(description="The language the text is written in")


# llm=llm.with_structured_output(Classification)
tagging_chain = tagging_prompt | llm

In [5]:
inp = "Estoy increiblemente contento de haberte conocido! Creo que seremos muy buenos amigos!"
tagging_chain.invoke({"input": inp})

AIMessage(content="The passage provided does not contain any information related to a 'Classification' function or properties. It is a personal message expressing happiness about meeting someone and the expectation of becoming good friends.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 36, 'prompt_tokens': 56, 'total_tokens': 92, 'prompt_cache_hit_tokens': 0, 'prompt_cache_miss_tokens': 56}, 'model_name': 'deepseek-chat', 'system_fingerprint': 'fp_7e0991cad4', 'finish_reason': 'stop', 'logprobs': None}, id='run-43330c53-5839-4463-a951-37bb4fcf48ff-0', usage_metadata={'input_tokens': 56, 'output_tokens': 36, 'total_tokens': 92})

In [6]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )

In [7]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [8]:
prompt_performance_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text},现在日期是{date}"),
    ]
)

In [9]:
import sys
sys.path.append('..')  # 将 src 目录添加到 PYTHONPATH  # 假设当前工作目录是notebook目录
print(sys.path)

['/home/yepeng/ChatBI/chat-bi/src/notebook', '/home/yepeng/miniconda3/envs/chatbi/lib/python312.zip', '/home/yepeng/miniconda3/envs/chatbi/lib/python3.12', '/home/yepeng/miniconda3/envs/chatbi/lib/python3.12/lib-dynload', '', '/home/yepeng/miniconda3/envs/chatbi/lib/python3.12/site-packages', '/home/yepeng/miniconda3/envs/chatbi/lib/python3.12/site-packages/setuptools/_vendor', '..']


In [10]:
from entity.extraction import PerformanceQuerySchema

In [11]:


runnable =prompt_performance_template | llm.with_structured_output(schema=PerformanceQuerySchema)

In [12]:
# text = "去年国贸能化公司的利润率"
# runnable.invoke({"text": text,"date":"2024-08-25"}).dict()

{'indicator': 'GROSS_MARGIN_RATE',
 'aggregation': 'YEAR',
 'start_time': '2023-01-01',
 'end_time': '2023-12-31',
 'scope': 'COMPANY',
 'sort_type': None,
 'operator': None,
 'value': None,
 'company_name': '国贸能化公司'}

In [13]:
from entity.extraction_example import Example, tool_example_to_messages
from entity.extraction import *

examples = [
    (
        "去年集团利润率为负的公司,当前日期是2024-08-25，查询用户为集团用户",
        PerformanceQuerySchema(
            indicator="GROSS_MARGIN_RATE",
            aggregation="YEAR",
            start_time="2023-01-01",
            end_time="2023-12-31",
            scope="GROUP",
            sort_type="DESC",
            operator="<",
            value="0",
        ),
    ),
    (
        "国贸能化公司今年上半年的销售额大于1000万的月份,当前日期是2024-08-25",
        PerformanceQuerySchema(
            indicator="SALES",
            aggregation="MONTH",
            start_time="2024-01-01",
            end_time="2024-06-30",
            scope="GROUP",
            sort_type="DESC",
            operator=">",
            value="10000000",
            company_name="国贸能化",
        ),
    ),
]


messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
    )

In [14]:
example_prompt = prompt.invoke({"text": "this is some text", "examples": messages})

for message in example_prompt.messages:
    print(f"{message.type}: {message}")

system: content="You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."
human: content='this is some text'


In [16]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked "
            "to extract, return null for the attribute's value.",
        ),
        # ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
        MessagesPlaceholder("examples"),  # <-- EXAMPLES!
        # ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
        MessagesPlaceholder("company_name_example"),
        ("human", "{text}"),
    ]
)

In [17]:
runnable_with_examples = prompt | llm.with_structured_output(
    schema=PerformanceQuerySchema,
    method="function_calling",
    include_raw=False,
)

messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
    )

In [18]:
messages

[HumanMessage(content='去年集团利润率为负的公司,当前日期是2024-08-25，查询用户为集团用户'),
 AIMessage(content='', tool_calls=[{'name': 'PerformanceQuerySchema', 'args': {'indicator': 'GROSS_MARGIN_RATE', 'aggregation': 'YEAR', 'start_time': '2023-01-01', 'end_time': '2023-12-31', 'scope': 'GROUP', 'sort_type': 'DESC', 'operator': '<', 'value': '0', 'company_name': None}, 'id': '7ed4a8f9-05a0-4870-8214-7e239abd3fc0', 'type': 'tool_call'}]),
 ToolMessage(content='You have correctly called this tool.', tool_call_id='7ed4a8f9-05a0-4870-8214-7e239abd3fc0'),
 HumanMessage(content='国贸能化公司今年上半年的销售额大于1000万的月份,当前日期是2024-08-25'),
 AIMessage(content='', tool_calls=[{'name': 'PerformanceQuerySchema', 'args': {'indicator': 'SALES', 'aggregation': 'MONTH', 'start_time': '2024-01-01', 'end_time': '2024-06-30', 'scope': 'GROUP', 'sort_type': 'DESC', 'operator': '>', 'value': '10000000', 'company_name': '国贸能化'}, 'id': '1b06fa36-52f1-4d3e-8526-f7fc1c35f103', 'type': 'tool_call'}]),
 ToolMessage(content='You have correctly called 

In [26]:
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
company_name_examples=[HumanMessage("company name examples:湖北国贸能源化工有限公司,湖北国贸金属矿产有限公司,湖北国贸汽车有限公司,湖北国际贸易集团有限公司,湖北国贸农产品有限公司,武汉鼎联丰国际贸易有限公司,湖北国贸农产品有限公司武汉分公司,湖北南方大集实业有限公司,湖北南方大集实业有限公司东西湖分公司,湖北南方大集实业有限公司慈惠分公司,湖北南方大集实业有限公司江汉分公司,湖北南方大集实业有限公司能源分公司,湖北南方工贸有限公司,湖北南方集团有限公司,湖北国贸供应链管理有限公司,湖北华中能源发展有限公司,湖北国贸汽车有限公司红安分公司,company_name如果要取值，提取后的名称必须从例子里选择，如果没有相符的公司名则返回company_name='company_name_not_found'")]

In [27]:
text = "去年集团利润率为负的公司,当前日期是2024-08-25，查询用户为集团用户"
print(runnable_with_examples.invoke({"text": text, "examples": messages,"company_name_example": company_name_examples}))


indicator='GROSS_MARGIN_RATE' aggregation='YEAR' start_time='2023-01-01' end_time='2023-12-31' scope='GROUP' sort_type='DESC' operator='<' value='0' company_name='company_name_not_found'


In [29]:
text = "今年国贸供应链公司的销售额,当前日期是2024-08-25，查询用户为集团用户"
print(runnable_with_examples.invoke({"text": text, "examples": messages,"company_name_example": company_name_examples}))

indicator='SALES' aggregation='YEAR' start_time='2024-01-01' end_time='2024-12-31' scope='COMPANY' sort_type=None operator=None value=None company_name='湖北国贸供应链管理有限公司'
