本地数据加载

In [None]:
import os
import glob
from langchain.document_loaders import TextLoader


# Resolve the glob pattern to the list of JSON files to ingest.
json_files = glob.glob("your_path")
# Accumulate the documents of every file into one flat list.
documents = []
for path in json_files:
    # Each file is read as plain text; load() returns a list of documents.
    documents += TextLoader(path).load()

文档分割

In [None]:
# Document splitting
from langchain.text_splitter import CharacterTextSplitter

# Build a splitter with a chunk size of 128 and no overlap between chunks,
# then replace the loaded documents with their split chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=128,
)
documents = text_splitter.split_documents(documents)

数据向量化&入库

In [None]:
# 接下来对分割后的数据进行embedding，并写入数据库。这里选用
# m3e-base作为embedding模型，向量数据库选用Chroma

from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
import re
import json

# Embedding model configuration. `model_name` is a placeholder for the model
# path (the original note names m3e-base as the intended model).
model_name = "your_path"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
embedding = HuggingFaceBgeEmbeddings(
    query_instruction="为文本生成向量表示用于文本检索",
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Embed the split documents and index them in a Chroma vector store.
db_tem = Chroma.from_documents(documents, embedding)

构建生成函数

In [None]:
import requests
# URL of the ChatGPT API endpoint that generate() POSTs its requests to
# (placeholder value; fill in before running).
gpt_api_url = "******"
# Application name sent as the "app_name" field of every request built by
# generate() (placeholder value; fill in before running).
app_name = "******"

def generate(prompt, max_tokens=4096, temperature=0.0, model="gpt-4"):
    """Send ``prompt`` to a language model and return the generated text.

    Chat models ("gpt-3.5-turbo" and "gpt-4") are called by POSTing a task
    to ``gpt_api_url``. Any other model name goes through the legacy
    ``openai.Completion`` API with ``max_tokens`` capped at 1024. Either
    path is attempted up to three times.

    Args:
        prompt: Text sent as the single user message (chat models) or as the
            completion prompt (other models).
        max_tokens: Upper bound on generated tokens; capped at 1024 on the
            completion path.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the model to call.

    Returns:
        The generated text.

    Raises:
        ImportError: If a completion model is requested and the ``openai``
            package is not installed.
        Exception: "Failed to generate" when all three attempts fail; the
            last underlying error is attached as the cause.
    """
    attempts = 3
    last_error = None

    if model in ("gpt-3.5-turbo", "gpt-4"):
        task = {
            "app_name": app_name,
            "data": {
                "model": model,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "messages": [{"role": "user", "content": prompt}],
            },
        }
        for _ in range(attempts):
            try:
                response = requests.post(gpt_api_url, json=task).json()
                return response['data']['choices'][0]['message']['content']
            except Exception as e:
                # Network errors and malformed responses are both retried.
                print(f"API request error: {e}")
                last_error = e
        raise Exception("Failed to generate") from last_error

    # Only this path needs the OpenAI SDK, so it is imported here; the file
    # has no top-level import for it. Importing outside the retry loop lets a
    # missing package surface immediately instead of being retried.
    import openai

    # For older models, use the completion API with max_tokens capped at 1024.
    params = {
        "model": model,
        "max_tokens": min(max_tokens, 1024),
        "temperature": temperature,
        "prompt": prompt,
    }
    for _ in range(attempts):
        try:
            return openai.Completion.create(**params)["choices"][0]["text"]
        except Exception as e:
            print(f"API request error: {e}")
            last_error = e
    # Fail loudly, like the chat path, instead of silently returning None.
    raise Exception("Failed to generate") from last_error

生成 Task Planning

In [None]:
# The user's question that drives this ReAct run.
input_query = '''Write a description of the apartment "Lovely Room 1, Garden, Best Area, Legal rental" and specify how much it costs to stay there for a month!'''
# Few-shot ReAct prompt template: an instruction, two worked
# Thought/Action/Observation examples, and a trailing `{question}` slot that is
# filled in below with str.format.
# NOTE(review): the first example's Thought 2 points at "Airbnb_Open_Data.xlsx"
# while every other path in the examples ends in ".csv" — confirm which file
# is intended.
REACT_INSTRUCTION = """You're an AI assistant who needs to help users with complex question. We can start by breaking this problem down into simple tasks. Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Actions: 
You may take as many steps as necessary.
Here are some examples:
Question: Write an announcement about housing in the apartment "Skylit Midtown Castle" and specify how much it costs for a month's stay.
Thought 1: I need to break down the question into sub-tasks.
Action 1: Decompose[question]
Observation 1: \nTask_1:I need to inquire about the apartment "Skylit Midtown Castle".\nTask_2:Get the price of the apartment for one night or one month from the info. If it's for one night, multiply the unit price by thirty. \nTask_3:Write a housing announcement and give one month's rent based on the referenced information.
Thought 2: I need to fix Task_1 first.Read the table in path "/raid/shnu/MTM/Tool_Iteration/data/airbnb/Airbnb_Open_Data.xlsx" to get the information of "Skylit Midtown Castle". Maybe I can solve this with the above Python code, or I can write a python program to solve this problem, and the field name is "NAME". The number of fields for its related information is large, and I hope it is possible to print all the information. And would like to add this line of code "pd.options.display.max_columns = 50".
Action 2: Get[Tool] Or Maker[Tool]
Observation 2: ans_Tool_1
Thought 3: Now use the tool to load the path "/raid/shnu/MTM/Tool_Iteration/data/airbnb/Airbnb_Open_Data.csv" to get the reference corpus and find the "Skylit Midtown Castle" apartment and get information about it. If the tool is not available, recreate the tool.
Action 3: Get[knowledge]
Observation 3: ans_Knowledge
Thought 4: Extract one month's or single day's rent based on the information obtained in the previous step.
Action 4: Extract[data]
Observation 4: ans_data
Thought 5: Search for a calculator tool or write a calculator function in python.
Action 5: Get[Tool] Or Maker[Tool]
Observation 5: ans_Tool_2
Thought 6: Use this function to calculate one month's rent for the apartment "Skylit Midtown Castle". If the tool does not solve the task at hand you can try to optimize the tool, but the optimization presupposes that the tool can still have the previous functionality.
Action 6: Calculator[Money] Or Refine[Tool]
Observation 6: ans_Money
Thought 7: Write a notice on housing in the apartment "Skylit Midtown Castle" and calculate the monthly rent according to 'ans_Knowledge'.
Action 7: Write[Notice]
Observation 7: ans_Notice
Thought 8: After performing the above steps, the task is completed
Action 8: Finish[Task]

Question: Write a note about housing in the "BlissArtsSpace!" apartments and give a clear idea of how much it costs to stay there for a month.
Thought 1: I need to break down the question into sub-tasks.
Action 1: Decompose[question]
Observation 1: \nTask_1:I need to inquire about the apartment "BlissArtsSpace!".\nTask_2:Get the price of the apartment for one night or one month from the info. If it's for one night, multiply the unit price by thirty. \nTask_3:Write a housing announcement and give one month's rent based on the referenced information.
Thought 2: I need to fix Task_1 first.Read the table in path "/raid/shnu/MTM/Tool_Iteration/data/airbnb/Airbnb_Open_Data.csv" to get the information of "BlissArtsSpace!". Maybe I can solve this with the above Python code, or I can write a python program to solve this problem,  and the field name is "NAME". The number of fields for its related information is large, and I hope it is possible to print all the information. And would like to add this line of code "pd.options.display.max_columns = 50".
Action 2: Get[Tool] Or Maker[Tool]
Observation 2: ans_Tool_1
Thought 3: Now use the tool to load the path "/raid/shnu/MTM/Tool_Iteration/data/airbnb/Airbnb_Open_Data.csv" to get the reference corpus and find the "BlissArtsSpace!" apartment and get information about it. If the tool is not available, recreate the tool.
Action 3: Get[knowledge]
Observation 3: ans_Knowledge
Thought 4: Extract one month's or single day's rent based on the information obtained in the previous step.
Action 4: Extract[data]
Observation 4: ans_data
Thought 5: Search for a calculator tool or write a calculator function in python.
Action 5: Get[Tool] Or Maker[Tool]
Observation 5: ans_Tool_2
Thought 6: Use this function to calculate one month's rent for the apartment "BlissArtsSpace!". If the tool does not solve the task at hand you can try to optimize the tool, but the optimization presupposes that the tool can still have the previous functionality.
Action 6: Calculator[Money] Or Refine[Tool]
Observation 6: ans_Money
Thought 7: Write a notice on housing in the apartment "BlissArtsSpace!" and calculate the monthly rent according to 'ans_Knowledge'.
Action 7: Write[Notice]
Observation 7: ans_Notice
Thought 8: After performing the above steps, the task is completed
Action 8: Finish[Task]
(END OF EXAMPLES)
Question: {question}"""
# Fill the user's question into the template and ask the model for a plan.
prompt = REACT_INSTRUCTION.format(question=input_query)
result_tem = generate(prompt=prompt)
# Bare expression so the notebook displays the generated plan.
result_tem

根据子任务生成解决方案

In [None]:
# The text of "Thought 2" in the generated plan (everything up to the
# "Action 2" line) is reused verbatim as the prompt for building the first tool.
thought_2_pattern = re.compile(r"Thought 2:(.*?)\nAction 2", re.DOTALL)
tool_1_prompt = thought_2_pattern.findall(result_tem)[0]
tool_1_result_tem = generate(prompt=tool_1_prompt)
# Bare expression so the notebook displays the model's reply.
tool_1_result_tem

使用工具解决子任务

In [None]:
# Extract the first fenced ```python block from the model's reply. The match
# is non-greedy and does not require blank lines after the closing fence, so a
# reply that ends right after the fence, or that contains several code
# blocks, still yields one clean snippet.
tool_1_code_blocks = re.findall(r"```python\n(.*?)\n```", tool_1_result_tem, re.DOTALL)
if not tool_1_code_blocks:
    raise ValueError("No ```python code block found in the model reply (tool_1_result_tem)")
tool_1_result = tool_1_code_blocks[0]
# NOTE(review): this turns every single quote into a double quote, which would
# corrupt code containing apostrophes inside strings — confirm it is intended.
tool_1_result = tool_1_result.replace("\'", "\"")

# SECURITY: this executes model-generated code with full access to the
# notebook process; only run it in an environment you are willing to expose.
# A single namespace is used for both globals and locals: with two separate
# dicts, exec runs the code as if it were a class body, and functions defined
# by the generated code cannot see its own imports or helpers.
tool_1_namespace = dict(globals())
# Drop any stale notebook-level `row` so only a freshly produced one is read.
tool_1_namespace.pop("row", None)
exec(tool_1_result, tool_1_namespace)
if "row" not in tool_1_namespace:
    raise KeyError("The generated tool did not define the expected variable 'row'")
data_tem = tool_1_namespace['row']

根据需要寻找工具

In [None]:
# Query used to look up a reusable tool in the vector store. It lives in its
# own variable so that `input_query` — the user's original question, which the
# final ReAct transcript is prefixed with — is not overwritten.
tool_query = "Search for a calculator tool or write a calculator function in python."
# similarity search: fetch the three closest stored chunks
sim_message = db_tem.similarity_search(tool_query, k=3)

# extracting tool
# Take, from the string form of the best hit, the text between the last
# occurrence of "content" and the next "}".
content = re.findall(r"content(.*?)}", str(sim_message[0]), re.DOTALL)[-1]
# Undo the escaping of newlines, backslashes and single quotes, then trim the
# first 4 and last 2 characters.
# NOTE(review): the trim presumably removes JSON delimiters around the value —
# confirm against the stored file format.
wrapper = content.replace("\\\\n","\n").replace("\\\\","\\").replace("\\'", "'")[4:-2]
# Everything after the "Use cases:" heading (kept for reference).
example = re.findall(r"Use cases:\n\n(.*)", wrapper, re.DOTALL)
# Source code of the tool: the first fenced python block in the entry.
tool_2_func = re.findall(r"```python\n(.*?)\n```", wrapper, re.DOTALL)[0]

提取价格信息

In [None]:
# Ask the model to pull the price-related fields out of the retrieved listing
# data; the listing is embedded into the prompt via its string form.
ans_data_prompt = f"请你从这些数据：{data_tem!s}提取出这间旅店的价格相关信息！，包括“price”、“service”和“minimum nights”！"
ans_data = generate(prompt=ans_data_prompt)
# Bare expression so the notebook displays the extracted details.
ans_data

优化并验证工具

In [None]:
# One-shot prompt: a worked example of turning price details into two
# arithmetic expressions, followed by the price details extracted for this
# listing.
expression_prompt = "有一个任务是从相关信息中提取表达式，例如：\n这间旅店的价格相关信息如下：\n价格（price）：$340\n服务费（service fee）：$168\n最少入住晚数（minimum nights）：9.0晚。\n表达式1：340*30+168;\n表达式2：340*9+168\n" + ans_data
expression = generate(prompt=expression_prompt)
ans_expression_1 = re.findall(r"表达式1：(.*);\n表达式2", expression, re.DOTALL)
ans_expression_2 = re.findall(r"表达式2：(.*)", expression, re.DOTALL)
# Refining the tool cannot repair a reply that has no expressions, so fail
# here with a clear message instead of inside the retry loop.
if not ans_expression_1 or not ans_expression_2:
    raise ValueError(f"Could not extract both expressions from the model reply: {expression!r}")

for _ in range(3):
    # Built outside the try so it is always defined when the handler extends it.
    tool_2_prompt = (
        tool_2_func + "\n\n"
        + "expression_1 = " + ans_expression_1[0] + "\n"
        + "expression_2 = " + ans_expression_2[0] + "\n"
        + "ans_money_1 = calculator(expression_1)" + "\n"
        + "ans_money_2 = calculator(expression_2)" + "\n"
        + "print(ans_money_1)" + "\n"
        + "print(ans_money_2)"
    )
    try:
        # SECURITY: executes tool code retrieved from the store or generated
        # by the model. One namespace serves as both globals and locals so
        # that functions in the tool code can see its own imports and helpers.
        tool_2_namespace = dict(globals())
        exec(tool_2_prompt, tool_2_namespace)
        # NOTE(review): these read the evaluated expressions rather than the
        # calculator's return values (ans_money_1 / ans_money_2 inside the
        # executed code) — confirm which is intended.
        ans_money_1 = tool_2_namespace['expression_1']
        ans_money_2 = tool_2_namespace['expression_2']
        break
    except Exception as e:
        print("ERROR: failed to generate ans_money！",e)
        # Ask the model to rewrite the tool so it can handle this call.
        tool_2_prompt += "\n" + "Failed to execute the function due to the error. You can rewrite the entire function to accommodate the new task, but in a way that is compatible with the role of the previous function."
        tool_2_func_refine = generate(prompt=tool_2_prompt)
        print("tool_2_func_refine:", tool_2_func_refine)
        refined_blocks = re.findall(r"```python\n(.*?)```", tool_2_func_refine, re.DOTALL)
        if refined_blocks:
            tool_2_func = refined_blocks[0]
        else:
            # Keep the previous tool rather than crashing inside the handler.
            print("ERROR: no python code block found in the refined tool reply")
        print("tool_2_func:", tool_2_func)
else:
    # No attempt succeeded; stop here instead of failing later on an
    # undefined ans_money.
    raise RuntimeError("failed to generate ans_money after 3 attempts")

# Expression 1 is the 30-night total in the worked example (340*30+168), i.e.
# the one-month price that the later cells read as `ans_money`.
ans_money = ans_money_1

解决子任务

In [None]:
# Run the tool source followed by a small smoke test (9 / 3) directly in the
# notebook's global namespace, so `calculator` and `result` stay defined.
calculator_demo_lines = [
    tool_2_func,
    "num1 = 9",
    "num2 = 3",
    "operator = '/'",
    "result = calculator(num1, num2, operator)",
    "print(result)",
]
exec("\n".join(calculator_demo_lines), globals())

生成最终答案

In [None]:
# Final prompt: the writing instruction, the retrieved listing data, and the
# computed one-month price on its own line.
ans_Description_prompt = (
    f'''Write a description on housing in the apartment "Lovely Room 1, Garden, Best Area, Legal rental" and calculate the monthly rent according to {data_tem!s}'''
    f"\none month price: {ans_money!s}"
)
ans_Description = generate(prompt=ans_Description_prompt)
# Bare expression so the notebook displays the generated description.
ans_Description

整个ReAct过程

In [None]:
# Placeholders in the generated plan and the value each one is replaced with.
# Order matters: the substitutions are applied one after another.
placeholder_values = [
    ("ans_Tool_1", tool_1_result),
    ("ans_Knowledge", data_tem),
    ("ans_data", ans_data),
    ("ans_Tool_2", tool_2_func),
    ("ans_Money", ans_money),
    ("ans_Description", ans_Description),
]
filled_trace = result_tem
for placeholder, value in placeholder_values:
    filled_trace = filled_trace.replace(placeholder, str(value))
# Full ReAct transcript: the question followed by the plan with results filled in.
result = "Question:" + input_query + "\n" + filled_trace
# Bare expression so the notebook displays the transcript.
result