In [34]:
%env API_KEY=替换为自己的
%env BASE_URL=https://api.deepbricks.ai/v1

env: API_KEY=替换为自己的
env: BASE_URL=https://api.deepbricks.ai/v1


In [2]:
%%capture --no-stderr
!pip install -U langchain langchain_community pypdf sentence_transformers chromadb trulens_eval langchain_openai

In [2]:
import langchain, langchain_community, pypdf, sentence_transformers, chromadb, trulens_eval, langchain_openai

for module in (langchain, langchain_community, langchain_openai, pypdf, sentence_transformers, chromadb, trulens_eval):
    print(f"{module.__name__:<30}{module.__version__ if hasattr(module, '__version__') else ''}")

2024-07-29 22:25:17.955344: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-29 22:25:17.984357: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


langchain                     0.2.7
langchain_community           0.2.7
langchain_openai              
pypdf                         4.2.0
sentence_transformers         2.7.0
chromadb                      0.5.3
trulens_eval                  0.33.0


In [3]:
!pip list |grep -i langchain-openai

langchain-openai                                  0.1.7


In [4]:
import os
import pandas as pd

In [11]:
# 如果已经下载到本地，可以替换为本地路径
EMBEDDING_MODEL_PATH = 'BAAI/bge-large-zh-v1.5'
dt = '20240713'
version = 'v1'

output_dir = os.path.join(os.path.pardir, 'outputs', f'{version}_{dt}')

加载问答对

In [6]:
qa_df = pd.read_excel(os.path.join(output_dir, 'question_answer.xlsx'))

# 文档处理

In [7]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(os.path.join(os.path.pardir, 'data', '2024全球经济金融展望报告.pdf'))
documents = loader.load()

In [8]:
from uuid import uuid4
import os
import pickle

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

def split_docs(documents, filepath, chunk_size=400, chunk_overlap=40, seperators=['\n\n\n', '\n\n'], force_split=False):
    if os.path.exists(filepath) and not force_split:
        print('found cache, restoring...')
        return pickle.load(open(filepath, 'rb'))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=seperators
    )
    split_docs = splitter.split_documents(documents)
    for chunk in split_docs:
        chunk.metadata['uuid'] = str(uuid4())

    pickle.dump(split_docs, open(filepath, 'wb'))

    return split_docs

In [9]:
splitted_docs = split_docs(documents, os.path.join(output_dir, 'split_docs.pkl'), chunk_size=500, chunk_overlap=50)

found cache, restoring...


向量化

In [12]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}
)

device: cuda


In [13]:
from tqdm.auto import tqdm

def get_vector_db(docs, store_path, force_rebuild=False):
    if not os.path.exists(store_path):
        force_rebuild = True

    if force_rebuild:
        vector_db = Chroma.from_documents(
            docs,
            embedding=embeddings,
            persist_directory=store_path
        )
    else:
        vector_db = Chroma(
            persist_directory=store_path,
            embedding_function=embeddings
        )
    return vector_db

In [14]:
vector_db = get_vector_db(splitted_docs, store_path=os.path.join(os.path.pardir, output_dir, 'chromadb', 'bge_large_v1.5'))

# 问答全流程

## 构建QA Chain

In [15]:
from langchain.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

llm = Ollama(
    model='qwen2:7b-instruct',
    base_url="http://localhost:11434"
)

prompt_tmpl = """
你是一个金融分析师，擅长根据所获取的信息片段，对问题进行分析和推理。
你的任务是根据所获取的信息片段（<<<<context>>><<<</context>>>之间的内容）回答问题。
回答保持简洁，不必重复问题，不要要添加描述性解释和与答案无关的任何内容。
已知信息：
<<<<context>>>
{context}
<<<</context>>>

问题：{question}
请回答：
"""
prompt = PromptTemplate.from_template(prompt_tmpl)
retriever = vector_db.as_retriever(search_kwargs={'k': 4})

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
print(rag_chain.invoke('2023年10月美国ISM制造业PMI指数较上月有何变化？'))

2023年10月美国ISM制造业PMI指数较上个月大幅下降了2.3个百分点。


## 准备测试集

In [24]:
prediction_df = qa_df[qa_df['dataset'] == 'test'][['uuid', 'question', 'qa_type', 'answer']].rename(columns={'answer': 'ref_answer'})

In [21]:
answer_dict = {}

for idx, row in tqdm(prediction_df.iterrows(), total=len(prediction_df)):
    uuid = row['uuid']
    question = row['question']
    answer = rag_chain.invoke(question)
    answer_dict[question] = {
        'uuid': uuid,
        'ref_answer': row['answer'],
        'gen_answer': answer
    }

  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
prediction_df.loc[:, 'gen_answer'] = prediction_df['question'].apply(lambda q: answer_dict[q]['gen_answer'])

# 评估

In [27]:
prediction_df.head(5)

Unnamed: 0,uuid,question,qa_type,ref_answer,gen_answer
0,e73a0c9d-d42b-4350-a4c3-b38bf67c68a5,报告的发布机构是什么？,detailed,中国银行研究院,报告的发布机构是中国银行研究院。
1,e73a0c9d-d42b-4350-a4c3-b38bf67c68a5,报告的发布日期是什么时候？,detailed,2023年12月12日,报告的发布日期是2023年12月12日。
2,e73a0c9d-d42b-4350-a4c3-b38bf67c68a5,2023年全球经济增长有什么特点？,detailed,全球经济增长动力持续回落，各国复苏分化，发达经济体增速放缓，新兴经济体表现稳定。,2023年全球经济增长呈现“复苏+分化”的特点。发达经济体增速明显放缓，其中欧元区和英国经济...
3,e73a0c9d-d42b-4350-a4c3-b38bf67c68a5,全球贸易增长情况如何？,detailed,全球贸易增长乏力。,根据所给信息片段，全球货物贸易量和价格指数均承压下行。在2023年1-8月期间，全球货物贸易...
5,e73a0c9d-d42b-4350-a4c3-b38bf67c68a5,展望2024年，全球经济复苏的预期如何？,detailed,全球经济复苏预计将依旧疲软。,预计2024年全球经济复苏将保持疲软态势，并可能加剧分化。发达经济体面临“遏通胀和稳增长”的...


In [28]:
from langchain_openai import ChatOpenAI

judge_llm = ChatOpenAI(
    api_key=os.environ['API_KEY'],
    base_url=os.environ['BASE_URL'],
    # model_name='qwen2-72b-instruct'
    # model_name='qwen-long' 0.68
    model_name='gpt-4o-mini'
)

def evaluate(question, ref_answer, gen_answer):
    """
    对预测结果进行打分
    :param question: 问题
    :param ref_answer: 参考答案
    :param gen_answer: 生成的答案
    :return 打分模型原始返回结果
    """
    prompt = """
你是一个经济学博士，现在我有一系列问题，有一个助手已经对这些问题进行了回答，你需要参照参考答案，评价这个助手的回答是否正确，仅回复“是”或“否”即可，不要带其他描述性内容或无关信息。
问题：
<question>
{{question}}
</question>

参考答案：
<ref_answer>
{{ref_answer}}
</ref_answer>

助手回答：
<gen_answer>
{{gen_answer}}
</gen_answer>
请评价：
    """.replace('{{question}}', question).replace('{{ref_answer}}', str(ref_answer)).replace('{{gen_answer}}', gen_answer).strip()
    result = judge_llm.invoke(prompt).content
    return result

或者也可以使用Ollama提供的模型

In [19]:
# from langchain.llms import Ollama

# judge_llm = Ollama(
#     model='qwen2:72b-instruct',
#     base_url="http://localhost:11434"
# )

In [29]:
score_dict = {}

for idx, row in tqdm(prediction_df.iterrows(), total=len(prediction_df)):
    question = row['question']
    if question in score_dict:
        continue
    ref_answer = answer_dict[question]['ref_answer']
    gen_answer = answer_dict[question]['gen_answer']
    score_dict[question] = evaluate(question, ref_answer, gen_answer)

  0%|          | 0/100 [00:00<?, ?it/s]

In [30]:
prediction_df.loc[:, 'raw_score'] = prediction_df['question'].apply(lambda q: score_dict[q])

In [31]:
prediction_df['raw_score'].unique()

array(['是', '否'], dtype=object)

In [32]:
prediction_df.loc[:, 'score'] = prediction_df['raw_score'].replace({'是': 1, '否': 0})
_ = prediction_df.pop('raw_score')

  prediction_df.loc[:, 'score'] = prediction_df['raw_score'].replace({'是': 1, '否': 0})


In [33]:
prediction_df['score'].mean()

0.74