In [1]:
# !pip install --upgrade openai

In [2]:
# !pip install pdfminer.six

In [20]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [21]:
def extract_text_from_pdf(filename, page_numbers=None, min_line_length=10):
    '''Extract text from a PDF file and regroup it into paragraphs.

    Text boxes are read page by page, flattened into lines, and consecutive
    "long" lines are merged into one paragraph. A "short" line (blank lines,
    and typically headings) is dropped and closes the current paragraph.
    These are heuristic paragraphs, not the PDF's native ones.

    Args:
        filename: PDF path, passed straight to ``extract_pages``.
        page_numbers: Optional container of zero-based page indexes to keep.
            ``None`` keeps every page; an empty container keeps none.
        min_line_length: A line with fewer characters than this is treated
            as a paragraph break.

    Returns:
        A list of paragraph strings. Every paragraph begins with one space,
        because each merged line is preceded by a separator.
    '''
    # Collect the text of every text box on the selected pages.
    text_parts = []
    for i, page_layout in enumerate(extract_pages(filename)):
        # Skip pages outside the requested selection.
        if page_numbers is not None and i not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                text_parts.append(element.get_text() + '\n')
    lines = ''.join(text_parts).split('\n')

    paragraphs = []
    buffer = ''
    # True when the previous merged line ended with a hyphen, i.e. the
    # current line continues a word that was split across two lines.
    glue_next = False
    for text in lines:
        if len(text) >= min_line_length:
            # Words on different lines are separated by a space, except when
            # the previous line ended in a hyphenated word fragment.
            separator = '' if glue_next else ' '
            glue_next = text.endswith('-')
            # Only trailing hyphens are removed; leading ones are content.
            buffer += separator + (text.rstrip('-') if glue_next else text)
        elif buffer:
            paragraphs.append(buffer)
            buffer = ''
            glue_next = False
    if buffer:
        paragraphs.append(buffer)
    return paragraphs

In [22]:
# Extract every page of llama2.pdf; lines shorter than 15 characters act as paragraph breaks.
paragraphs = extract_text_from_pdf("llama2.pdf", min_line_length=15)

In [23]:
# Preview the first five extracted paragraphs, each followed by a blank line.
for para in paragraphs[:5]:
    print(para+"\n")

 Llama 2: Open Foundation and Fine-Tuned Chat Models

 Hugo Touvron∗ Louis Martin† Kevin Stone† Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edu

In [24]:
# Number of paragraphs extracted from the whole document.
print(len(paragraphs))

92


## LLM封装接口

In [25]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(find_dotenv())

# NOTE(review): no credentials are passed explicitly, so the client presumably
# picks them up from the environment loaded above — confirm.
client = OpenAI()

In [26]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0.2):
    '''Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the content of one ``user`` message.
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API. Defaults to
            0.2, the value this helper previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    '''
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content

In [27]:
def build_prompt(prompt_template, **kwargs):
    '''Fill ``prompt_template`` with the given keyword values.

    A value that is a list made up only of strings is first collapsed into a
    single string whose items are separated by a blank line. Any other value
    is handed to ``str.format`` unchanged.

    Args:
        prompt_template: A ``str.format``-style template.
        **kwargs: One value per placeholder name used in the template.

    Returns:
        The formatted prompt string.
    '''
    def _render(value):
        # Only lists whose elements are all strings are joined.
        is_str_list = isinstance(value, list) and all(
            isinstance(item, str) for item in value)
        return '\n\n'.join(value) if is_str_list else value

    rendered = {name: _render(value) for name, value in kwargs.items()}
    return prompt_template.format(**rendered)

In [28]:
# Prompt template for the Q&A bot. The {context} and {query} placeholders are
# filled in through str.format (see build_prompt).
prompt_template = """
你是一个问答机器人。
你的任务是根据下述给定的已知信息回答用户问题。

已知信息:
{context}

用户问：
{query}

如果已知信息不包含用户问题的答案，或者已知信息不足以回答用户的问题，请直接回复"我无法回答您的问题"。
请不要输出已知信息中不包含的信息或答案。
请用中文回答用户问题。
"""

In [29]:
user_query = "Do you know llama 2?"
# user_query = "HEllo"

# 1. Retrieval (a hard-coded passage stands in for a real search result here)
search_results = "Llama 2 is a large language model developed by Meta (formerly known as Facebook). It is part of the Llama (Large Language Model Meta AI) family and is designed to generate human-like text based on the input it receives."

# 2. Build the prompt
# The two keyword arguments map to {context} and {query} in prompt_template.
prompt = build_prompt(prompt_template, context = search_results, query=user_query)
print("===Prompt===")
print(prompt)

# 3. Call the LLM
response = get_completion(prompt)

print("===回复===")
print(response)

===Prompt===

你是一个问答机器人。
你的任务是根据下述给定的已知信息回答用户问题。

已知信息:
Llama 2 is a large language model developed by Meta (formerly known as Facebook). It is part of the Llama (Large Language Model Meta AI) family and is designed to generate human-like text based on the input it receives.

用户问：
Do you know llama 2?

如果已知信息不包含用户问题的答案，或者已知信息不足以回答用户的问题，请直接回复"我无法回答您的问题"。
请不要输出已知信息中不包含的信息或答案。
请用中文回答用户问题。

===回复===
是的，我知道Llama 2是由Meta（原名Facebook）开发的一种大型语言模型，它是Llama（Large Language Model Meta AI）家族的一部分，旨在根据接收到的输入生成类似人类的文本。


In [30]:
# For the demo, keep only two pages (chapter 1).
# This rebinds `paragraphs`, replacing the whole-document extraction above.
paragraphs = extract_text_from_pdf(
    "llama2.pdf",
    page_numbers=[2, 3],
    min_line_length=10
)

In [31]:
# Number of paragraphs extracted from the two selected pages.
print(len(paragraphs))

14


In [32]:
# 上面的对于文档的分割粒度太大，有可能导致检索不精确，粒度太小又可能信息不全
# 此外，问题的答案也有可能跨越两个句子
# 因此，采用部分重叠式的切割文本，使上下文更完整
from nltk.tokenize import sent_tokenize
import json

# NOTE: sent_tokenize() is an English sentence splitter; it is not meant for Chinese text.
def split_text(paragraphs, chunk_size=300, overlap_size=100):
    '''Split paragraphs into sentence-aligned chunks that overlap.

    Every paragraph is split into sentences with ``sent_tokenize`` and all
    sentences are pooled into one list, so chunks may cross paragraph
    boundaries. Each chunk is built in two steps:

    1. Look backwards from the chunk's first new sentence and prepend whole
       preceding sentences while their combined length stays within
       ``overlap_size``.
    2. Look forwards and append whole following sentences while the chunk
       stays within ``chunk_size``.

    Lengths are counted in characters. The joining spaces are not included
    in the length checks, and a sentence is never cut, so a chunk can
    exceed ``chunk_size`` (for example a single very long sentence).

    Args:
        paragraphs: Iterable of paragraph strings.
        chunk_size: Target maximum chunk length in characters.
        overlap_size: Maximum length of the text repeated from before the
            chunk's first new sentence.

    Returns:
        A list of chunk strings; empty when there are no sentences.
    '''
    # Flatten all paragraphs into one list of whitespace-trimmed sentences.
    sentences = [s.strip() for p in paragraphs for s in sent_tokenize(p)]
    chunks = []
    i = 0
    while i < len(sentences):
        # Walk backwards to collect the overlap that precedes sentence i.
        overlap = ''
        prev = i - 1
        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
            overlap = sentences[prev] + ' ' + overlap
            prev -= 1
        chunk = overlap + sentences[i]
        # Walk forwards, appending sentences until chunk_size would be exceeded.
        nxt = i + 1
        while nxt < len(sentences) and len(sentences[nxt]) + len(chunk) <= chunk_size:
            chunk = chunk + ' ' + sentences[nxt]
            nxt += 1
        chunks.append(chunk)
        # The next chunk starts at the first sentence not consumed here.
        i = nxt
    return chunks

In [33]:
# 如果报Lookup Error，需要下载punkt
# import nltk
# nltk.download('punkt')


In [34]:
# Re-split the extracted paragraphs into partially overlapping chunks
# (chunk_size=300, overlap_size=100).
chunks = split_text(paragraphs, 300, 100)

In [35]:
# Print the first five chunks; neighbouring chunks visibly share sentences.
for chunk in chunks[:5]:
    print(chunk + '\n')

Figure 1: Helpfulness human evaluation results for Llama 2-Chat compared to other open-source and closed-source models. Human raters compared model generations on ~4k prompts consisting of both single and multi-turn prompts. The 95% conﬁdence intervals for this evaluation are between 1% and 2%.

The 95% conﬁdence intervals for this evaluation are between 1% and 2%. More details in Section 3.4.2.

More details in Section 3.4.2. While reviewing these results, it is important to note that human evaluations can be noisy due to limitations of the prompt set, subjectivity of the review guidelines, subjectivity of individual raters, and the inherent diﬃculty of comparing generations.

Figure 2: Win-rate % for helpfulness andsafety between commercial-licensed baselines and Llama 2-Chat, according to GPT 4. To complement the human evaluation, we used a more capable model, not subject to our own guidance. Green area indicates our model is better according to GPT-4.

Green area indicates our mode

In [36]:
def get_embeddings(texts, model="text-embedding-ada-002", dimensions=None):
    '''Wrap the OpenAI embeddings endpoint.

    Args:
        texts: Input passed to the API as ``input``.
        model: Embedding model name.
        dimensions: Requested embedding size. It is ignored when ``model``
            is "text-embedding-ada-002", and is only forwarded when truthy.

    Returns:
        A list holding the ``embedding`` of every item in the response data.
    '''
    request = {"input": texts, "model": model}
    # Forward `dimensions` only for models other than ada-002, and only when set.
    if model != "text-embedding-ada-002" and dimensions:
        request["dimensions"] = dimensions
    embedding_data = client.embeddings.create(**request).data
    return [item.embedding for item in embedding_data]

In [37]:
import chromadb
from chromadb.config import Settings


class MyVectorDBConnector:
    '''Small wrapper around one Chroma collection and an embedding function.'''

    def __init__(self, collection_name, embedding_fn):
        '''Create a resettable Chroma client and open the collection.

        Args:
            collection_name: Name of the collection to get or create.
            embedding_fn: Callable taking a list of texts and returning one
                embedding per text.
        '''
        chroma_client = chromadb.Client(Settings(allow_reset=True))

        # Demo only: reset the client on every construction so each run starts
        # clean. A real application would not reset each time.
        chroma_client.reset()

        # Open (or create) the collection.
        self.collection = chroma_client.get_or_create_collection(
            name=collection_name)
        self.embedding_fn = embedding_fn

    def add_documents(self, documents):
        '''Embed ``documents`` and add them to the collection.

        Ids are ``id<N>``, where N continues from the number of records
        already stored. Repeated calls therefore append new records instead
        of reusing ``id0``, ``id1``, ... and colliding with earlier ones.
        An empty ``documents`` is a no-op and the embedding function is not
        called.
        '''
        if not documents:
            return
        start = self.collection.count()
        self.collection.add(
            embeddings=self.embedding_fn(documents),  # one vector per document
            documents=documents,  # original text
            ids=[f"id{start + i}" for i in range(len(documents))]  # unique ids
        )

    def search(self, query, top_n):
        '''Return the collection's raw query result for the ``top_n`` nearest records.'''
        return self.collection.query(
            query_embeddings=self.embedding_fn([query]),
            n_results=top_n
        )

In [38]:
# Create the vector database wrapper.
vector_db = MyVectorDBConnector("demo", get_embeddings)
# Load documents into the vector database; either `paragraphs` or `chunks` can be used.
# vector_db.add_documents(paragraphs)
vector_db.add_documents(chunks)

In [39]:
# Alternative question, kept for experimentation:
# user_query = "Llama 2有多少参数"
user_query = "Llama 2有商业许可协议吗？"
# Retrieve the 5 nearest chunks for the question.
results = vector_db.search(user_query, 5)

In [40]:
# results['documents'][0] holds the documents returned for the single query issued above.
for para in results['documents'][0]:
    print(para+"\n")

2. Llama 2-Chat, a ﬁne-tuned version of Llama 2 that is optimized for dialogue use cases. We release variants of this model with 7B, 13B, and 70B parameters as well. We believe that the open release of LLMs, when done safely, will be a net beneﬁt to society.

We are releasing the following models to the general public for research and commercial use‡: 1. Llama 2, an updated version of Llama 1, trained on a new mix of publicly available data.

In this work, we develop and release Llama 2, a family of pretrained and ﬁne-tuned LLMs, Llama 2 and Llama 2-Chat, at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested, Llama 2-Chat models generally perform better than existing open-source models.

We believe that the open release of LLMs, when done safely, will be a net beneﬁt to society. Like all LLMs, Llama 2 is a new technology that carries potential risks with use (Bender et al., 2021b; Weidinger et al., 2021; Solaiman et al., 2023).

Figure 2: Win-rate

上面只是构建了向量数据库，并根据 user_query 从数据库中检索出最相关的前几个片段。
接下来不仅要检索，还要把检索结果放进 prompt，最后再交给 LLM 生成回答。

In [41]:
class RAG_Bot:
    '''Minimal retrieve-then-generate bot: search, build a prompt, call the LLM.'''

    def __init__(self, vector_db, llm_api, n_results=2):
        '''Store the collaborators.

        Args:
            vector_db: Object exposing ``search(query, top_n)``.
            llm_api: Callable that takes a prompt and returns the reply.
            n_results: Number of documents to retrieve per question.
        '''
        self.vector_db, self.llm_api, self.n_results = vector_db, llm_api, n_results

    def chat(self, user_query):
        '''Answer ``user_query`` using retrieved documents as context.'''
        # Step 1: retrieve; keep the documents returned for this single query.
        hits = self.vector_db.search(user_query, self.n_results)
        top_documents = hits['documents'][0]

        # Step 2: build the prompt from the shared template.
        prompt = build_prompt(
            prompt_template, context=top_documents, query=user_query)

        # Step 3: hand the prompt to the LLM and return its reply as-is.
        return self.llm_api(prompt)

In [42]:
# Create a RAG bot.
bot = RAG_Bot(
    vector_db,
    llm_api=get_completion
)

# Alternative questions are kept commented out so that only one assignment is
# live; previously the first two were dead stores overwritten by the last.
# user_query = "llama 2是什么？"
# user_query = "llama 2有商用许可协议吗"
user_query = "llama 2是开源的吗？"

print("====搜索结果====")
# Show the top 5 hits for inspection. bot.chat() below performs its own search
# using bot.n_results, so the reply may be based on fewer documents than are
# printed here.
search_results = bot.vector_db.search(user_query, 5)
for doc in search_results['documents'][0]:
    print(doc + "\n")

print("====回复====")
response = bot.chat(user_query)

print(response)

====搜索结果====
2. Llama 2-Chat, a ﬁne-tuned version of Llama 2 that is optimized for dialogue use cases. We release variants of this model with 7B, 13B, and 70B parameters as well. We believe that the open release of LLMs, when done safely, will be a net beneﬁt to society.

We are releasing the following models to the general public for research and commercial use‡: 1. Llama 2, an updated version of Llama 1, trained on a new mix of publicly available data.

In this work, we develop and release Llama 2, a family of pretrained and ﬁne-tuned LLMs, Llama 2 and Llama 2-Chat, at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested, Llama 2-Chat models generally perform better than existing open-source models.

We also share novel observations we made during the development of Llama 2 and Llama 2-Chat, such as the emergence of tool usage and temporal organization of knowledge. Figure 3: Safety human evaluation results for Llama 2-Chat compared to other open-

# RAG的改进方案

## 改进一：检索后排序(Rerank)


**问题**：最佳答案并不一定排在检索结果的最前面

**解决方案**：把top k调大一些

但该方法又会有新的问题，检索的内容变多了，prompt会变大，耗费的tokens更多。这时可以采取检索后再进行一次排序

In [43]:
#  !pip install sentence_transformers

In [44]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to re-score (query, document) pairs after retrieval.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512) # English; smaller model
# model = CrossEncoder('BAAI/bge-reranker-large', max_length=512) # multilingual, Chinese-developed; larger model

  from tqdm.autonotebook import tqdm, trange


In [45]:
user_query = "how safe is llama 2"
search_results = bot.vector_db.search(user_query, 5)

# Score each (query, retrieved document) pair with the cross-encoder.
scores = model.predict([(user_query, doc)
                       for doc in search_results['documents'][0]])
# Sort the documents by score, highest first.
sorted_list = sorted(
    zip(scores, search_results['documents'][0]), key=lambda x: x[0], reverse=True)
for score, doc in sorted_list:
    print(f"{score}\t{doc}\n")

6.613734245300293	We believe that the open release of LLMs, when done safely, will be a net beneﬁt to society. Like all LLMs, Llama 2 is a new technology that carries potential risks with use (Bender et al., 2021b; Weidinger et al., 2021; Solaiman et al., 2023).

5.310719013214111	In this work, we develop and release Llama 2, a family of pretrained and ﬁne-tuned LLMs, Llama 2 and Llama 2-Chat, at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested, Llama 2-Chat models generally perform better than existing open-source models.

4.709953308105469	We provide a responsible use guide¶ and code examples‖ to facilitate the safe deployment of Llama 2 and Llama 2-Chat. More details of our responsible release strategy can be found in Section 5.3.

4.543964385986328	We also share novel observations we made during the development of Llama 2 and Llama 2-Chat, such as the emergence of tool usage and temporal organization of knowledge. Figure 3: Safety human eval

## 改进二：混合检索（Hybrid Search)

**问题**：在实际生产中，传统的关键字检索（稀疏表示）与向量表示（稠密表示）各有优劣。

当文档中包含很长的专有名词时，关键字检索往往更精准，而向量检索容易引入概念混淆。

例如：“小细胞肺癌”和“非小细胞肺癌”。 只有一字之差，二者向量相似度较高，但二者却是不同的癌症



所以，有时候我们需要结合不同的检索算法，来达到比单一检索算法更优的效果。这就是**混合检索**。

混合检索的核心是，综合文档 $d$ 在不同检索算法下的排序名次（rank），为其生成最终排序。

一个最常用的算法叫 **Reciprocal Rank Fusion（RRF）**

$rrf(d)=\sum_{a\in A}\frac{1}{k+rank_a(d)}$

其中 $A$ 表示所有使用的检索算法的集合，$rank_a(d)$ 表示使用算法 $a$ 检索时，文档 $d$ 的排序，$k$ 是个常数。

很多向量数据库都支持混合检索，比如 [Weaviate](https://weaviate.io/blog/hybrid-search-explained)、[Pinecone](https://www.pinecone.io/learn/hybrid-search-intro/) 等。也可以根据上述原理自己实现。


1. 基于关键字检索的排序

In [46]:
from elasticsearch7 import Elasticsearch, helpers
from chinese_utils import to_keywords  # 使用中文的关键字提取函数
import os
import time


class MyEsConnector:
    '''Keyword-search helper on top of a single Elasticsearch index.'''

    def __init__(self, es_client, index_name, keyword_fn):
        '''Store the collaborators.

        Args:
            es_client: Elasticsearch client instance.
            index_name: Name of the index this connector owns.
            keyword_fn: Callable that turns a text into the value stored in,
                and matched against, the ``keywords`` field.
        '''
        self.es_client = es_client
        self.index_name = index_name
        self.keyword_fn = keyword_fn

    def add_documents(self, documents):
        '''Rebuild the index from ``documents``.

        Any existing index with this name is deleted first, so the index
        ends up containing exactly ``documents``. Each document is stored
        with its keywords, its original text and an id of the form
        ``doc_<position>``.
        '''
        if self.es_client.indices.exists(index=self.index_name):
            self.es_client.indices.delete(index=self.index_name)
        self.es_client.indices.create(index=self.index_name)
        actions = [
            {
                "_index": self.index_name,
                "_source": {
                    "keywords": self.keyword_fn(doc),
                    "text": doc,
                    "id": f"doc_{i}"
                }
            }
            for i, doc in enumerate(documents)
        ]
        helpers.bulk(self.es_client, actions)
        # Make the new documents searchable immediately. An explicit refresh
        # replaces the previous fixed one-second sleep, which only guessed at
        # how long the index would take to become visible to searches.
        self.es_client.indices.refresh(index=self.index_name)

    def search(self, query_string, top_n=3):
        '''Run a ``match`` query on the ``keywords`` field.

        Returns:
            A dict mapping each hit's stored id to ``{"text": ..., "rank": ...}``,
            where ``rank`` is the hit's zero-based position in the result list.
        '''
        search_query = {
            "match": {
                "keywords": self.keyword_fn(query_string)
            }
        }
        res = self.es_client.search(
            index=self.index_name, query=search_query, size=top_n)
        return {
            hit["_source"]["id"]: {
                "text": hit["_source"]["text"],
                "rank": i,
            }
            for i, hit in enumerate(res["hits"]["hits"])
        }

In [47]:


# Optional: read the connection settings from environment variables.
# ELASTICSEARCH_BASE_URL = os.getenv('ELASTICSEARCH_BASE_URL')
# ELASTICSEARCH_PASSWORD = os.getenv('ELASTICSEARCH_PASSWORD')
# ELASTICSEARCH_NAME= os.getenv('ELASTICSEARCH_NAME')


# This cell needs an Elasticsearch server listening on localhost:9200;
# without one the calls below fail with a ConnectionError.
# es = Elasticsearch("http://localhost:9200")
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
# es = Elasticsearch(
#     hosts=[ELASTICSEARCH_BASE_URL],  # service address and port
#     http_auth=(ELASTICSEARCH_NAME, ELASTICSEARCH_PASSWORD),  # user name, password
# )


# Create the ES connector.
es_connector = MyEsConnector(es, "demo_es_rrf", to_keywords)

# Index the documents.
# NOTE(review): this rebinds the notebook-level `chunks` that was loaded into
# the vector database earlier; re-running earlier cells after this one will
# see this two-item list instead — confirm that is intended.
chunks = ["llama 2", "ChatGLM 2"]
es_connector.add_documents(chunks)

# Keyword search.
# Bind `query` explicitly so the search does not depend on a name that no
# earlier cell defines; the notebook's current question is reused.
query = user_query
keyword_search_results = es_connector.search(query, 3)

print(json.dumps(keyword_search_results, indent=4, ensure_ascii=False))

ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x2ecd4a840>: Failed to establish a new connection: [Errno 61] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x2ecd4a840>: Failed to establish a new connection: [Errno 61] Connection refused)