In [1]:
import uuid
import os
import json
import chromadb
from langchain_chroma import Chroma
from asg_splitter import TextSplitting
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
asg_splitter = TextSplitting()

splitters = asg_splitter.pypdf_recursive_splitter("./Test2.pdf")
print(splitters) # [Document(page_content='...'), Document(page_content='...')]
print(len(splitters)) # 32 chunks for abstract and introduction part of Test2.pdf
print(splitters[0]) # page_content='ABSTRACT\nHigh-quality text embedding is pivotal in improving semantic textual similarity\n(STS) tasks, which are crucial components in Large Language Model (LLM) applications. However, a common challenge existing text embedding models face is'
# print(splitters[0].page_content)

[Document(page_content='ABSTRACT\nHigh-quality text embedding is pivotal in improving semantic textual similarity\n(STS) tasks, which are crucial components in Large Language Model (LLM) applications. However, a common challenge existing text embedding models face is'), Document(page_content='the problem of vanishing gradients, primarily due to their reliance on the cosine\nfunction in the optimization objective, which has saturation zones. To address this\nissue, this paper proposes a novel angle-optimized text embedding model called'), Document(page_content='AnglE. The core idea of AnglE is to introduce angle optimization in a complex'), Document(page_content='space. This novel approach effectively mitigates the adverse effects of the saturation zone in the cosine function, which can impede gradient and hinder optimization processes. To set up a comprehensive STS evaluation, we experimented on'), Document(page_content='existing short-text STS datasets and a newly collected long-text 

In [5]:
# extract the page content from the splitters
documents_list = [document.page_content for document in splitters]

# not sure if this is helpful
for i in range(len(documents_list)):
    documents_list[i] = documents_list[i].replace('\n', ' ')

print(documents_list[0])  # the first chunk
print(len(documents_list))  # 32 chunks for extracted parts

ABSTRACT High-quality text embedding is pivotal in improving semantic textual similarity (STS) tasks, which are crucial components in Large Language Model (LLM) applications. However, a common challenge existing text embedding models face is
32


In [6]:
import torch
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
doc_results = embedder.embed_documents(documents_list)
if isinstance(doc_results, torch.Tensor): # tensor to list
    embeddings_list = doc_results.tolist()
else:
    embeddings_list = doc_results

print(f"Generated embeddings for {len(embeddings_list)} chunks.")
print(embeddings_list[0])  # the first embedding
print(embeddings_list[1])  # the second embedding
print(len(embeddings_list[0]))  # 384 dimensions


  from tqdm.autonotebook import tqdm, trange


Generated embeddings for 32 chunks.
[0.04924212396144867, -0.10569172352552414, 0.007824955508112907, 0.007421598304063082, 0.05172562599182129, 0.04049655795097351, -0.05713723599910736, -0.0012977785663679242, 0.023667607456445694, -0.0365864597260952, 0.014115212485194206, 0.021400753408670425, 0.09878640621900558, 0.06330777704715729, 0.028725605458021164, -0.022384105250239372, 0.1678217351436615, 0.03098643198609352, -0.055029261857271194, -0.07075641304254532, 0.05698004737496376, 0.03315073996782303, 0.08139237761497498, -0.009734162129461765, -0.011233063414692879, 0.008094355463981628, 0.0038464453537017107, 0.017905322834849358, 0.005341273732483387, -0.012167524546384811, 0.020715132355690002, 0.017522281035780907, 0.022497335448861122, 0.05507294461131096, -0.00874223280698061, 0.07839294523000717, -0.04733765497803688, 0.02448859252035618, -0.006414858158677816, -0.018987078219652176, 0.009277384728193283, 0.012273413129150867, 0.0003846602630801499, 0.07351070642471313, 

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [7]:
# the metadata_list should be provided from indexing, simply use file title
# used for multiple document indexing
metadata_list = [{"doc_name": "test2.pdf"} for i in range(len(documents_list))]

In [15]:
from asg_retriever import *

# no need to repeatedly add documents
cur_dir = os.getcwd()
retriever = Retriever()

collection_name = retriever.create_collection_chroma("Method2")
retriever.add_documents_chroma(
    collection_name=collection_name,
    embeddings_list=embeddings_list,
    documents_list=documents_list,
    metadata_list=metadata_list
)
collection_name = "Method2"


Logged document information to 'd:\REPO\asg_repo\logs\Method2.json'.


In [10]:
collection = retriever.get_collection_chroma(collection_name)
# document_ids = [str(uuid.uuid4()) for _ in range(documents_list)]
# doc_id = document_ids[0]  # 替换为你想查询的具体文档ID
result = collection.get(ids=["dffa16a8-fd92-4726-b64b-8a1b958671fd"], include=["embeddings", "documents", "metadatas"])
print(result)
print(result["documents"])
print(result["metadatas"])
print(result["embeddings"])
# embeddings不在json中 过长 不易处理

{'ids': ['dffa16a8-fd92-4726-b64b-8a1b958671fd'], 'embeddings': [[0.04924212396144867, -0.10569172352552414, 0.007824955508112907, 0.007421598304063082, 0.05172562599182129, 0.04049655795097351, -0.05713723599910736, -0.0012977785663679242, 0.023667607456445694, -0.0365864597260952, 0.014115212485194206, 0.021400753408670425, 0.09878640621900558, 0.06330777704715729, 0.028725605458021164, -0.022384105250239372, 0.1678217351436615, 0.03098643198609352, -0.055029261857271194, -0.07075641304254532, 0.05698004737496376, 0.03315073996782303, 0.08139237761497498, -0.009734162129461765, -0.011233063414692879, 0.008094355463981628, 0.0038464453537017107, 0.017905322834849358, 0.005341273732483387, -0.012167524546384811, 0.020715132355690002, 0.017522281035780907, 0.022497335448861122, 0.05507294461131096, -0.00874223280698061, 0.07839294523000717, -0.04733765497803688, 0.02448859252035618, -0.006414858158677816, -0.018987078219652176, 0.009277384728193283, 0.012273413129150867, 0.0003846602630

In [11]:
query_text = "The method used in the paper."
    
# rephrase the queries
rephrase_query = query_text # rephrase the query (not implemented yet)
query_embeddings = embedder.embed_query(rephrase_query)

if isinstance(query_embeddings, torch.Tensor): # tensor to list
    query_embeddings = query_embeddings.tolist()
else:
    query_embeddings = query_embeddings

print(query_embeddings) # 384 dimensions
query_result = retriever.query_chroma(collection_name = collection_name, query_embeddings= query_embeddings) # query according to the embeddings

print("The following is the full query result as a dictionary:")
print(query_result, "\n")

[-0.09615808725357056, 0.1194244846701622, -0.009963794611394405, 0.0376601405441761, -0.021836431697010994, 0.003525825683027506, -0.005507572088390589, 0.09568071365356445, -0.017696581780910492, 0.0384046696126461, 0.11727969348430634, 0.12086743116378784, 0.03456055745482445, -0.0034042242914438248, -0.1220151036977768, -0.008584139868617058, -0.053868915885686874, 0.041630372405052185, -0.0210367813706398, 0.024677131325006485, 0.07406089454889297, -0.0010363199980929494, -0.02106299437582493, -0.008246064186096191, 0.06782012432813644, -0.015851058065891266, -0.04007585346698761, -0.00888445321470499, 0.1368299424648285, -0.006521580275148153, 0.06367521733045578, 0.10149680823087692, 0.032473593950271606, 0.027422646060585976, -0.05243737995624542, 0.00904950499534607, -0.03119412623345852, 0.01526539959013462, 0.033599987626075745, 0.054752420634031296, 0.01104145310819149, -0.024458393454551697, -0.019340183585882187, -0.0072885542176663876, 0.07872515171766281, 0.001300884410

In [16]:
print(query_result)
print(query_result["documents"])
print(query_result["documents"][0][0])
print(query_result["documents"][0][2])
print(query_result["distances"])
print(query_result["metadatas"])
print(query_result["ids"])
print(query_result["embeddings"])

{'ids': [['4b367607-00a7-4052-ad97-9a4cfa1960af', 'e9b79029-682c-4d9d-a221-3d4352be5c9a', '330fc492-5660-4954-909e-45666e9ef0c1', 'c8465479-c113-41d6-8f85-3d3fd2270024', '30d7ed17-df76-4e20-8869-86e1723b9e8e']], 'distances': [[1.3527367115020752, 1.3527367115020752, 1.3794867992401123, 1.3794867992401123, 1.4134531021118164]], 'metadatas': [[{'doc_name': 'test2.pdf'}, {'doc_name': 'test2.pdf'}, {'doc_name': 'test2.pdf'}, {'doc_name': 'test2.pdf'}, {'doc_name': 'test2.pdf'}]], 'embeddings': None, 'documents': [['PN jecos(Xbi,X+ bj)/τ\uf8f9 \uf8fb, (2) where τis a temperature hyperparameter, bstands for the b-th batch, X+ biandX+ bjare the respective positive samples of XbiandXbj,mrepresents the number of positive pairs in b-th batch, Nis the batch size, and cos(·)is the cosine similarity function. 1Im Redivisor', 'PN jecos(Xbi,X+ bj)/τ\uf8f9 \uf8fb, (2) where τis a temperature hyperparameter, bstands for the b-th batch, X+ biandX+ bjare the respective positive samples of XbiandXbj,mrepr

In [17]:
# test for deleting the collection "Method2"
retriever.delete_collection_chroma("Method2")

The collection Method2 will be deleted forever!
Collection Method2 has been removed, deleting log file of this collection


In [None]:
# query_result_chunks = query_result["documents"][0]
# query_result_ids = query_result["ids"][0]

# # 将查询结果保存为json文件
# with open ("{}/retrieval/{}_{}.json".format(cur_dir, collection_name, str(uuid.uuid4())).format(), 'w') as retrieval:
#     json.dump(query_result, retrieval, indent=4)

# context = '//\n'.join(["@" + query_result_ids[i] + "//" + query_result_chunks[i] for i in range (len(query_result_chunks))])


# print("context is: ", context)

# with open ("{}/context/{}_{}.txt".format(cur_dir, collection_name, str(uuid.uuid4())).format(), 'w') as context_file:
#     context_file.write(context)
# result = context # generate the result (not implemented yet)
# print(result)

# Retrieve
Given a user input, relevant splits are retrieved from storage using a Retriever.

### 1. MultiQueryRetriever

多个检索产生不同结果 如何整合

生成输入问题的变体，以提高检索命中率。
- Pros：
  - 提高检索覆盖率：通过生成多个查询变体，可以更全面地检索相关文档。
  - 适合多样化表达：适用于用户可能使用多种表达方式查询相同信息的场景。
- Cons：
  - 增加复杂性：生成和处理多个查询需要额外的计算资源。
  - 可能带来噪音：不相关的查询变体可能会增加无关文档的检索。

**适用场景**：
- 用户查询表达多样的系统，如客服问答系统。
- 需要高覆盖率和容错率的检索任务。

### 2. MultiVectorRetriever

生成嵌入变体，以提高检索命中率。
- Pros：
  - 更精准的语义检索：通过生成嵌入的变体，可以更好地捕捉文本的语义多样性。
  - 适用于上下文丰富的文档：在长文本或上下文密集的文档中表现良好。
- Cons：
  - 计算开销大：生成多个嵌入需要更多计算资源。
  - 实现复杂：需要更复杂的嵌入生成和管理策略。

**适用场景**：
- 文档语义丰富且多样性高的检索任务，如法律文档和学术论文。
- 需要高精度语义匹配的系统。

### 3. Self Query Retriever

使用元数据过滤在向量存储检索过程中进行过滤。
- Pros：
  - 高效过滤：结合元数据进行过滤，可以提高检索效率和准确性。
  - 精准控制：适用于需要根据特定属性或标签进行精确筛选的场景。
- Cons：
  - 依赖元数据：需要高质量和一致的元数据支持。
  - 适用范围有限：对于没有元数据的文档，效果有限。

**适用场景**：
- 文档有丰富的元数据标注，如新闻文章和产品目录。
- 需要精确过滤和分类的检索任务。

考虑到论文数据量不大且上下文相关性较强，推荐使用 **MultiVectorRetriever**。理由如下：

- 上下文相关性强
- 高精度要求
- 数据量较少

### 4. HyDE (2022)

https://arxiv.org/pdf/2212.10496

HyDE是一种高级embedding method，它接受查询，生成一个虚拟答案，然后嵌入该生成的文档并将其用作最终示例。

需要提供一个基本的embedding model，以及一个用于生成这些文档的LLMChain。HyDE类有一些默认的提示 在论文中，但也可创建自己的提示。


In [None]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
base_embeddings = OpenAIEmbeddings()
llm = OpenAI()
# use "web_search" to generate embeddings
embeddings = HypotheticalDocumentEmbedder.from_llm(llm, base_embeddings, "web_search")
result = embeddings.embed_query("what method is used in the article?")
# 还可生成多个文档，然后将这些文档的嵌入组合起来。默认情况下取平均值来组合这些文档的嵌入。可以通过改变用于生成文档的LLM来实现.
# multi_llm = OpenAI(n=4, best_of=4)

