In [1]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Manager
from threading import Lock

import chromadb
import tiktoken
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import WebBaseLoader, TextLoader, PyPDFLoader

from raptor import RetrievalAugmentation
from raptor.utils import split_text

2024-10-10 06:06:44,354 - Loading faiss with AVX2 support.
2024-10-10 06:06:44,405 - Successfully loaded faiss with AVX2 support.


In [2]:
# Prerequisite: the OPENAI_API_KEY environment variable must be set; it is read
# below via os.environ, and a missing key raises KeyError.

# Tunable inputs, named so the slicing and chunking below are self-explanatory.
PDF_PATH = 'demo/RS_zishi.pdf'
SLICE_CHARS = 10000      # characters taken from the PDF text per slice
CHUNK_MAX_TOKENS = 100   # max_tokens handed to split_text for every chunk
DB_PATH = './db2'        # path given to the persistent Chroma client
COLLECTION_NAME = 'cinderella'

# Load the PDF and flatten its pages into one space-joined string.
pdf_path = PDF_PATH
loader = PyPDFLoader(pdf_path)
pdf_content = loader.load()
text = ' '.join(page.page_content for page in pdf_content)

# Two consecutive, non-overlapping character slices of the document text.
text1 = text[:SLICE_CHARS]
text2 = text[SLICE_CHARS:2 * SLICE_CHARS]

# Chunk each slice with split_text, using the cl100k_base tokenizer.
tokenizer = tiktoken.get_encoding("cl100k_base")
text1s = split_text(text1, tokenizer=tokenizer, max_tokens=CHUNK_MAX_TOKENS)
text2s = split_text(text2, tokenizer=tokenizer, max_tokens=CHUNK_MAX_TOKENS)
print(f'len(text1s): {len(text1s)}; len(text2s): {len(text2s)}')

# Open (or create) the collection in the persistent client, embedding with OpenAI.
client = chromadb.PersistentClient(path=DB_PATH)
default_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"])
# Alternative embedding function (local model), kept for reference:
# default_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name='D:/python_data/model/BAAI/bge-large-zh-v1.5')
collection = client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=default_ef)

2024-10-10 06:07:23,167 - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


len(text1s): 22; len(text2s): 25


2024-10-10 06:07:24,140 - Collection cinderella is not created.


In [ ]:
# Store both chunk lists in the same collection, tagged by metadata 'type'.
# Ids are 'ids<N>' with the second list numbered after the first, so the two
# lists never collide.
#
# upsert (rather than add) keeps this cell safe to re-run against the
# persistent store: existing ids are overwritten with the current chunks
# instead of being left stale.
batches = [
    (text1s, 'children', 0),
    (text2s, 'parents', len(text1s)),
]
for chunks, chunk_type, id_offset in batches:
    if not chunks:
        # A short document can leave a slice empty; skip it rather than
        # sending an empty batch to the collection.
        continue
    collection.upsert(
        documents=chunks,
        ids=[f'ids{id_offset + i}' for i in range(len(chunks))],
        metadatas=[{'type': chunk_type} for _ in chunks],
    )

In [4]:
# Ask the collection for the 5 nearest records to the query text, restricted
# to records whose metadata 'type' is 'children'; display the raw result dict.
results = collection.query(
    query_texts='zishizhang',
    where={'type': 'children'},
    n_results=5,
)
results

2024-10-10 06:08:08,457 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


{'ids': [['ids0', 'ids1', 'ids10', 'ids6', 'ids20']],
 'distances': [[0.416904749565347,
   0.4686589294122239,
   0.5147696403390115,
   0.5175526835741324,
   0.5422456518640746]],
 'metadatas': [[{'type': 'children'},
   {'type': 'children'},
   {'type': 'children'},
   {'type': 'children'},
   {'type': 'children'}]],
 'embeddings': None,
 'documents': [['Sample-Efficient Clustering and Conquer Procedures for Parallel Large-Scale Ranking and Selection Zishi Zhang Wuhan Institute of Aritificial Intelligence, Guanghua School of Management, Peking University, Beijing, China; Xiangjiang Laboratory, Changsha, China, zishizhang@stu pku edu cn Yijie Peng Wuhan Institute of Aritificial Intelligence, Guanghua School of Management, Peking University, Beijing, China;',
   'Xiangjiang Laboratory, Changsha, China, pengyijie@gsm pku edu cn We propose novel “clustering and conquer” procedures for the parallel large-scale ranking and selection (R&S) problem, which leverage correlation information f

In [7]:
# Show which collections the persistent client currently knows about.
client.list_collections()

[Collection(name=cinderella)]

In [5]:
# Dump every record in the collection (no filter or id argument is given).
# NOTE(review): this prints the whole collection; the output grows with its size.
collection.get()

{'ids': ['ids0',
  'ids1',
  'ids10',
  'ids11',
  'ids12',
  'ids13',
  'ids14',
  'ids15',
  'ids16',
  'ids17',
  'ids18',
  'ids19',
  'ids2',
  'ids20',
  'ids21',
  'ids22',
  'ids23',
  'ids24',
  'ids25',
  'ids26',
  'ids27',
  'ids28',
  'ids29',
  'ids3',
  'ids30',
  'ids31',
  'ids32',
  'ids33',
  'ids34',
  'ids35',
  'ids36',
  'ids37',
  'ids38',
  'ids39',
  'ids4',
  'ids40',
  'ids41',
  'ids42',
  'ids43',
  'ids44',
  'ids45',
  'ids46',
  'ids5',
  'ids6',
  'ids7',
  'ids8',
  'ids9'],
 'embeddings': None,
 'metadatas': [{'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'parents'},
  {'type': 'parents'},
  {'type': 'parents'},
  {'type': 'parents'},

In [ ]:
# Feed the first text slice to RetrievalAugmentation, then retrieve context
# for one question and generate an answer.
RA = RetrievalAugmentation()
RA.add_documents(text1)

question = "Why the P3C have better performance than the EA ?"

# `context` is not used further in this cell; `layer_info` is printed for inspection.
context, layer_info = RA.retrieve(question=question, retrieve_mode='bottom_up')
print(layer_info)

answer = RA.answer_question(question=question)
print("Answer: ", answer)

In [20]:
# Dump the full collection contents again (same call as the earlier inspection cell).
collection.get()

{'ids': ['ids0',
  'ids1',
  'ids10',
  'ids11',
  'ids12',
  'ids13',
  'ids14',
  'ids15',
  'ids16',
  'ids17',
  'ids18',
  'ids19',
  'ids2',
  'ids20',
  'ids21',
  'ids22',
  'ids23',
  'ids24',
  'ids25',
  'ids26',
  'ids27',
  'ids28',
  'ids29',
  'ids3',
  'ids30',
  'ids31',
  'ids32',
  'ids33',
  'ids34',
  'ids35',
  'ids36',
  'ids37',
  'ids38',
  'ids39',
  'ids4',
  'ids40',
  'ids41',
  'ids42',
  'ids43',
  'ids44',
  'ids45',
  'ids46',
  'ids5',
  'ids6',
  'ids7',
  'ids8',
  'ids9'],
 'embeddings': None,
 'metadatas': [{'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'children'},
  {'type': 'parents'},
  {'type': 'parents'},
  {'type': 'parents'},
  {'type': 'parents'},

In [13]:
# Fetch the single record whose id is 'ids6'.
# NOTE(review): the recorded output shows 'metadatas': [None] for this id, while
# the earlier filtered query returned 'ids6' under where={'type': 'children'}.
# Execution counts are out of order, so the collection state probably differed
# between runs — confirm by re-running on a fresh kernel.
collection.get(ids='ids6')

{'ids': ['ids6'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['overcoming the limitations of the all pairwise comparison paradigm  For instance, Zhong and Hong (2022) propose a Knockout Tournament (KT) procedure to restrict comparisons to matches involving only two alternatives  Pei et al  (2022) compare each alternative against a common standard to avoid exhaustive pairwise comparisons  On the contrary, stage-wise methods, which predetermine the number of replications, are more naturally parallelizable  Nonetheless, a notable'],
 'uris': None,
 'data': None}

In [10]:
# Scratch check: int() parses a zero-padded decimal string (recorded output: 1).
int('01')

1

In [11]:
# Scratch check: zip pairs the two lists element by element, in order.
a = [1, 2, 3]
b = list('abc')
for i, j in zip(a, b):
    print(f'{i} {j}')

1 a
2 b
3 c
