In [1]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

In [2]:
## Text splitting & in-memory document store
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore


## BGE Embeddings

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Configuration for the BGE sentence-embedding model.
model_name = "BAAI/bge-small-en-v1.5"
device = "cpu"
# Normalizing the embeddings lets similarity be computed as cosine similarity.
encode_kwargs = {"normalize_embeddings": True}

# Embedding function shared by every vector store created in this notebook.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device=device),
    encode_kwargs=encode_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<?, ?B/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
README.md: 100%|██████████| 90.3k/90.3k [00:00<00:00, 302kB/s]
config.json: 100%|██████████| 743/743 [00:00<00:00, 92.3kB/s]
config_sentence_transformers.json: 100%|██████████| 124/124 [00:00<00:00, 20.2kB/s]
model.safetensors: 100%|██████████| 133M/133M [00:10<00:00, 12.9MB/s] 
pytorch_model.bin: 100%|██████████| 134M/134M [00:10<00:00, 12.6MB/s] 
sentence_bert_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 5.83kB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 29.9kB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.49MB/s]
tokenizer_config.json: 100%|██████████| 366/366 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.42MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 31.3kB/s]


In [9]:
# Sanity check: embed a short query and print the length of the resulting vector.
sample_vector = bge_embeddings.embed_query("hello world, iron man")
print(len(sample_vector))

384


### Data prep

In [18]:
from langchain.document_loaders import DirectoryLoader, TextLoader

# Read every file under doc_txt/ that matches the "*.text" glob as a plain-text
# document, loading files on multiple threads and showing a progress bar.
loader_options = dict(
    glob="**/*.text",
    loader_cls=TextLoader,
    use_multithreading=True,
    show_progress=True,
)
loader = DirectoryLoader('doc_txt/', **loader_options)
docs = loader.load()

100%|██████████| 20/20 [00:00<00:00, 629.11it/s]


In [27]:
# Quick look at the first loaded document (its text and metadata).
first_doc = docs[0]
print(first_doc)

page_content='Detailed Description of the Invention\n[0001]\nBACKGROUND OF THE INVENTION 1. Field of the Invention The present invention relates to a data processing device such as a microprocessor or a microcomputer, and more particularly to a technique effective when applied to a data processing device such as superscalar for parallel processing.\n[0002]\n2. Description of the Related Art A microprocessor (CPU (Central P\nrocessing unit), microcomputer, etc. ) Sequentially fetches a sequence of instructions,\nDecrypt and execute. The instructions executed by the microprocessor are now of fixed length with the aim of simplifying the decoding circuit. A microprocessor that executes fixed-length instructions in a pipeline (Pipelining)\nIt is called an ISC (Red used Instruction Set Computer) type processor.\nFIG. 1 shows a pipelined implementation method of a microprocessor. Here, for simplification, the normally existing memory access stage (M\nEM) is omitted. Individual stages (101, 10

In [28]:
# Size of the first document's text, in characters.
first_doc_text = docs[0].page_content
print(len(first_doc_text))

50603


In [29]:
# Number of metadata entries attached to the first document.
first_doc_metadata = docs[0].metadata
print(len(first_doc_metadata))

1


In [30]:
# Embed the entire first document and print the length of the resulting vector.
# NOTE(review): this text is tens of thousands of characters long; the embedding
# model presumably truncates input beyond its maximum sequence length, so the
# vector may describe only the beginning of the document — confirm.
whole_doc_vector = bge_embeddings.embed_query(docs[0].page_content)
print(len(whole_doc_vector))

384


# 1. Retrieving full documents rather than chunks

In [31]:
# Storage layer that holds the parent (full) documents.
store = InMemoryStore()

# Vector index for the child chunks, embedded with the BGE model
# (OpenAIEmbeddings() was noted as an alternative embedding function).
vectorstore = Chroma(
    embedding_function=bge_embeddings,
    collection_name="full_documents",
)

# Splitter that cuts each document into the child chunks that get indexed.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# No parent splitter is supplied: the small chunks are what gets searched,
# while the documents kept in the docstore are the full originals.
full_doc_retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    child_splitter=child_splitter,
)

In [32]:
# Index the loaded documents. No ids are supplied (ids defaults to None), so
# the retriever assigns the docstore keys itself.
full_doc_retriever.add_documents(docs)

In [33]:
# Keys under which the parent documents are held in the docstore.
[key for key in store.yield_keys()]

['1ffeb640-0d03-4be4-94df-8f43554438c4',
 '40798ab3-c2ce-451f-984f-1cdf02ae97f7',
 'fef62d8d-4e5a-4e62-aa6d-1669f387b4e9',
 'b4c75ca0-78ae-4c38-b9ad-f524cb1f97ee',
 'beb8123c-bc03-4e30-929e-26bd7360ecd9',
 '3d970c8a-7368-4121-a379-d5cf2bbc7b92',
 'f34da6fd-b8f8-4cef-ae2b-0fa169228a73',
 'de0dcb4b-1fb3-462b-9716-67e5e878c7de',
 '22514750-2323-492e-9603-cefae75119e1',
 '886b236f-d9fe-469a-999a-ce993bb46ecc',
 '079360ef-401f-4a58-b2cf-d45d12399507',
 '62472a4b-c6e6-4b83-ab39-6644e1567f54',
 '8dca7025-797d-4e95-bb61-4c45b0751915',
 '3979aff4-f1e8-452d-9fa2-cba1cc3290f2',
 'b1813197-e0b0-4edf-b9cd-1736afc9a184',
 '75d59648-8cd3-4a0b-8066-11699ea31c8c',
 'df9ed6b8-f5d0-439e-b880-eed05cf27d45',
 'ad0bafec-bbd7-4e5d-9639-0667ea089cbd',
 '3c999257-e976-4b73-a582-1ff2f0cd5f37',
 '11ebccfc-3950-4cfb-b2c4-e143681a43e4']

In [37]:
# Query the child-chunk index directly to see which small chunks the
# vector search matches for this query.
query1 = "Data processing apparatus having high speed slave store"
sub_docs = vectorstore.similarity_search(query1, k=2)
print(len(sub_docs))

# Text of the best-matching child chunk, followed by its length in characters.
best_chunk_text = sub_docs[0].page_content
print(best_chunk_text)
print(len(best_chunk_text))

2
According to the invention, there is provided data processing apparatus comprising: a main store capable of holding a sequence of multi-word blocks of instructions; a slave store having a faster access time and a smaller information capacity than the main store; an instruction buffer capable of holding one of the blocks and comprising two sections of unequal size; means for repeatedly scanning
396


In [41]:
# Run the same query through the retriever, which returns parent documents
# rather than the matched child chunks.
retrieved_docs = full_doc_retriever.get_relevant_documents(query1)

# For the first two results, print the text length and then the metadata.
for result_index in (0, 1):
    parent_doc = retrieved_docs[result_index]
    print(len(parent_doc.page_content))
    print(parent_doc.metadata)

14657
{'source': 'doc_txt\\doc_8.text'}
97970
{'source': 'doc_txt\\doc_2.text'}


# 2. Retrieving larger chunks from a document when the document is very big

Sometimes the full documents are too big to retrieve as they are. In that case, we first split the raw documents into larger (parent) chunks, and then split those into smaller (child) chunks. We index the smaller chunks, but on retrieval we return the larger chunks they came from (still not the full documents).

In [60]:
# Set up the pieces for the "larger chunks" retriever: a parent splitter,
# a child splitter, a vector index for the children and a docstore for the parents.

# Parent splitter: creates the big chunks that are stored and returned on retrieval.
# The sizes used here are small; chunk_size=2000 (parent) and chunk_size=400 (child)
# were noted in this cell as alternative settings.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)

# Child splitter: creates the small chunks that are embedded and searched.
# It must produce chunks smaller than the parent splitter does.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)

# Vector index for the child chunks (OpenAIEmbeddings() was noted as an alternative).
vectorstore = Chroma(collection_name="split_parents", embedding_function=bge_embeddings)

# Fresh storage layer for the parent chunks. The previous store object is NOT
# cleared in place: full_doc_retriever was constructed with that same object as
# its docstore, so emptying it would leave that retriever with no parent
# documents to return. Binding a new InMemoryStore also lets this cell run
# without an earlier `store` having been defined.
store = InMemoryStore()

In [61]:
# Retriever that searches the small child chunks but hands back the larger
# parent chunks produced by parent_splitter.
big_chunks_retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [62]:
# Feed the loaded documents to the retriever built with both a parent and a
# child splitter; the docstore and vector index it was given are populated here.
big_chunks_retriever.add_documents(docs)

In [63]:
# Confirm which class the docstore object is.
store_type = type(store)
print(store_type)

<class 'langchain.storage.in_memory.InMemoryStore'>


In [65]:
# Count the entries currently held in the docstore.
parent_keys = list(store.yield_keys())
print(len(parent_keys))

20093


In [69]:
# Search the small-chunk index with the query, asking for four matches.
query1 = "Data processing apparatus having high speed slave store"
num_matches = 4
sub_docs = vectorstore.similarity_search(query1, k=num_matches)

In [71]:
# How many child chunks the similarity search returned.
num_sub_docs = len(sub_docs)
print(num_sub_docs)

4


In [72]:
# Text of the best-matching child chunk.
top_child_chunk = sub_docs[0]
print(top_child_chunk.page_content)

processing unit with fast access to data and


In [88]:
# Run the query through the parent/child retriever; the results are parent
# chunks rather than the small child chunks matched by the vector search.
# NOTE(review): fewer parents than matched children may come back, presumably
# because several children can share one parent — confirm.
retrieved_docs = big_chunks_retriever.get_relevant_documents(query1)

In [89]:
# Number of parent chunks returned for the query.
num_retrieved = len(retrieved_docs)
print(num_retrieved)

2


In [91]:
# Character length of the first returned parent chunk.
first_parent_text = retrieved_docs[0].page_content
print(len(first_parent_text))

87


In [92]:
# Character length of the second returned parent chunk.
second_parent_text = retrieved_docs[1].page_content
print(len(second_parent_text))

95


In [None]:
# Optional follow-up (left disabled): use the retriever inside a RetrievalQA chain.
# from langchain.chains import RetrievalQA
# from langchain.llms import OpenAI

# qa = RetrievalQA.from_chain_type(llm=OpenAI(),
#                                  chain_type="stuff",
#                                  retriever=big_chunks_retriever)

In [94]:
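# Disabled: requires the `qa` chain from the commented-out cell above.
# NOTE: the question below is a placeholder and appears unrelated to the
# documents loaded in this notebook; replace it before enabling.
# query = "What is Langsmith?"
# qa.run(query)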