In [None]:
# Adapted from the LangChain "parent document retriever" how-to guide:
# https://python.langchain.com/v0.2/docs/how_to/parent_document_retriever/

In [None]:
import weaviate

# HTTP endpoint of the local Weaviate instance; shared by both clients below.
WEAVIATE_HOST = "localhost"
WEAVIATE_PORT = 8081

# v3-style client: this is the object passed to the langchain_community
# Weaviate vector store later in the notebook.
client = weaviate.Client(f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}")
# v4 client: used for collection management (delete/create) in this notebook.
# The port is passed as an integer, matching connect_to_local's signature.
weaviate_client = weaviate.connect_to_local(host=WEAVIATE_HOST, port=WEAVIATE_PORT)
# Name of the Weaviate collection that holds the indexed chunks.
index_name = "Phoenix_test"

In [2]:
# Start from a clean slate, but drop only the collection this notebook owns:
# delete_all() would also wipe every unrelated collection on the instance.
weaviate_client.collections.delete(index_name)
# weaviate_client.collections.list_all()  # uncomment to inspect what remains

In [None]:
from weaviate.classes.config import Configure
from weaviate.classes.config import Property, DataType

# Vectorizer settings for the collection. The text2vec-* modules must be
# enabled on the Weaviate Docker container for this to work correctly:
#   -e ENABLE_MODULES=text2vec-ollama
ollama_vectorizer = Configure.Vectorizer.text2vec_ollama(
    api_endpoint="http://host.docker.internal:11434",
    model="nomic-embed-text",
)

# Optional extras that are currently disabled:
#   generative_config=Configure.Generative.ollama(
#       api_endpoint="http://host.docker.internal:11434",
#       model="jonphi",
#   )
#   properties=[
#       Property(name="page_content", data_type=DataType.TEXT),
#       Property(name="source", data_type=DataType.INT),
#   ]
weaviate_client.collections.create(
    index_name,
    vectorizer_config=ollama_vectorizer,
)

In [4]:
from langchain_community.document_loaders import TextLoader

# NOTE: absolute, machine-specific paths; adjust them before running elsewhere.
source_paths = (
    "/home/jonot480/Documents/paulgraham.txt",
    "/home/jonot480/Documents/Coke OCG.txt",
)
loaders = [TextLoader(path) for path in source_paths]
# Flatten the per-file load results into a single list of documents.
docs = [document for loader in loaders for document in loader.load()]

In [6]:
from langchain_chroma import Chroma
from langchain_community.vectorstores.weaviate import Weaviate
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# Metadata key under which ParentDocumentRetriever records each child chunk's
# parent id ("doc_id" is the retriever's default id_key).
PARENT_ID_KEY = "doc_id"

# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks.
# `attributes` lists the extra properties returned with every search hit.
# Without the parent id in that list the hits come back with their text only,
# the retriever cannot map them back to their parents, and retriever.invoke()
# returns an empty list. The property is created when the first chunks are
# added, so add documents before searching.
vectorstore = Weaviate(
    client=client,
    index_name=index_name,
    text_key="text",
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    attributes=[PARENT_ID_KEY],
)

# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    id_key=PARENT_ID_KEY,
)

In [7]:
# No explicit ids are supplied (ids defaults to None).
retriever.add_documents(docs)

In [None]:
# Show every key currently held by the parent docstore.
[*store.yield_keys()]

In [None]:
# Query the vector store directly: these hits are the small child chunks.
sub_docs = vectorstore.similarity_search_by_text("effective date", k=4)
for child_doc in sub_docs:
    print(child_doc)

In [None]:
# Text of the first child chunk returned by the search.
first_child = sub_docs[0]
print(first_child.page_content)

In [20]:
# Same query, this time through the parent-document retriever.
query_text = "effective date"
retrieved_docs = retriever.invoke(query_text)

In [None]:
# Character count of the first retrieved document.
first_parent = retrieved_docs[0]
len(first_parent.page_content)

In [22]:
# Embedding model handed to the child-chunk vector store.
ollama_embeddings = OllamaEmbeddings(model='nomic-embed-text')

# Splitter that creates the parent documents.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# Splitter that creates the child documents; its chunks should be smaller
# than the parent chunks.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Vector store that indexes the child chunks.
vectorstore = Chroma(
    embedding_function=ollama_embeddings,
    collection_name="split_parents",
)
# Storage layer for the parent documents.
store = InMemoryStore()

In [23]:
# Retriever that uses both splitters: parent_splitter for the parent
# documents and child_splitter for the child documents.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [24]:
# Add the loaded documents through the split-parent retriever.
retriever.add_documents(documents=docs)

In [None]:
# Number of keys held by the parent docstore.
sum(1 for _ in store.yield_keys())

In [26]:
# Direct vector-store search: returns the small child chunks.
sub_docs = vectorstore.similarity_search(query="effective date")

In [None]:
# Text of the first child chunk returned by the search.
first_child = sub_docs[0]
print(first_child.page_content)

In [None]:
# Query the split-parent retriever built above. Without this call,
# `retrieved_docs` would still hold the results of the earlier
# full-document retriever run.
retrieved_docs = retriever.invoke("effective date")
print(retrieved_docs[0].page_content)