In [1]:
! pip install --upgrade --quiet  weaviate-client

In [None]:
import weaviate

# Two handles onto the same local Weaviate instance are kept side by side:
#   * weaviate_client - the weaviate.Client handle that is passed to the LangChain hybrid retriever
#   * client          - the connect_to_local (v4) handle used for the client.collections.* calls
# Make sure Weaviate is running in docker before executing this cell, e.g.:
# docker run -p 8081:8080 -p 50051:50051 -e ENABLE_MODULES=text2vec-ollama cr.weaviate.io/semitechnologies/weaviate:1.25.1

weaviate_client = weaviate.Client("http://localhost:8081")
# Host/port are passed by keyword and the port as an int (it was the string "8081").
# The gRPC port is left at the client's default -- presumably the 50051 mapped above; confirm if you remap it.
client = weaviate.connect_to_local(host="localhost", port=8081)  # v4


In [53]:
# WARNING: only uncomment and run this cell if you want to start from scratch -- it deletes ALL collections!
#client.collections.delete_all()
#print(client.collections.list_all())

In [4]:
from langchain_community.retrievers import (
    WeaviateHybridSearchRetriever,
)

from weaviate.classes.config import Configure

# Name of the Weaviate collection (index) used throughout this notebook.
# Note: from_documents will dynamically create an index *without* a vectorizer, and a
# vectorizer is required for the hybrid search retriever to work.
# Example of such an auto-generated index name:
#   "LangChain_9ffcc707dbbf428d8c7e147134476d07"
index_name = "JonsTest"


In [None]:
# Create the collection with an Ollama-backed text2vec vectorizer.
# Guarded by exists() so that re-running this cell (or a Restart & Run All against an
# already-populated instance) does not fail on a collection that is already there.
if not client.collections.exists(index_name):
    client.collections.create(
        index_name,
        # The docker container must be started with -e ENABLE_MODULES=text2vec-ollama
        # for text2vec* to work correctly.
        vectorizer_config=Configure.Vectorizer.text2vec_ollama(
            model="nomic-embed-text",
            api_endpoint="http://host.docker.internal:11434",
        ),
    )

In [None]:
# Misc debugging/tests used while getting the open-webui functionality running:
# fetch objects back out of the collection and gather them into a list.

collection = client.collections.get(index_name)
response = collection.query.fetch_objects()

# Keep the whole returned object; collect obj.properties["text"] instead
# if only the stored text is of interest.
docs = [obj for obj in response.objects]

print(docs)

In [None]:
# Final docs debugging/test: wrap every stored object in a proper LangChain "Document"
# so that the right type is passed to the weaviate retriever.
# Update: this later turned out to be unnecessary, because the *from_documents call
# already adds the docs to the collection.

from langchain_core.documents import Document

# NOTE(review): `uuid` may not be a declared Document field, in which case this kwarg
# could be silently dropped -- confirm against the installed langchain_core version.
docs = [
    Document(
        uuid=stored.uuid,
        page_content=stored.properties.get("text"),
        metadata=stored.properties,
    )
    for stored in collection.iterator()
]

print(docs)


In [98]:

# Build the hybrid search retriever on top of weaviate_client (the weaviate.Client handle).
# Ctrl-click WeaviateHybridSearchRetriever to see the available params (e.g. K, alpha, etc);
# the default alpha is .5, so override it to 0 here if looking for pure keyword search.
retriever_kwargs = {
    "client": weaviate_client,
    "index_name": index_name,
    "text_key": "text",
    "attributes": [],
    "create_schema_if_missing": True,
}

retriever = WeaviateHybridSearchRetriever(**retriever_kwargs)


In [None]:

import os

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter #option to add recursive here

# Text file to ingest. Set the SOURCE_TEXT_PATH environment variable to point at your own
# copy; the original author's local path is kept as the fallback.
source_text_path = os.environ.get(
    "SOURCE_TEXT_PATH", "/home/jonot480/Documents/paulgraham.txt"
)

# Load the file, then split it into chunks for the retriever.
loader = TextLoader(source_text_path)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=100,chunk_overlap=0)
docs = text_splitter.split_documents(documents)


In [None]:
# One-off test: push the split chunks (`docs`) into the collection through the retriever.
# This is not needed in open-webui, since there the from_documents method is run to
# populate the collection.
# NOTE(review): re-running this cell presumably inserts the same chunks again -- confirm
# before executing it more than once against the same collection.
retriever.add_documents(docs)

In [None]:
query = "what is the effective date?"

# score=True asks the retriever to return the hybrid-search score with each hit.
hits = retriever.invoke(query, score=True)

# Build the dictionary needed for the open-webui citations pieces.
# The score is nested under the "_additional" key of each document's metadata (not at the
# top level), so it is read from there; a hit without it yields None instead of raising.
# Not sure why open-webui uses the double bracket in the dictionary....need to debug this
# on the citations piece.
result = {
    "distances": [[(d.metadata.get("_additional") or {}).get("score") for d in hits]],
    "documents": [[d.page_content for d in hits]],
    "metadatas": [[d.metadata for d in hits]],
}

print(result)

In [None]:
#random tests to see what open-webui is doing with its "for d in result" gets...basically a way to grab data from the dictionary

#print([d.metadata.get("_additional") for d in result])
#print([[d.metadata.get("_additional").get('score') for d in result]]) #open-webui uses double brackets for some reason....haven't debugged why 
#print([d.page_content for d in result])
#print([d.metadata for d in result])
