In [None]:
# Install required libraries
!pip install llama-index
!pip install llama-index-retrievers-bm25
!pip install chromadb
!pip install PyStemmer


In [None]:
# Import necessary libraries
import os
import logging
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
import chromadb
import Stemmer

# Configure logging: request INFO-level output from the standard-library
# `logging` module via its basic (root logger) configuration.
logging.basicConfig(level=logging.INFO)

In [None]:
from llama_index.core import Settings


# Address of the Ollama server that serves both models configured below
OLLAMA_BASE_URL = "http://localhost:11434"

# Embedding model
ollama_embedding = OllamaEmbedding(
    model_name="nomic-embed-text:latest",
    base_url=OLLAMA_BASE_URL,
    ollama_additional_kwargs={"mirostat": 0},
)

# LLM
ollama_llm = Ollama(
    model="llama3.2:latest",
    base_url=OLLAMA_BASE_URL,
    temperature=0.1,
)

# Register both models on the global LlamaIndex Settings object
Settings.embed_model = ollama_embedding
Settings.llm = ollama_llm

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Read the essay file into LlamaIndex documents
essay_reader = SimpleDirectoryReader(
    input_files=["../data/paul_graham_essay3.txt"],
)
documents = essay_reader.load_data()

# Split the documents into nodes with a SentenceSplitter (chunk_size=512)
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)


In [None]:
# Build a BM25 retriever over the nodes, stemming with the English stemmer
english_stemmer = Stemmer.Stemmer("english")
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    stemmer=english_stemmer,
    language="english",
)

# Write the retriever to disk, then load it back from the same directory
BM25_PERSIST_DIR = "./bm25_retriever"
bm25_retriever.persist(BM25_PERSIST_DIR)
loaded_bm25_retriever = BM25Retriever.from_persist_dir(BM25_PERSIST_DIR)


In [None]:
# Imports used by this cell
import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.chroma import ChromaVectorStore

# Initialize a docstore to store nodes
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

# Configure Chroma vector store (persisted on disk under ./chroma_db)
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("dense_vectors")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create the BM25 (sparse) retriever on top of the docstore
bm25_retriever = BM25Retriever.from_defaults(docstore=docstore, similarity_top_k=2)

# VectorStoreIndex has no `docstore=` / `vector_store=` parameters; the stores
# must be supplied through a StorageContext, otherwise the Chroma collection
# is never attached to the index.
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
)

# The Chroma collection survives across runs, so only embed the nodes when it
# is still empty; re-inserting on every run would store duplicate vectors.
# Delete ./chroma_db to force a rebuild after changing the source documents.
nodes_to_embed = nodes if chroma_collection.count() == 0 else []

# Create the VectorStoreIndex and its (dense) retriever
index = VectorStoreIndex(nodes=nodes_to_embed, storage_context=storage_context)
vector_retriever = index.as_retriever(similarity_top_k=2)

# Combine retrievers into a QueryFusionRetriever.
# num_queries=1 uses only the original query; no extra queries are generated.
hybrid_retriever = QueryFusionRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    num_queries=1,
    use_async=False,
)

# Query the hybrid retriever
query = "What happened at Viaweb and Interleaf?"
retrieved_nodes = hybrid_retriever.retrieve(query)

# Display the results
for node in retrieved_nodes:
    print(f"Node ID: {node.node_id}")
    print(f"Text: {node.text}\n")


In [None]:
# Run the hybrid (BM25 + vector) retriever on the example question
retrieved_nodes = hybrid_retriever.retrieve("What happened at Viaweb and Interleaf?")

# Show the ID and text of every node that came back
for node in retrieved_nodes:
    print(f"Node ID: {node.node_id}", f"Text: {node.text}\n", sep="\n")


In [None]:
# Imports used by this cell
import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.chroma import ChromaVectorStore

# Save the docstore
docstore.persist("./docstore.json")

# Reload docstore
docstore = SimpleDocumentStore.from_persist_path("./docstore.json")

# Reload Chroma vector store from its on-disk location
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("dense_vectors")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Reload nodes from the docstore
nodes = list(docstore.docs.values())

# VectorStoreIndex has no `docstore=` / `vector_store=` parameters; the
# reloaded stores are attached through a StorageContext instead.
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
)

# Reuse the vectors already persisted in Chroma. The nodes are embedded again
# only when the collection is empty, so a reload neither recomputes embeddings
# nor inserts duplicates.
nodes_to_embed = nodes if chroma_collection.count() == 0 else []

# Recreate VectorStoreIndex on top of the reloaded docstore and vector store
index = VectorStoreIndex(
    nodes=nodes_to_embed,
    storage_context=storage_context,
)


In [None]:
# Collect the IDs of everything held in the docstore
doc_ids = list(docstore.docs)
print(f"Number of documents in docstore: {len(doc_ids)}")

# List the stored IDs, one per line, under a header line
print("Document IDs in docstore:", *doc_ids, sep="\n")


In [None]:
# Check if the vector store is operational
print(f"Vector store contains collection: {chroma_collection.name}")

# Report how many entries the collection holds.
# `Collection.get()` returns a dict of result fields (ids, documents, ...), so
# `len()` of it is the number of fields, not of vectors; `count()` returns the
# number of stored entries.
print(f"Number of vectors in vector store: {chroma_collection.count()}")


In [None]:
# Sanity check: retrieve the top 2 nodes for a sample question from the index
query = "Who is the author of this essay?"
retriever = index.as_retriever(similarity_top_k=2)
retrieved_nodes = retriever.retrieve(query)

# Show the ID and text of every node that came back
print("Retrieved Nodes:")
for node in retrieved_nodes:
    print(f"Node ID: {node.node_id}", f"Text: {node.text}\n", sep="\n")
