In [1]:
from llama_index.core import SimpleDirectoryReader, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
folder_path = "./docs"
docs = SimpleDirectoryReader(input_dir=folder_path, required_exts=[".pdf"]).load_data()


Overwriting cache for 0 154




In [3]:
# Split into chunks
node_parser = SimpleNodeParser.from_defaults(chunk_size=500, chunk_overlap=50)
nodes = node_parser.get_nodes_from_documents(docs)

In [4]:

from sentence_transformers import SentenceTransformer
model_path = "D:\Download\model"  
model = SentenceTransformer(model_path)

  model_path = "D:\Download\model"
No sentence-transformers model found with name D:\Download\model. Creating a new one with mean pooling.


In [5]:
import chromadb
import os
import time
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
from llama_index.llms.gemini import Gemini
from llama_index.core import PromptTemplate
llm = Gemini(model="models/gemini-1.5-flash")
# Initialize ChromaDB
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="pdf_embeddings")

prompt_template = PromptTemplate(
    "Extract the main case type(s) from the following legal document. "
    "Return only a JSON list of keywords (no sentences or explanations). "
    "Example output: [\"defamation\", \"property dispute\", \"criminal case\"]\n\n"
    "Document:\n{node}"
)
# Add embeddings with metadata
for i, node in enumerate(nodes):
    formatted_prompt = prompt_template.format(node=node.text)
    response = llm.complete(formatted_prompt)
    raw_text = response.text.strip()
    keywords_list = raw_text.split(",")  # Split by commas
    keywords_list = [kw.strip() for kw in keywords_list if kw.strip()]
    # Retry logic for handling 429 errors
    # for attempt in range(3):  # Retry up to 3 times
    #     try:
    #         response = llm.complete(formatted_prompt)
    #         raw_text = response.text.strip()
    #         # Process response to extract keywords
    #         keywords_list = raw_text.split(",")  # Split by commas
    #         keywords_list = [kw.strip() for kw in keywords_list if kw.strip()]

    #         break  # Success, exit retry loop
    #     except Exception as e:
    #         if "429" in str(e):  # Check if it's a rate limit error
    #             print(f"Rate limit hit. Retrying in {2**attempt} seconds...")
    #             time.sleep(2**attempt)  # Exponential backoff (2s, 4s, 8s)
    #         else:
    #             raise e  # Other errors should not be ignored

    collection.add( 
        ids=[str(i)],
        documents=[node.text],
        metadatas=[{"Case Type": ", ".join(keywords_list)}]
    )
    if i>10:
        break
# Print all metadata
retrieved_data = collection.get(include=["metadatas"])

# Iterate through each metadata entry
for i, metadata in enumerate(retrieved_data["metadatas"]):
    print(f"Node {i+1} Metadata: {metadata}")

Node 1 Metadata: {'Case Type': '```json\n["civil case"]\n```'}
Node 2 Metadata: {'Case Type': '```json\n["contract dispute", "property dispute"]\n```'}
Node 3 Metadata: {'Case Type': '```json\n["specific performance", "contract dispute", "recovery of money"]\n```'}
Node 4 Metadata: {'Case Type': '```json\n["contract dispute", "property dispute"]\n```'}
Node 5 Metadata: {'Case Type': '```json\n["fraud", "contract dispute"]\n```'}
Node 6 Metadata: {'Case Type': '```json\n[]\n```'}
Node 7 Metadata: {'Case Type': '```json\n["Specific Performance", "Contract Dispute"]\n```'}
Node 8 Metadata: {'Case Type': '["limitation"]'}
Node 9 Metadata: {'Case Type': '```json\n["property dispute", "eviction"]\n```'}
Node 10 Metadata: {'Case Type': '```json\n["specific performance", "property dispute", "contract dispute"]\n```'}
Node 11 Metadata: {'Case Type': '```json\n["contract dispute", "breach of contract"]\n```'}
Node 12 Metadata: {'Case Type': '```json\n["specific performance", "property dispute"]\

In [7]:

from llama_index.core import Settings
Settings.llm = None


LLM is explicitly disabled. Using MockLLM.


In [8]:
print(collection.count())

12


In [None]:
from llama_index.core import VectorStoreIndex, KeywordTableIndex, StorageContext, Document
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb
from sentence_transformers import SentenceTransformer
embed_model = HuggingFaceEmbedding(model_name=model_path)
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

chroma_docs = collection.get(include=["documents", "metadatas"])

# Convert retrieved data into LlamaIndex Document objects
docss = [
    Document(
        text=doc_text,
        metadata=meta,
        id_=doc_id  # Assign the same ID from ChromaDB
    )
    for doc_text, meta, doc_id in zip(chroma_docs["documents"], chroma_docs["metadatas"], chroma_docs["ids"])
]

vector_index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context,embed_model=embed_model)
keyword_index = KeywordTableIndex.from_documents(docss)


No sentence-transformers model found with name D:\Download\model. Creating a new one with mean pooling.


In [10]:
def reciprocal_rank_fusion(results1, results2, k=60):
    fused_scores = {}
    for rank, result in enumerate(results1, start=1):
        doc_id = result.node.node_id
        fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1 / (rank + k)

    for rank, result in enumerate(results2, start=1):
        doc_id = result.node.node_id  
        fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1 / (rank + k)
    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_results


In [11]:
query = "civil case"
vector_results = vector_index.as_retriever(similarity_top_k=5).retrieve(query)
keyword_results = keyword_index.as_retriever(similarity_top_k=5).retrieve(query)


In [None]:
for rank, result in enumerate(vector_results, start=1):
    doc_id = result.node.node_id
    print(doc_id)
for rank, result in enumerate(keyword_results, start=1):
    doc_id = result.node.id_
    print(doc_id)

2
8
11
7
1
152ae952-dd7e-48d5-85a1-44c9a868d215
5acf9388-6922-4395-93bb-5699ab59fa81
d822d58a-5a83-40a5-8957-39841c5bc5b1
f3b1b596-70be-444a-8e69-676d1136b8bc
36deb1a9-4e38-4feb-ba90-7e294ce6d32b
47065b8f-c128-4ad9-b2ec-f0337b5244b4
f3d74c77-0ea4-4d63-92e1-4acb1205f829
a65fff51-c4d4-447d-87dc-52bf8a7f0b24
d7c1e9aa-7720-49da-ad6a-92e302655d1c
04a38381-2c75-4748-9def-85749a5a799e
[NodeWithScore(node=TextNode(id_='152ae952-dd7e-48d5-85a1-44c9a868d215', embedding=None, metadata={'Case Type': '```json\n["specific performance", "property dispute", "contract dispute"]\n```'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9', node_type='4', metadata={'Case Type': '```json\n["specific performance", "property dispute", "contract dispute"]\n```'}, hash='d64237a483e93c8e01c075ba7f447abb29325c7ca46caf34a8d24a057d4d29c0')}, metadata_template='{key}: {value}', metadata_separator='\n', text='10 \n \n18.11.1997 by

In [13]:
from llama_index.core import MetadataFilters, ExactMatchFilter

# Define a metadata filter (e.g., only search in AI-related documents)
metadata_filter = MetadataFilters(filters=[ExactMatchFilter(key="category", value="AI Research")])

# Apply filter to retriever
retriever = keyword_index.as_retriever(filters=metadata_filter)

# Retrieve documents that match the keyword and metadata
retrieved_docs = retriever.retrieve("machine learning applications")


ImportError: cannot import name 'MetadataFilters' from 'llama_index.core' (C:\Users\vivek\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\llama_index\core\__init__.py)

In [None]:
from llama_index.core.retrievers import BaseRetriever
from llama_index.core import VectorStoreIndex, KeywordTableIndex

# Create vector and keyword retrievers
vector_retriever = vector_index.as_retriever(similarity_top_k=5)
keyword_retriever = keyword_index.as_retriever(similarity_top_k=5)

# Hybrid retriever (combining both)
hybrid_retriever = BaseRetriever.from_retrievers([vector_retriever, keyword_retriever])

# Retrieve documents from both methods
retrieved_docs = reciprocal_rank_fusion(results1, results2, k=60):

# Use RRF to re-rank the results
reranked_docs = rrf.postprocess_nodes(retrieved_docs)

# Print results
for doc in reranked_docs:
    print(f"Re-Ranked Score: {doc.score}\nText: {doc.node.text}\n")


ImportError: cannot import name 'ReciprocalRankFusion' from 'llama_index.core.postprocessor' (C:\Users\vivek\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\llama_index\core\postprocessor\__init__.py)

In [None]:
from llama_index.core.indices.keyword_table import KeywordTableIndex
help(KeywordTableIndex.from_documents)

Help on method from_documents in module llama_index.core.indices.base:

from_documents(documents: Sequence[llama_index.core.schema.Document], storage_context: Optional[llama_index.core.storage.storage_context.StorageContext] = None, show_progress: bool = False, callback_manager: Optional[llama_index.core.callbacks.base.CallbackManager] = None, transformations: Optional[List[llama_index.core.schema.TransformComponent]] = None, **kwargs: Any) -> ~IndexType class method of llama_index.core.indices.keyword_table.base.KeywordTableIndex
    Create index from documents.

    Args:
        documents (Optional[Sequence[BaseDocument]]): List of documents to
            build the index from.

