In [None]:
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from typing import List,Optional

In [None]:
import  nest_asyncio
nest_asyncio.apply()

In [None]:
documents = SimpleDirectoryReader('data').load_data()
print(len(documents))
print(f"Document Metadata: {documents[0].metadata}")

In [None]:
splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=100)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Length of nodes : {len(nodes)}")
print(f"get the content for node 0 :{nodes[0].get_content(metadata_mode='all')}")

In [None]:
import chromadb
db = chromadb.PersistentClient(path="./chroma_db_mistral")
chroma_collection = db.get_or_create_collection("multidocument-agent")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings
#
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text")
Settings.chunk_size = 1024

In [None]:
from llama_index.llms.ollama import Ollama
Settings.llm = Ollama(model="granite3.1-dense",request_timeout=500)

In [None]:
name = "BERT_arxiv"
vector_index = VectorStoreIndex(nodes,storage_context=storage_context)
vector_index.storage_context.vector_store.persist(persist_path="/content/chroma_db")
#
# Define Vectorstore Autoretrieval tool
def vector_query(query:str,page_numbers:Optional[List[str]]=None)->str:
  '''
  perform vector search over index on
  query(str): query string needs to be embedded
  page_numbers(List[str]): list of page numbers to be retrieved,
                          leave blank if we want to perform a vector search over all pages
  '''
  page_numbers = page_numbers or []
  metadata_dict = [{"key":'page_label',"value":p} for p in page_numbers]
  #
  query_engine = vector_index.as_query_engine(similarity_top_k =4,
                                              filters = MetadataFilters.from_dicts(metadata_dict,
                                                                                    condition=FilterCondition.OR)
                                              )
  #
  response = query_engine.query(query)
  return response
#
#llamiondex FunctionTool wraps any python function we feed it
vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}",
                                              fn=vector_query)
# Prepare Summary Tool
summary_index = SummaryIndex(nodes)
summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize",
                                                      se_async=True,)
summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",
                                                    query_engine=summary_query_engine,
                                                  description=("Use ONLY IF you want to get a holistic summary of the documents."
                                              "DO NOT USE if you have specified questions over the documents."))

In [None]:
response = Settings.llm.predict_and_call([vector_query_tool],
                                "Summarize the content in page number 2",
                                verbose=True)

In [None]:
def get_doc_tools(file_path:str,name:str)->str:
  '''
  get vector query and sumnmary query tools from a document
  '''
  #load documents
  documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
  print(f"length of nodes")
  splitter = SentenceSplitter(chunk_size=1024,chunk_overlap=100)
  nodes = splitter.get_nodes_from_documents(documents)
  print(f"Length of nodes : {len(nodes)}")
  #instantiate Vectorstore
  vector_index = VectorStoreIndex(nodes,storage_context=storage_context)
  vector_index.storage_context.vector_store.persist(persist_path="/content/chroma_db")
  #
  # Define Vectorstore Autoretrieval tool
  def vector_query(query:str,page_numbers:Optional[List[str]]=None)->str:
    '''
    perform vector search over index on
    query(str): query string needs to be embedded
    page_numbers(List[str]): list of page numbers to be retrieved,
                            leave blank if we want to perform a vector search over all pages
    '''
    page_numbers = page_numbers or []
    metadata_dict = [{"key":'page_label',"value":p} for p in page_numbers]
    #
    query_engine = vector_index.as_query_engine(similarity_top_k =2,
                                                filters = MetadataFilters.from_dicts(metadata_dict,
                                                                                     condition=FilterCondition.OR)
                                                )
    #
    response = query_engine.query(query)
    return response
  #
  #llamiondex FunctionTool wraps any python function we feed it
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}",
                                                fn=vector_query)
  # Prepare Summary Tool
  summary_index = SummaryIndex(nodes)
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize",
                                                       se_async=True,)
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",
                                                     query_engine=summary_query_engine,
                                                    description=("Use ONLY IF you want to get a holistic summary of the documents."
                                                "DO NOT USE if you have specified questions over the documents."))
  return vector_query_tool,summary_query_tool

In [None]:
import os
root_path = "data"
file_name = []
file_path = []
for files in os.listdir(root_path):
  if files.endswith(".pdf"):
    file_name.append(files.split(".")[0])
    file_path.append(os.path.join(root_path,files))
#
print(file_name)
print(file_path)

In [None]:
papers_to_tools_dict = {}
for name,filename in zip(file_name,file_path):
  vector_query_tool,summary_query_tool = get_doc_tools(filename,name)
  papers_to_tools_dict[name] = [vector_query_tool,summary_query_tool]

In [None]:
initial_tools = [t for f in file_name for t in papers_to_tools_dict[f]]
initial_tools

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex
#
obj_index = ObjectIndex.from_objects(initial_tools,index_cls=VectorStoreIndex)

In [None]:
obj_retriever = obj_index.as_retriever(similarity_top_k=2)
tools = obj_retriever.retrieve("compare and contrast the papers self rag and corrective rag")
#
print(tools[0].metadata)
print(tools[1].metadata)

In [None]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner
#
agent_worker = FunctionCallingAgentWorker.from_tools(tool_retriever=obj_retriever,
                                                     llm=Settings.llm,
                                                     system_prompt="""You are an agent designed to answer queries over a set of given papers.
                                                     Please always use the tools provided to answer a question.Do not rely on prior knowledge.""",
                                                     verbose=True)
agent = AgentRunner(agent_worker)

In [None]:
response = agent.query("Compare and contrast self rag and crag.")
print(str(response))

In [None]:
response = agent.query("Summarize the IonIdea company.")
print(str(response))

In [None]:
response = agent.query("What was the first thing i told to you?")
print(str(response))

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine

llm = Ollama(model="llama3.2") 
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

Settings.llm = llm
Settings.embed_model = embed_model

documents = SimpleDirectoryReader(input_dir="data", required_exts=[".pdf"]).load_data()
index = VectorStoreIndex.from_documents(documents)

retriever = index.as_retriever()
memory = ChatMemoryBuffer.from_defaults(token_limit=4500)

chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever, 
    memory=memory, 
    llm=llm,
    system_prompt=(
        "You are a retrieval-augmented AI assistant. Your task is to decide whether a query requires document retrieval. "
        "If the user asks about general knowledge, answer directly without retrieving documents. "
        "If the query is about specific documents, reports, or evidence, retrieve relevant documents and provide citations. "
        "At the end of your response, always indicate whether document retrieval was used. "
        "Respond in a clear, informative, and professional manner."
    )
)

def chat_with_proof(query):
    print(f"\n🔹 User: {query}")
    response_stream = chat_engine.stream_chat(query)

    print("\n🔹 AI Response:")
    for chunk in response_stream.response_gen:
        print(chunk, end="", flush=True)

    print("\n")  

# 6. Run Example Queries
chat_with_proof("Summarize the attached PDFs.") 
chat_with_proof("What are the key insights from the documents?") 
chat_with_proof("Explain Quantum Computing.") 
chat_with_proof("How does this relate to my previous question?") 

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.node_parser import SentenceSplitter
import re

# 1. Initialize AI Model & Embeddings
llm = Ollama(model="llama3.2")
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# 2. Set Global AI Settings
Settings.llm = llm
Settings.embed_model = embed_model

# 3. Load and Index PDFs with proper metadata extraction
documents = SimpleDirectoryReader(
    input_dir="data", 
    required_exts=[".pdf"],
    filename_as_id=True,  # Use filename as document ID
    recursive=True       # Include subdirectories if any
).load_data()

# Print document metadata to verify it's being captured correctly
print(f"Loaded {len(documents)} documents")
for doc in documents[:2]:  # Print first 2 docs as a sample
    print(f"Document: {doc.metadata.get('file_name', 'No filename')} | "
          f"Pages: {doc.metadata.get('page_label', 'Unknown')}")

# 4. Fixed text parsing with compatible parameters
node_parser = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=200,
    separator=" ",       # Space as separator (helps with PDF text)
    paragraph_separator="\n\n"
)

# 5. Create the index with better metadata handling
index = VectorStoreIndex.from_documents(
    documents,
    node_parser=node_parser,
    show_progress=True
)

# Configure retriever to include more context for better answers
retriever = index.as_retriever(
    similarity_top_k=3
)

# 6. Create Memory-Enabled Chat Engine with optimized prompt
memory = ChatMemoryBuffer.from_defaults(token_limit=4500)

system_prompt = """
You are an AI assistant that helps users understand document content. Follow these rules strictly:

1. FIRST STEP: Determine if the query is about the document content:
   - If YES: Use document retrieval to find relevant information
   - If NO: Politely explain you're configured to only answer questions about the provided documents

2. When using document retrieval:
   - Cite sources clearly with the format [Source: {file_name}, page {page_label}]
   - Include document references only when directly quoting or paraphrasing
   - End your response with a "Sources:" section listing all documents referenced

3. Your response format:
   - Begin with "Retrieval used: Yes" or "Retrieval used: No" as appropriate
   - Be clear, concise and professional
   - Use multi-step reasoning when answering complex questions

4. You must REFUSE to answer questions unrelated to the documents, even if you know the answer.

NOTE: Always check if the metadata contains file_name and page_label before using them in citations.
"""

chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever,
    memory=memory,
    llm=llm,
    system_prompt=system_prompt,
    verbose=True
)

# 7. Function to clean up garbled text for display
def clean_text(text):
    # Remove excessive whitespace
    cleaned = re.sub(r'\s+', ' ', text)
    # Fix common OCR errors (like "t e x t" → "text")
    cleaned = re.sub(r'(\w) (\w) (\w)', r'\1\2\3', cleaned)
    cleaned = re.sub(r'(\w) (\w)', r'\1\2', cleaned)
    return cleaned.strip()

# 8. Improved streaming chat function with better metadata display
def chat_with_proof(query):
    print(f"\nUser: {query}")
    
    # Get the AI response using streaming mode
    response_stream = chat_engine.stream_chat(query)
    
    print("\nAI Response:")
    for chunk in response_stream.response_gen:
        print(chunk, end="", flush=True)
    print("\n")
    
    # Enhanced reference display with proper metadata
    if response_stream.source_nodes:
        print("Sources Used:")
        for i, node in enumerate(response_stream.source_nodes, start=1):
            metadata = node.metadata or {}
            
            # Extract proper metadata
            filename = metadata.get("file_name", 
                     metadata.get("filename", 
                     metadata.get("source", "Unknown Document")))
            
            page = metadata.get("page_label", 
                  metadata.get("page_number", 
                  metadata.get("page", "Unknown Page")))
            
            # Clean up the text excerpt
            excerpt = clean_text(node.text[:300])
            
            print(f"Source {i}: {filename} (Page {page})")
            print(f"Excerpt: {excerpt}...")
            print("-" * 50)
    else:
        print("Retrieval used: No")

# 9. Interactive session
if __name__ == "__main__":
    print("Document Q&A System Initialized. Type 'exit' to quit.")
    print("This system is configured to answer questions about the documents in the 'data' folder.")
    
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("Goodbye!")
            break
        
        chat_with_proof(user_query)

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor, TitleExtractor, KeywordExtractor
import chromadb
import os
import re
import uuid

# 1. Initialize AI Model & Embeddings
llm = Ollama(model="llama3.2")
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# 2. Set Global AI Settings
Settings.llm = llm
Settings.embed_model = embed_model

# 3. Initialize ChromaDB Client
chroma_client = chromadb.PersistentClient(path="./chroma_db_IonIdea")
collection_name = "document_knowledge_base"

collection = chroma_client.get_or_create_collection(collection_name)

# 4. Initialize Chat Memory with Token Limit
chat_memory = ChatMemoryBuffer(token_limit=2048)

# 5. Load and Process Documents (Only if ChromaDB is empty)
if collection.count() == 0:
    print("No existing collection found. Extracting knowledge...")
    
    documents = SimpleDirectoryReader(
        input_dir="data", 
        required_exts=[".pdf"],
        filename_as_id=True,
        recursive=True
    ).load_data()
    
    print(f"Loaded {len(documents)} documents")
    
    # 6. Extract structured knowledge (Summaries + Q&A + Knowledge Graph)
    node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200, separator=" ", paragraph_separator="\n\n")
    extractors = [
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        SummaryExtractor(summaries=["self"]),
        KeywordExtractor(),
    ]
    index = VectorStoreIndex.from_documents(documents, node_parser=node_parser, extractors=extractors, show_progress=True)
    
    retriever = index.as_retriever(similarity_top_k=3)
    
    # Store parsed knowledge into ChromaDB
    for doc in documents:
        metadata = doc.metadata
        filename = metadata.get("file_name", "Unknown File")
        page = metadata.get("page_label", "Unknown Page")
        content = doc.text
        
        doc_id = str(uuid.uuid4())  # Generate unique ID
        
        collection.add(
            ids=[doc_id],
            documents=[content],
            metadatas=[{"file_name": filename, "page_label": page}]
        )
        print(f"Stored: {filename} - Page {page}")
else:
    print("Existing collection found. Skipping extraction.")

# 7. System Prompt to Allow Casual Conversations but Restrict Answers to Context
system_prompt = """
You are an AI assistant designed to answer document-related queries. Your rules:

1. Casual conversations (e.g., greetings) are allowed.
2. You strictly refuse to answer any question that is outside the provided documents.
3. When retrieving information, cite sources correctly with [Source: {file_name}, Page {page_label}].
4. If the query has no relevant document, politely decline to answer.
5. You must generate responses based on retrieved information only when absolutely necessary.
"""

# 8. Function for retrieving knowledge from ChromaDB
def retrieve_knowledge(query):
    results = collection.query(query_texts=[query], n_results=3)
    
    sources = []
    for doc, meta_list in zip(results["documents"], results["metadatas"]):
        if not meta_list:  # Ensure there is metadata
            continue
        meta = meta_list[0]  # Extract the first metadata dictionary
        filename = meta.get("file_name", "Unknown File")
        page = meta.get("page_label", "Unknown Page")
        sources.append((filename, page, doc[0]))  # doc[0] since documents are also lists
    
    return sources

# 9. Function to clean retrieved text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    return text.strip()

# 10. Chat Function with Proper Citations
def chat_with_proof(query):
    print(f"\nUser: {query}")
    
    sources = retrieve_knowledge(query)
    
    if not sources:
        print("AI: I'm sorry, but I can only answer questions related to the provided documents.")
        return
    
    print("\nAI Response:")
    
    response_context = ""
    for filename, page, excerpt in sources:
        excerpt = clean_text(excerpt[:300])
        response_context += f"{excerpt}\n"
    
    # AI Generates a response based on retrieved knowledge if necessary
    if response_context:
        chat_memory.put({"query": query, "response": response_context})
        generated_response = llm.complete(response_context + "\n\nBased on this, answer the user's query: " + query)
        print(generated_response.text)
    else:
        print("AI: I cannot answer this question as it is outside the scope of the provided documents.")
    
    print("-" * 50)

# 11. Interactive Session
if __name__ == "__main__":
    print("Document Q&A System Initialized. Type 'exit' to quit.")
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("Goodbye!")
            break
        chat_with_proof(user_query)


In [1]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor, TitleExtractor, KeywordExtractor
import chromadb
import os
import re
import uuid

# 1. Initialize AI Model & Embeddings
llm = Ollama(model="llama3.2")
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# 2. Set Global AI Settings
Settings.llm = llm
Settings.embed_model = embed_model

# 3. Initialize ChromaDB Client
chroma_client = chromadb.PersistentClient(path="./chroma_db_IonIdea")
collection_name = "document_knowledge_base"
collection = chroma_client.get_or_create_collection(collection_name)

# 4. Initialize Chat Memory with Token Limit
chat_memory = ChatMemoryBuffer(token_limit=2048)

# 5. Load and Process Documents (Ensuring Proper Mapping)
if collection.count() == 0:
    print("No existing collection found. Extracting knowledge...")
    
    documents = SimpleDirectoryReader(
        input_dir="data", 
        required_exts=[".pdf"],
        filename_as_id=True,
        recursive=True
    ).load_data()
    
    print(f"Loaded {len(documents)} documents")
    
    # 6. Extract structured knowledge (Summaries + Q&A + Keywords)
    node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200, separator=" ", paragraph_separator="\n\n")
    extractors = [
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        SummaryExtractor(summaries=["self"]),
        KeywordExtractor(),
    ]
    
    index = VectorStoreIndex.from_documents(documents, node_parser=node_parser, extractors=extractors, show_progress=True)
    
    # Store parsed knowledge into ChromaDB
    for doc in documents:
        metadata = doc.metadata
        filename = metadata.get("file_name", "Unknown File")
        page = metadata.get("page_label", "Unknown Page")
        content = doc.text
        doc_id = str(uuid.uuid4())  # Generate unique ID
        
        collection.add(
            ids=[doc_id],
            documents=[content],
            metadatas=[{"file_name": filename, "page_label": page}]
        )
        print(f"Stored: {filename} - Page {page}")
else:
    print("Existing collection found. Skipping extraction.")

# 7. System Prompt to Maintain Context-Only Responses
system_prompt = """
You are an AI assistant designed to answer document-related queries. Your rules:
1. Casual conversations (e.g., greetings) are allowed.
2. You strictly refuse to answer any question that is outside the provided documents.
3. When retrieving information, cite sources correctly with [Source: {file_name}, Page {page_label}].
4. If the query has no relevant document, politely decline to answer.
5. Generate responses based only on retrieved knowledge.
"""

# 8. Function for retrieving knowledge from ChromaDB
def retrieve_knowledge(query):
    results = collection.query(query_texts=[query], n_results=3)
    
    sources = []
    for doc, meta_list in zip(results["documents"], results["metadatas"]):
        if not meta_list:
            continue
        meta = meta_list[0]  # Extract first metadata dictionary
        filename = meta.get("file_name", "Unknown File")
        page = meta.get("page_label", "Unknown Page")
        sources.append((filename, page, doc[0]))  # doc[0] as documents are lists
    
    return sources

# 9. Function to clean retrieved text
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

# 10. Chat Function with Proper Citations
def chat_with_proof(query):
    print(f"\nUser: {query}")
    sources = retrieve_knowledge(query)
    
    if not sources:
        print("AI: I'm sorry, but I can only answer questions related to the provided documents.")
        return
    
    print("\nAI Response:")
    response_context = ""
    for filename, page, excerpt in sources:
        excerpt = clean_text(excerpt[:300])
        response_context += f"{excerpt}\n"
    
    if response_context:
        chat_memory.put({"query": query, "response": response_context})
        generated_response = llm.complete(response_context + "\n\nBased on this, answer the user's query: " + query)
        print(f"{generated_response.text} [Source: {filename}, Page {page}]")
    else:
        print("AI: I cannot answer this question as it is outside the scope of the provided documents.")
    
    print("-" * 50)

# 11. Interactive Session
if __name__ == "__main__":
    print("Document Q&A System Initialized. Type 'exit' to quit.")
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("Goodbye!")
            break
        chat_with_proof(user_query)


No existing collection found. Extracting knowledge...
Loaded 6 documents


Parsing nodes:   0%|          | 0/6 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/10 [00:00<?, ?it/s]

Stored: Leading the Charge The Future of Healthcare Software Solutions.pdf - Page 1
Stored: Leading the Charge The Future of Healthcare Software Solutions.pdf - Page 2
Stored: Leading the Charge The Future of Healthcare Software Solutions.pdf - Page 3
Stored: Navigating the Future Pioneering Trends in Manufacturing Software Development.pdf - Page 1
Stored: Navigating the Future Pioneering Trends in Manufacturing Software Development.pdf - Page 2
Stored: Navigating the Future Pioneering Trends in Manufacturing Software Development.pdf - Page 3
Document Q&A System Initialized. Type 'exit' to quit.

User: hello

AI Response:
Hello! It seems like you're interested in learning more about IonIde or HealthCareSoft. However, without more context, it's difficult to provide specific information.

Healthcare software can encompass a broad range of tools and technologies designed for healthcare providers, patients, and other stakeholders. These systems are often used for managing medical records, 

In [2]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor, TitleExtractor, KeywordExtractor
import chromadb
import os
import re
import uuid

# 1. Initialize AI Model & Embeddings
llm = Ollama(model="llama3.2")
embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# 2. Set Global AI Settings
Settings.llm = llm
Settings.embed_model = embed_model

# 3. Initialize ChromaDB Client
chroma_client = chromadb.PersistentClient(path="./chroma_db_IonIdea")
collection_name = "document_knowledge_base"
collection = chroma_client.get_or_create_collection(collection_name)

# 4. Initialize Chat Memory with Token Limit
chat_memory = ChatMemoryBuffer(token_limit=2048)

# 5. Load and Process Documents (Ensuring Proper Mapping)
if collection.count() == 0:
    print("No existing collection found. Extracting knowledge...")
    
    documents = SimpleDirectoryReader(
        input_dir="data", 
        required_exts=[".pdf"],
        filename_as_id=True,
        recursive=True
    ).load_data()
    
    print(f"Loaded {len(documents)} documents")
    
    # 6. Extract structured knowledge (Summaries + Q&A + Keywords)
    node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=200, separator=" ", paragraph_separator="\n\n")
    extractors = [
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        SummaryExtractor(summaries=["self"]),
        KeywordExtractor(),
    ]
    
    index = VectorStoreIndex.from_documents(documents, node_parser=node_parser, extractors=extractors, show_progress=True)
    
    # Store parsed knowledge into ChromaDB
    for doc in documents:
        metadata = doc.metadata
        filename = metadata.get("file_name", "Unknown File")
        page = metadata.get("page_label", "Unknown Page")
        content = doc.text
        doc_id = str(uuid.uuid4())  # Generate unique ID
        
        collection.add(
            ids=[doc_id],
            documents=[content],
            metadatas=[{"file_name": filename, "page_label": page}]
        )
        print(f"Stored: {filename} - Page {page}")
else:
    print("Existing collection found. Skipping extraction.")

# 7. System Prompt for Query Expansion
query_expansion_prompt = """
You are an AI assistant that improves user queries for better document retrieval.
Given a user query, generate 3 similar but varied questions to enhance understanding and correct errors.
Examples:
User: "how does AI impact healthcare?"
Generated:
1. "What are the effects of AI on the healthcare industry?"
2. "How is artificial intelligence used in modern medicine?"
3. "What are the benefits and challenges of AI in healthcare?"
"""

# 8. Function for generating query variations
def generate_query_variations(query):
    prompt = query_expansion_prompt + f"\nUser: \"{query}\"\nGenerated:"
    response = llm.complete(prompt)
    return response.text.strip().split("\n")[:3]  # Return top 3 variations

# 9. Function for retrieving knowledge from ChromaDB
def retrieve_knowledge(queries):
    results = []
    for query in queries:
        query_results = collection.query(query_texts=[query], n_results=3)
        for doc, meta_list in zip(query_results["documents"], query_results["metadatas"]):
            if not meta_list:
                continue
            meta = meta_list[0]  # Extract first metadata dictionary
            filename = meta.get("file_name", "Unknown File")
            page = meta.get("page_label", "Unknown Page")
            results.append((filename, page, doc[0]))  # doc[0] as documents are lists
    return results

# 10. Function to clean retrieved text
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

# 11. Chat Function with Query Expansion & Proper Citations
def chat_with_proof(query):
    print(f"\nUser: {query}")
    query_variations = generate_query_variations(query)
    print(f"Generated Query Variations: {query_variations}")
    
    sources = retrieve_knowledge(query_variations)
    
    if not sources:
        print("AI: I'm sorry, but I can only answer questions related to the provided documents.")
        return
    
    print("\nAI Response:")
    response_context = ""
    for filename, page, excerpt in sources:
        excerpt = clean_text(excerpt[:300])
        response_context += f"{excerpt}\n"
    
    if response_context:
        chat_memory.put({"query": query, "response": response_context})
        generated_response = llm.complete(response_context + "\n\nBased on this, answer the user's query: " + query)
        print(f"{generated_response.text} [Source: {filename}, Page {page}]")
    else:
        print("AI: I cannot answer this question as it is outside the scope of the provided documents.")
    
    print("-" * 50)

# 12. Interactive Session
if __name__ == "__main__":
    print("Document Q&A System Initialized. Type 'exit' to quit.")
    while True:
        user_query = input("\nYour question: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("Goodbye!")
            break
        chat_with_proof(user_query)


Existing collection found. Skipping extraction.
Document Q&A System Initialized. Type 'exit' to quit.

User: what is ionidea?
Generated Query Variations: ["Since you didn't provide a specific query, I'll generate some general similar questions to improve document retrieval:", '', '1. "What is Ionide\'s business model?"']

AI Response:
IonIdea appears to be a software company that specializes in developing innovative solutions for the manufacturing industry, particularly in the context of Industry 4.0 (Internet of Things, or IoT). They offer software products and services that enable real-time communication between machines, tools, and components on factory floors, as well as addressing key challenges faced by the industry.

Based on the provided text, it seems that IonIdea is a technology company focused on providing digital solutions for the manufacturing sector. [Source: Leading the Charge The Future of Healthcare Software Solutions.pdf, Page 3]
--------------------------------------