In [None]:
import os
import sys
import glob

sys.path.append(os.path.abspath(".."))

from src.ingestion import ingest_document_set
from src.chunking import chunk_text, chunk_table, chunk_image_text
from src.hybrid_retrieval import load_or_build_indexes, hybrid_retrieve
from src.agent import run_agent
from src.indexing import initialize_dense_index, build_dense_index, initialize_bm25


# Folder holding the input files; the relative path is resolved against the
# current working directory.
INPUT_DIR = "../Documents"

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# each result keeps the ingestion order (and therefore the chunk order) the
# same from run to run and from machine to machine.
pdf_paths = sorted(glob.glob(f"{INPUT_DIR}/*.pdf"))
# NOTE: "*.*" selects on the presence of a dot in the file name, not on the
# file actually being an image.
image_paths = sorted(glob.glob(f"{INPUT_DIR}/img/*.*"))
# CSV files stay ahead of XLSX files, as before; each group is sorted.
table_paths = (
    sorted(glob.glob(f"{INPUT_DIR}/*.csv"))
    + sorted(glob.glob(f"{INPUT_DIR}/*.xlsx"))
)

print("PDFs found:", pdf_paths)
print("Images found:", image_paths)
print("Tables found:", table_paths)


# Hand the three discovered path lists to the ingestion step.
docs = ingest_document_set(pdf_paths=pdf_paths, image_paths=image_paths, table_paths=table_paths)

# Report how many documents came back and show the first one as a sample.
print(f"Total documents ingested: {len(docs)}")
if docs:
    print(docs[0])
else:
    print("No documents found.")

# Collect chunks from every document into one flat list. Each document is
# read through its "text", "source", "tables" and "images_ocr" entries, and
# every chunker call is given the document's source.
all_chunks = []
for doc in docs:
    body, origin = doc["text"], doc["source"]

    # Main text first, then each table, then each OCR'd image text.
    all_chunks += chunk_text(body, origin)
    for tbl in doc["tables"]:
        all_chunks += chunk_table(tbl, origin)
    for ocr_text in doc["images_ocr"]:
        all_chunks += chunk_image_text(ocr_text, origin)

print(f"Total chunks created: {len(all_chunks)}")

# Set up both retrieval indexes over the same chunk list: the dense index
# named "enterprise_dense" is initialized and then built from the chunks,
# followed by the BM25 index.
dense_db = build_dense_index(
    initialize_dense_index("enterprise_dense"),
    all_chunks,
)
bm25 = initialize_bm25(all_chunks)

print("Hybrid indexes ready.")

# Sample questions to run against the indexed documents.
queries = [
    "Summarize key product specifications.",
    "List numerical data in the report.",
    "Identify risks highlighted in the documents.",
    "Provide overall insights.",
]

# Horizontal rule printed before and after each answer.
rule = "=" * 60

# Run the agent once per question, passing the chunks and both indexes and
# asking for k=5, then print the result between two rules.
for query in queries:
    print(rule)
    print(f"Query: {query}\n")
    answer = run_agent(query=query, chunks=all_chunks, dense_db=dense_db, bm25=bm25, k=5)
    print(answer)
    print(rule, "\n")