In [4]:
import getpass
import os

# Prompt for the OpenAI key interactively only when the environment does not
# already hold a non-empty value; getpass keeps the typed key off the screen.
if os.environ.get("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [None]:
# Notebook-wide settings. None of these names is referenced by the cells
# visible in this part of the notebook -- presumably later cells use them.
MODEL_PROVIDER: str = "openai"     # identifier of the model provider
MODEL_NAME: str = "gpt-4o-mini"    # name of the model to use
APP_VERSION: float = 1.0           # application version, stored as a float

In [None]:
import os
import shutil

from unstructured.partition.pdf import partition_pdf
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain.docstore.document import Document


def _list_pdf_files(directory):
    """Return the sorted paths of the regular ``.pdf`` files directly inside *directory*."""
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    # isfile() keeps sub-directories that merely end in ".pdf" away from the parser.
    return [path for path in candidates if path.lower().endswith('.pdf') and os.path.isfile(path)]


def get_documents_from_source(source_path):
    """
    Extracts text and metadata from a source path using the 'unstructured'
    library. It can handle a single PDF file or a directory of PDFs.

    Every non-empty element returned by ``partition_pdf`` becomes one Document
    whose metadata holds ``book_title`` (the directory name, or
    "Single Document" for a lone file), ``chapter_file`` (the PDF file name),
    ``heading_context`` (text of the most recent Title/Header/SubTitle element
    in that file, "Introduction" before the first one) and ``element_type``
    (the element's category). Progress and problems are reported with print().

    Args:
        source_path (str): The path to the PDF file or directory.

    Returns:
        list: A list of LangChain Document objects with rich metadata. Empty
        when the path does not exist, is neither a directory nor a ``.pdf``
        file, contains no PDFs, or yields no text. A file that fails to parse
        is reported and skipped; the remaining files are still processed.
    """
    if not os.path.exists(source_path):
        print(f"🛑 Error: Source path not found at {source_path}")
        return []

    # Determine which PDF files to process
    book_title = "Single Document"
    if os.path.isdir(source_path):
        # abspath() normalises the path first: a trailing separator (or ".")
        # would otherwise make basename() return '' (or '.') as the title.
        book_title = os.path.basename(os.path.abspath(source_path))
        print(f"✅ Processing book directory: '{book_title}'")
        pdf_files_to_process = _list_pdf_files(source_path)
    elif os.path.isfile(source_path) and source_path.lower().endswith('.pdf'):
        print(f"✅ Processing single PDF file: {os.path.basename(source_path)}")
        pdf_files_to_process = [source_path]
    else:
        print(f"🛑 Error: Path '{source_path}' is not a valid PDF file or directory.")
        return []

    if not pdf_files_to_process:
        print(f"🛑 Warning: No PDF files found to process in {source_path}")
        return []

    all_docs = []

    # Process each PDF file with 'unstructured'
    for pdf_path in pdf_files_to_process:
        chapter_file = os.path.basename(pdf_path)
        chapter_title = os.path.splitext(chapter_file)[0]
        print(f"  📖 Processing Chapter: '{chapter_title}' with unstructured...")

        try:
            elements = partition_pdf(
                filename=pdf_path,
                # 'fast' is a good balance of speed and accuracy.
                # Use 'hi_res' for more complex documents, which may require tesseract.
                strategy="fast"
            )
        except Exception as e:
            # Best effort per file: one unreadable PDF must not abort the whole book.
            print(f"🛑 Error processing file {pdf_path} with unstructured: {e}")
            continue

        current_heading = "Introduction"  # Default for text before the first header
        for el in elements:
            stripped_text = (el.text or "").strip()
            if not stripped_text:
                # An element without text has nothing to embed and must not
                # blank out the running heading.
                continue

            # Title-like elements become the heading context for what follows.
            if el.category in ("Title", "Header", "SubTitle"):
                current_heading = stripped_text

            all_docs.append(
                Document(
                    page_content=el.text,
                    metadata={
                        "book_title": book_title,
                        "chapter_file": chapter_file,
                        "heading_context": current_heading,
                        "element_type": el.category,
                    },
                )
            )

    if not all_docs:
        print("🛑 Warning: No text could be extracted to create documents.")
    else:
        print(f"✅ Source processing complete. Total documents created: {len(all_docs)}")

    return all_docs


# --- FAISS Vector Store Functions ---

def create_or_load_faiss_store(documents, embeddings, index_path="faiss_index"):
    """
    Load the FAISS index saved under ``index_path``, or build and save a new one.

    Args:
        documents (list): Documents to index when no saved index exists. They
            are ignored when an index is loaded from disk.
        embeddings: Embedding model handed to FAISS for loading/building.
        index_path (str): Folder used by ``save_local`` / ``load_local``.

    Returns:
        The FAISS vector store, or None when there is no saved index and no
        documents were supplied to build one.
    """
    # save_local() below uses the default index name, which stores the index
    # as "index.faiss" inside the folder. Checking for that file (rather than
    # for the folder alone) lets an empty or half-written folder fall through
    # to a rebuild instead of failing inside load_local().
    saved_index_file = os.path.join(index_path, "index.faiss")
    if os.path.isfile(saved_index_file):
        print(f"✅ Loading existing FAISS index from: {index_path}")
        # NOTE(security): allow_dangerous_deserialization=True lets load_local
        # unpickle data from disk. Only point index_path at folders this code
        # wrote itself; never at an index obtained from an untrusted source.
        return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

    print(f"ℹ️ No FAISS index found. Creating a new one at: {index_path}")
    if not documents:
        print("🛑 Error: No documents provided to create a new FAISS store.")
        return None

    vector_store = FAISS.from_documents(documents, embeddings)
    vector_store.save_local(index_path)
    print("✅ New FAISS index created and saved.")
    return vector_store

# --- ChromaDB Vector Store Functions ---
def create_or_load_chroma_store(documents, embeddings, persist_directory="chroma_db"):
    """
    Open the ChromaDB persisted in ``persist_directory``, or create a new one.

    Args:
        documents (list): Documents to index when no persisted DB exists. They
            are ignored when an existing DB is opened.
        embeddings: Embedding model used by the store.
        persist_directory (str): Folder where ChromaDB keeps its files.

    Returns:
        The Chroma vector store, or None when there is no persisted DB and no
        documents were supplied to create one.
    """
    # Only a non-empty directory counts as an existing DB. Opening an empty
    # folder would yield an empty store and the supplied documents would
    # never be indexed.
    has_persisted_db = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))
    if has_persisted_db:
        print(f"✅ Loading existing ChromaDB from: {persist_directory}")
        return Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    print(f"ℹ️ No ChromaDB found. Creating a new one at: {persist_directory}")
    if not documents:
        print("🛑 Error: No documents provided to create a new ChromaDB store.")
        return None

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("✅ New ChromaDB created and persisted.")
    return vector_store


# --- Retrieval Function ---

def perform_similarity_search(vector_store, query, k=3):
    """
    Run a similarity search and print a short report of the hits.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)``,
            or None when no store could be created.
        query (str): The search text.
        k (int): Maximum number of results to request. Defaults to 3.

    Returns:
        list: The documents returned by the store, or an empty list when the
        store is None or nothing matched.
    """
    # Compare against None explicitly: a store class that defines __len__
    # would be falsy while empty and be misreported as "not available".
    if vector_store is None:
        print("🛑 Cannot perform search: Vector store is not available.")
        return []

    print(f"\n🔍 Performing similarity search for query: '{query}'")
    results = vector_store.similarity_search(query, k=k)
    if not results:
        print("   -> No results found.")
        return []

    for rank, doc in enumerate(results, start=1):
        # Collapse all whitespace runs so the snippet prints on a single line.
        flattened = " ".join(doc.page_content.split())
        # Add the ellipsis only when text was actually cut off.
        content_snippet = flattened[:250] + "..." if len(flattened) > 250 else flattened
        print(f"   📄 Result {rank}: \"{content_snippet}\"")
        if doc.metadata:
            book = doc.metadata.get('book_title', 'N/A')
            chapter = doc.metadata.get('chapter_file', 'N/A')
            heading = doc.metadata.get('heading_context', 'N/A')
            el_type = doc.metadata.get('element_type', 'N/A')
            print(f"      ▶️  Metadata: [Book: {book}] [Chapter File: {chapter}] [Heading: {heading}] [Type: {el_type}]")

    return results


if __name__ == '__main__':
    # Demo workflow: parse a directory of PDFs, build (or load) a FAISS index
    # from the extracted elements, then run two sample queries against it.

    # Read the key once. "YOUR_API_KEY" is treated as an unfilled placeholder.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key or api_key == "YOUR_API_KEY":
        # Stop here: without a usable key the embedding calls below cannot
        # succeed, so carrying on would only end in a less readable failure.
        print("🛑 WARNING: Please set your OPENAI_API_KEY environment variable.")
    else:
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

        print("\n" + "="*50)
        print("🚀 STARTING PDF BOOK WORKFLOW with UNSTRUCTURED")
        print("="*50)

        # This path should point to your directory of PDFs
        book_source_path = r"/home/manoj/Project/RAG/kehs1dd"

        # Dummy directory setup for first-time run
        if not os.path.exists(book_source_path):
            print(f"ℹ️ Test directory not found. Creating '{book_source_path}' for demonstration.")
            os.makedirs(book_source_path)
            # You MUST replace these with real PDFs for unstructured to work.
            print("🛑 Note: The created directory is empty. You MUST add real PDF files to it for processing.")

        book_faiss_path = "faiss_index_from_unstructured"

        # Extract one Document per PDF element.
        book_docs = get_documents_from_source(book_source_path)

        if book_docs:
            faiss_store_from_book = create_or_load_faiss_store(book_docs, embedding_model, book_faiss_path)
            for sample_query in ("What is the main theme?", "Find a character description."):
                perform_similarity_search(faiss_store_from_book, sample_query)
        else:
            print("\nSkipping vector store creation because no documents were extracted.")


  from .autonotebook import tqdm as notebook_tqdm



🚀 STARTING PDF BOOK WORKFLOW with UNSTRUCTURED
✅ Processing book directory: 'kehs1dd'
  📖 Processing Chapter: 'kehs101' with unstructured...
  📖 Processing Chapter: 'kehs102' with unstructured...
  📖 Processing Chapter: 'kehs103' with unstructured...
  📖 Processing Chapter: 'kehs104' with unstructured...
  📖 Processing Chapter: 'kehs105' with unstructured...
  📖 Processing Chapter: 'kehs106' with unstructured...
  📖 Processing Chapter: 'kehs107' with unstructured...
  📖 Processing Chapter: 'kehs1ps' with unstructured...
✅ Source processing complete. Total documents created: 136
ℹ️ No FAISS index found. Creating a new one at: faiss_index_from_unstructured
✅ New FAISS index created and saved.

🔍 Performing similarity search for query: 'What is the main theme?'
   📄 Result 1: "Second: when you read about the making of states and empires in Sec- tion II, you will see that the drama unfolds not only in Rome (Theme 2), that is in Europe, but in the Central Islamic states (Theme 4), and the 