 # Install Required Libraries

In [23]:
!pip install -q langchain langchain-community pymupdf
!pip install -q sentence-transformers faiss-cpu
!pip install -q langchain-community google-generativeai
!pip install --upgrade langchain langchain-community




# Save Uploaded PDF Files

In [24]:
def save_uploaded_files_to_document(uploaded_files: List[str], save_dir: str = "Document"):
    """Copy uploaded PDF files into ``save_dir`` and return how many were copied.

    Each item may be a plain path string, an ``os.PathLike`` object, or an
    object whose ``name`` attribute holds the path of the file on disk.
    Items whose file name does not end in ``.pdf`` (case-insensitive) are
    skipped.

    Args:
        uploaded_files: Paths or file-like objects to copy.
        save_dir: Destination directory; created if it does not exist.

    Returns:
        The number of PDF files copied.
    """
    os.makedirs(save_dir, exist_ok=True)
    saved_count = 0

    for file in uploaded_files:
        # Resolve the source path once. Path-likes are checked first because
        # pathlib.Path.name is only the final path component, which would
        # not locate a file that lives in another directory.
        if isinstance(file, os.PathLike):
            source_path = os.fspath(file)
        elif hasattr(file, "name"):
            source_path = file.name
        else:
            source_path = file

        filename = os.path.basename(source_path)
        if not filename.lower().endswith(".pdf"):
            continue

        shutil.copy(source_path, os.path.join(save_dir, filename))
        saved_count += 1

    return saved_count


# Load PDFs as LangChain Documents

In [25]:
def load_documents_from_folder(folder_path: str = "Document"):
    """Load every PDF in ``folder_path`` and tag the results with metadata.

    Each document returned by ``PyMuPDFLoader.load()`` gets two metadata
    entries: ``doc_number`` (``doc_1``, ``doc_2``, ... in sorted file-name
    order) and ``filename``.

    Args:
        folder_path: Directory to scan. Only entries whose name ends in
            ``.pdf`` (case-insensitive) are loaded.

    Returns:
        A flat list of all loaded documents.
    """
    # Filter before numbering so doc numbers are contiguous and the summary
    # counts PDFs only; sort because os.listdir order is arbitrary and the
    # numbering should be reproducible between runs.
    pdf_filenames = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    all_documents = []
    for doc_index, filename in enumerate(pdf_filenames, start=1):
        loader = PyMuPDFLoader(os.path.join(folder_path, filename))
        docs = loader.load()

        for doc in docs:
            doc.metadata["doc_number"] = f"doc_{doc_index}"
            doc.metadata["filename"] = filename

        all_documents.extend(docs)

    print(f"Loaded {len(all_documents)} pages from {len(pdf_filenames)} PDFs.")
    return all_documents


In [26]:
def split_documents_and_save_to_csv(documents: List[Document], csv_path: str = "parsed_documents.csv"):
    """Split documents into text chunks and write one CSV row per chunk.

    Each document's stripped ``page_content`` is split with a
    ``RecursiveCharacterTextSplitter`` (chunk size 500, overlap 100). Every
    chunk becomes a row with these columns:

    * ``doc_id``   - ``metadata["doc_number"]`` or ``"unknown_doc"``
    * ``page_no``  - ``metadata["page"]`` or ``-1``
    * ``pehra_no`` - 1-based position of the chunk within its document
    * ``text``     - the chunk itself

    Args:
        documents: Documents to split.
        csv_path: Where the CSV is written (without the DataFrame index).

    Returns:
        The DataFrame that was written to ``csv_path``.
    """
    columns = ["doc_id", "page_no", "pehra_no", "text"]

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],
        chunk_size=500,
        chunk_overlap=100
    )

    data_records = []
    for doc in documents:
        doc_id = doc.metadata.get("doc_number", "unknown_doc")
        page_no = doc.metadata.get("page", -1)
        chunks = text_splitter.split_text(doc.page_content.strip())

        data_records.extend(
            {
                "doc_id": doc_id,
                "page_no": page_no,
                "pehra_no": pehra_no,
                "text": chunk,
            }
            for pehra_no, chunk in enumerate(chunks, start=1)
        )

    # Pin the columns so that an empty result still writes a header row;
    # a column-less CSV cannot be parsed back by pd.read_csv.
    df = pd.DataFrame(data_records, columns=columns)
    df.to_csv(csv_path, index=False)
    print(f"CSV created with {len(df)} rows. Saved as: {csv_path}")
    return df


In [27]:
# Generate text embeddings, build FAISS index and save metadata
def embed_and_index_documents(csv_path: str = "parsed_documents.csv",
                              model_name: str = "all-MiniLM-L6-v2",
                              index_path: str = "docs_faiss.index",
                              metadata_path: str = "docs_metadata.pkl"):
    """Embed the ``text`` column of a chunk CSV and build a FAISS index.

    The index is written to ``index_path`` and the CSV rows (as a list of
    dicts, in the same order as the indexed vectors) are pickled to
    ``metadata_path``.

    Args:
        csv_path: CSV with at least a ``text`` column.
        model_name: Name passed to ``SentenceTransformer``.
        index_path: Output path for the FAISS index.
        metadata_path: Output path for the pickled row metadata.

    Returns:
        A ``(index, metadata)`` tuple.

    Raises:
        ValueError: If the CSV contains no rows, so there is nothing to
            index. (``pandas.errors.EmptyDataError``, raised by
            ``read_csv`` for a file with no header, is also a
            ``ValueError``.)
    """
    # Read text verbatim: with pandas' default NA handling a chunk that is
    # literally "NA", "null", "nan", ... becomes a float NaN, which the
    # encoder cannot tokenize.
    df = pd.read_csv(csv_path, dtype={"text": str}, keep_default_na=False)

    # Fail early with a clear message (and before loading the model) rather
    # than with an IndexError on embeddings.shape[1] further down.
    if df.empty:
        raise ValueError(f"No text chunks found in {csv_path!r}; nothing to index.")

    # NOTE(review): this model instance is local and is not returned; any
    # code that embeds queries must use the same model_name - confirm.
    model = SentenceTransformer(model_name)
    embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)
    embeddings = np.array(embeddings).astype("float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, index_path)

    metadata = df.to_dict(orient="records")
    with open(metadata_path, "wb") as f:
        pickle.dump(metadata, f)

    print(f"Saved: FAISS ➝ `{index_path}`, Metadata ➝ `{metadata_path}`")
    return index, metadata


In [28]:
from typing import List, Dict

# Function to search top matching document chunks using FAISS
def search_docs(query: str, top_k: int = 5):
    """Return the indexed chunks closest to ``query``.

    Relies on the module-level ``model``, ``index`` and ``metadata`` objects
    being set up before the call.

    Args:
        query: Free-text query. Blank or whitespace-only queries yield ``[]``.
        top_k: Maximum number of hits to return. Non-positive values yield
            ``[]``.

    Returns:
        A list of dicts with keys ``doc_id``, ``page_no``, ``pehra_no``,
        ``text`` and ``similarity_score`` (``1 / (1 + distance)`` rounded to
        four decimals), in the order returned by ``index.search``.
    """
    if not query.strip() or top_k <= 0:
        return []

    query_embedding = model.encode([query]).astype("float32")
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads the result with -1 when the index holds fewer than
        # top_k vectors. A negative value must be rejected explicitly,
        # otherwise metadata[-1] silently returns the last chunk.
        if not 0 <= idx < len(metadata):
            continue

        record = metadata[idx]
        similarity = 1 / (1 + dist)
        results.append({
            "doc_id": record["doc_id"],
            "page_no": record["page_no"],
            "pehra_no": record["pehra_no"],
            "text": record["text"],
            # Plain float rather than a NumPy scalar so the result is
            # directly serializable.
            "similarity_score": round(float(similarity), 4)
        })

    return results


In [29]:
# Prompt Template to form a structured QA input for LLM
def generate_prompt_from_query(query: str, top_k: int = 5):
    """Build the question-answering prompt for ``query``.

    Retrieves up to ``top_k`` chunks via ``search_docs``, appends a source
    citation to each one and joins them into the context that is passed to
    ``qa_prompt``. When nothing is retrieved, a fallback prompt stating
    that no relevant documents were found is returned instead.

    Args:
        query: The user's question.
        top_k: Number of chunks to request from ``search_docs``.

    Returns:
        The formatted prompt.
    """
    hits = search_docs(query, top_k=top_k)

    if hits:
        context = "\n\n".join(
            f"{hit['text']} (Source: doc_id: {hit['doc_id']}, page: {hit['page_no']}, pehra: {hit['pehra_no']})"
            for hit in hits
        )
        return qa_prompt.format(context=context, question=query)

    return f"Context:\n\n(No relevant documents found)\n\nQuestion:\n{query}\nAnswer:\n"


# Use the Gemini LLM to get the final answer from the prompt

In [30]:
def get_answer_from_llm(prompt: str):
    """Run ``prompt`` through the ``llm | parser`` chain and return the result.

    Uses the module-level ``llm`` and ``parser`` objects. Any exception
    raised while building or invoking the chain is caught and reported in
    the returned string instead of propagating.

    Args:
        prompt: The fully formatted prompt.

    Returns:
        The chain's response, or ``"Error generating response: ..."`` on
        failure.
    """
    try:
        chain = llm | parser
        response = chain.invoke(prompt)
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


In [31]:
def generate_themes_from_query(query: str, top_k: int = 5):
    """Ask the LLM to extract themes from the chunks retrieved for ``query``.

    Retrieves up to ``top_k`` chunks via ``search_docs``, formats each with
    its source citation, fills ``theme_prompt`` and invokes the module-level
    ``llm``. Any exception is caught and reported in the returned string.

    Args:
        query: The user's question.
        top_k: Number of chunks to request from ``search_docs``.

    Returns:
        The ``content`` of the LLM response, a notice when nothing was
        retrieved, or an error message on failure.
    """
    try:
        retrieved_results = search_docs(query, top_k=top_k)
        if not retrieved_results:
            return "No relevant context found for theme extraction."

        context_chunks = [
            f"Text: {r['text']}\nSource: doc_id: {r['doc_id']}, page: {r['page_no']}, pehra: {r['pehra_no']}"
            for r in retrieved_results
        ]
        context = "\n\n".join(context_chunks)
        final_prompt = theme_prompt.format(context=context, question=query)

        response = llm.invoke(final_prompt)
        return response.content

    except Exception as e:
        return f" Error generating themes: {str(e)}"


#  Gradio app for uploading documents and asking questions

In [32]:

from pathlib import Path

def handle_upload(files: List[Path]):
    """Replace the contents of ``Document`` with ``files`` and rebuild the index.

    Old entries in the ``Document`` folder are removed, the uploaded files
    are copied in, and the load -> split -> embed pipeline is re-run. The
    module-level ``index`` and ``metadata`` are rebound to the fresh
    results so that later searches see the new documents.

    Args:
        files: Uploaded files; each may be a path (``str`` /
            ``os.PathLike``) or an object whose ``name`` attribute is the
            path on disk. ``None`` or an empty list is a no-op.

    Returns:
        A status message.
    """
    global index, metadata

    # Nothing selected: leave the existing documents and index untouched.
    if not files:
        return "No files uploaded."

    document_dir = "Document"

    # Resolve every upload to one source path, used both for the destination
    # file name and for the copy itself.
    source_paths = [
        os.fspath(f) if isinstance(f, (str, os.PathLike)) else f.name
        for f in files
    ]

    # Clear old documents, but keep any file that is itself being uploaded
    # from inside the folder; wiping the folder wholesale would delete that
    # source before it could be copied.
    os.makedirs(document_dir, exist_ok=True)
    uploaded = {os.path.abspath(path) for path in source_paths}
    for entry in os.listdir(document_dir):
        stale_path = os.path.join(document_dir, entry)
        if os.path.abspath(stale_path) in uploaded:
            continue
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path)
        else:
            os.remove(stale_path)

    for source_path in source_paths:
        dest_path = os.path.join(document_dir, os.path.basename(source_path))
        # Skip files already in place; copying a file onto itself raises.
        if os.path.abspath(source_path) != os.path.abspath(dest_path):
            shutil.copy(source_path, dest_path)

    documents = load_documents_from_folder(document_dir)
    split_documents_and_save_to_csv(documents)
    # Keep the rebuilt index/metadata instead of discarding them; the search
    # code reads these module-level names.
    index, metadata = embed_and_index_documents()

    return f" Uploaded {len(files)} document(s), old documents removed. Ready for queries!"

def handle_query(query):
    """Answer ``query`` and return everything the UI displays for it.

    Args:
        query: The user's question.

    Returns:
        A 3-tuple ``(retrieved_texts, answer, themes)``: the top 5 retrieved
        chunks with their sources and similarity scores, the LLM's answer,
        and the themes extracted from the top 15 chunks. For a blank query
        the tuple is ``("Please enter a query.", "", "")``.
    """
    if not query.strip():
        return "Please enter a query.", "", ""

    # Retrieved chunks, shown to the user alongside the answer.
    formatted_hits = []
    for hit in search_docs(query, top_k=5):
        formatted_hits.append(
            f"{hit['text']}\n(Source: doc_id: {hit['doc_id']}, page: {hit['page_no']}, pehra: {hit['pehra_no']}, similarity: {hit['similarity_score']})"
        )
    retrieved_texts = "\n\n".join(formatted_hits)

    answer = get_answer_from_llm(generate_prompt_from_query(query, top_k=5))
    themes = generate_themes_from_query(query, top_k=15)

    return retrieved_texts, answer, themes
