# Import modules

In [None]:
import os
import getpass
from typing import List
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings

# File-type handlers
import fitz  # PyMuPDF
from docx import Document as DocxDocument
from langchain.docstore.document import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS



In [None]:
if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ") 

In [6]:
# Initialize the chat model
# Note: Ensure you have the correct model name and provider
llm = init_chat_model("llama3-8b-8192", model_provider="groq")

In [17]:
# Supported extensions
SUPPORTED_EXTENSIONS = {'.pdf', '.docx', '.txt'}

In [16]:
def load_pdf(path: str) -> List[Document]:
    doc = fitz.open(path)
    texts = [page.get_text() for page in doc]
    return [Document(page_content=text, metadata={"source": path}) for text in texts]

def load_docx(path: str) -> List[Document]:
    doc = DocxDocument(path)
    full_text = "\n".join(p.text for p in doc.paragraphs)
    return [Document(page_content=full_text, metadata={"source": path})]

def load_txt(path: str) -> List[Document]:
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    return [Document(page_content=text, metadata={"source": path})]

def load_file(path: str) -> List[Document]:
    ext = os.path.splitext(path)[-1].lower()
    if ext == '.pdf':
        return load_pdf(path)
    elif ext == '.docx':
        return load_docx(path)
    elif ext == '.txt':
        return load_txt(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

In [18]:

def load_documents_from_folder(folder_path: str) -> List[Document]:
    all_docs = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            ext = os.path.splitext(file)[-1].lower()
            if ext in SUPPORTED_EXTENSIONS:
                full_path = os.path.join(root, file)
                try:
                    docs = load_file(full_path)
                    all_docs.extend(docs)
                except Exception as e:
                    print(f"Failed to load {full_path}: {e}")
    return all_docs

In [23]:
# import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain.chains import RetrievalQA
import gradio as gr
import tempfile

In [None]:

# --- RAG Build ---
def build_vectorstore(documents: List[Document]):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    return FAISS.from_documents(chunks, embeddings)

In [22]:
def query_rag(db, query: str) -> str:
    retriever = db.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa_chain.run(query)

In [None]:
# --- Gradio Handler ---
vectorstore = None

def handle_file_upload(files):
    global vectorstore
    temp_paths = []
    for file in files:
        temp = tempfile.NamedTemporaryFile(delete=False)
        temp.write(file.read())
        temp.close()
        temp_paths.append(temp.name)
    docs = load_documents_from_folder(temp_paths)
    if not docs:
        return "No valid documents uploaded."
    vectorstore = build_vectorstore(docs)
    return "Documents loaded and indexed. Ask your question."


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\faiza\anaconda3\envs\ai_env\Lib\site-packages\gradio\queueing.py", line 626, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "c:\Users\faiza\anaconda3\envs\ai_env\Lib\site-packages\gradio\route_utils.py", line 350, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "c:\Users\faiza\anaconda3\envs\ai_env\Lib\site-packages\gradio\blocks.py", line 2240, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "c:\Users\faiza\anaconda3\envs\ai_env\Lib\site-packages\gradio\blocks.py", line 1747, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        fn, *

In [None]:

def handle_query(question):
    global vectorstore
    if vectorstore is None:
        return "Please upload documents first."
    return query_rag(vectorstore, question)



In [None]:
# --- Gradio UI ---
# with gr.Blocks() as demo:
#     gr.Markdown("## 🔍 RAG File QA (PDF/DOCX/TXT)")

#     file_input = gr.File(file_types=['.pdf', '.docx', '.txt'], file_count="multiple", label="Upload Files")
#     upload_btn = gr.Button("Load Documents")

#     status = gr.Textbox(label="Status")
#     upload_btn.click(fn=handle_file_upload, inputs=[file_input], outputs=[status])

#     question = gr.Textbox(label="Ask a Question")
#     answer = gr.Textbox(label="Answer", lines=6)
#     ask_btn = gr.Button("Get Answer")

#     ask_btn.click(fn=handle_query, inputs=[question], outputs=[answer])

# demo.launch()