#Install Dependencies

In [1]:
!pip install langchain langchain-community langchain-huggingface langchain-core chromadb pdfplumber pymupdf langchain_chroma gradio

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings

#IMPORT LIBRARIES

In [2]:
import os
from google.colab import userdata
import sys
import base64
import fitz  # PyMuPDF
import pdfplumber
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader, UnstructuredMarkdownLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

#Download Document


In [3]:
# Source document (Google Drive share link):
# https://drive.google.com/file/d/1IyqC-9QAzuTo5pYY7FuPcQQIk8H9a-eQ/view?usp=drive_link
os.makedirs("data", exist_ok=True)

# Google Drive file ID
file_id = "1IyqC-9QAzuTo5pYY7FuPcQQIk8H9a-eQ"

# Download from inside data/ so the file lands there, then step back to the parent directory.
# NOTE(review): recent gdown releases reportedly treat `--id` as deprecated — confirm before upgrading.
%cd data
!gdown --id {file_id}
%cd ..

/content/data
Downloading...
From: https://drive.google.com/uc?id=1IyqC-9QAzuTo5pYY7FuPcQQIk8H9a-eQ
To: /content/data/who_guideline.pdf
100% 4.72M/4.72M [00:00<00:00, 123MB/s]
/content


#HELPER FUNCTIONS

In [4]:
def load_parameters():
    """Collect the credentials and endpoint used for text generation.

    Returns:
        tuple: ``(HF_TOKEN, API_URL, GROQ_KEY)`` where ``HF_TOKEN`` is the value of
        the ``HUGGINGFACEHUB_API_TOKEN`` environment variable (``None`` when it is
        not set), ``API_URL`` is the Hugging Face inference URL built from
        ``MODEL_ID`` and ``GROQ_KEY`` is the ``GROQ_API_KEY`` Colab secret.
    """
    # Look the token up by name; the bare string "HUGGINGFACEHUB_API_TOKEN" is
    # only the variable's name and is useless as a credential.
    HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    GROQ_KEY = userdata.get("GROQ_API_KEY")
    MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
    API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

    return HF_TOKEN, API_URL, GROQ_KEY

In [5]:
def query_llm(prompt: str):
    """Send ``prompt`` to Groq's chat-completions endpoint and return the reply text.

    Args:
        prompt: The full user prompt (retrieved context plus question).

    Returns:
        str: Content of the first choice's message.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    # Only the Groq key is needed here; the Hugging Face values are ignored.
    _, _, GROQ_KEY = load_parameters()

    headers = {
        "Authorization": f"Bearer {GROQ_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "llama-3.3-70b-versatile",  # Llama 3.3 70B model id on Groq
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7,
        "max_tokens": 512
    }

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # requests waits forever by default; do not let the chat UI hang
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [6]:
def build_prompt(query: str, docs: list[Document]) -> str:
    """Assemble the LLM prompt from the retrieved documents and the question.

    The documents' ``page_content`` values are joined with blank lines under a
    ``[Context]`` header, followed by a ``[Question]`` section holding ``query``
    and a trailing, empty ``[Answer]`` header.
    """
    passages = (doc.page_content for doc in docs)
    context_block = "\n\n".join(passages)
    template = "[Context]\n{context}\n\n[Question]\n{query}\n\n[Answer]"
    return template.format(context=context_block, query=query)

In [7]:
def load_image_parameters():
    """Return the settings needed to upload and describe images.

    Both API keys are read from Colab secrets via ``userdata.get``.

    Returns:
        tuple: ``(GROQ_API_KEY, IMGBB_API_KEY, GROQ_ENDPOINT, MODEL)``.

    Raises:
        ValueError: If either key is empty.
    """
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
    IMGBB_API_KEY = userdata.get("IMGBB_API_KEY")
    GROQ_ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
    MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

    if not GROQ_API_KEY or not IMGBB_API_KEY:
        # The keys come from Colab secrets, so point the user there (not at a .env file).
        raise ValueError("Missing GROQ_API_KEY or IMGBB_API_KEY in Colab secrets")

    return GROQ_API_KEY, IMGBB_API_KEY, GROQ_ENDPOINT, MODEL

def upload_to_imgbb(image_path: str) -> str:
    """Upload a local image file to ImgBB and return its hosted URL.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        str: The ``data.url`` field of ImgBB's JSON response.

    Raises:
        RuntimeError: If ImgBB answers with a status other than 200.
        requests.Timeout: If the upload does not complete within the timeout.
    """
    _, IMGBB_API_KEY, _, _ = load_image_parameters()

    with open(image_path, "rb") as file:
        image_data = file.read()

    response = requests.post(
        "https://api.imgbb.com/1/upload",
        params={"key": IMGBB_API_KEY},
        files={"image": image_data},
        timeout=60,  # one stalled upload must not block the whole ingestion run
    )

    if response.status_code == 200:
        return response.json()['data']['url']
    else:
        raise RuntimeError(f"❌ ImgBB upload failed: {response.text}")

def descriptor(image_url: str, prompt: str = "What's in this image?") -> str:
    """Ask the Groq vision model to describe the image at ``image_url``.

    Args:
        image_url: Publicly reachable URL of the image.
        prompt: Text instruction sent alongside the image.

    Returns:
        str: Content of the first choice's message.

    Raises:
        RuntimeError: If the endpoint answers with a status other than 200.
        requests.Timeout: If no response arrives within the timeout.
    """
    GROQ_API_KEY, _, GROQ_ENDPOINT, MODEL = load_image_parameters()

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}"
    }

    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        }
                    }
                ]
            }
        ],
        "model": MODEL,
        "temperature": 1,
        "max_completion_tokens": 1024,
        "top_p": 1,
        "stream": False,
        "stop": None
    }

    response = requests.post(GROQ_ENDPOINT, json=payload, headers=headers, timeout=60)

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]

    # Fail loudly instead of returning None: the return type is ``str`` and the
    # ingestion loop wraps this call in try/except to substitute a placeholder.
    raise RuntimeError(f"❌ GROQ API error: {response.status_code} - {response.text}")


In [8]:
import os
import requests
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Directory passed to Chroma as its persist directory.
CHROMA_PATH = "./vector_store"
# Model name passed to HuggingFaceEmbeddings.
EMBED_MODEL = "all-MiniLM-L6-v2"

def get_vectorstore():
    """Open the Chroma store persisted at ``CHROMA_PATH``.

    Returns:
        Chroma: A store whose embedding function is a ``HuggingFaceEmbeddings``
        instance built from ``EMBED_MODEL``.
    """
    return Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=EMBED_MODEL),
        persist_directory=CHROMA_PATH,
    )

In [9]:
def context(query: str, vectordb, k=5):
    """Retrieve the ``k`` documents most similar to ``query``.

    Args:
        query: The user's question.
        vectordb: Vector store exposing ``as_retriever``.
        k: Number of documents to request from the retriever.

    Returns:
        The documents returned by the similarity retriever.
    """
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # ``invoke`` is the supported retriever entry point; ``get_relevant_documents``
    # is deprecated in current langchain-core releases.
    return retriever.invoke(query)




#DATA INGESTION



In [22]:
# Ingestion settings: input folder, vector-store directory and embedding model name.
DOCS_PATH = "./data/"
CHROMA_PATH = "./vector_store"
EMBED_MODEL = "all-MiniLM-L6-v2"

In [11]:
# Module-level embedding model used by store_embeddings; files are cached under ./models.
embedder = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    cache_folder="./models",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
def extract_text_images_from_pdf(path, image_output_dir="data/extracted_images"):
    """Extract page text and embedded images from a PDF as ``Document`` objects.

    For every page with non-empty text a ``"Text"`` document is produced. Every
    embedded image is written to ``image_output_dir``, uploaded to ImgBB and
    described by the vision model; the description becomes the page content of
    an ``"Image"`` document whose metadata carries the image itself (base64)
    and its hosted URL.

    Args:
        path: Path of the PDF file.
        image_output_dir: Directory where extracted images are saved.

    Returns:
        list[Document]: Text and image documents in page order.
    """
    docs = []
    os.makedirs(image_output_dir, exist_ok=True)

    # The context manager closes the PDF handle even if extraction fails midway.
    with fitz.open(path) as pdf:
        for i, page in enumerate(pdf):
            page_number = i + 1

            # Extract text
            text = page.get_text().strip()
            if text:
                docs.append(Document(
                    page_content=text,
                    metadata={
                        "type": "Text",
                        "page_number": page_number,
                        "source": path
                    }
                ))

            # Extract and describe images
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image.get("ext", "png")
                image_b64 = base64.b64encode(image_bytes).decode("utf-8")

                local_filename = f"page_{page_number}_img_{img_index+1}.{image_ext}"
                local_path = os.path.join(image_output_dir, local_filename)

                # Save locally
                with open(local_path, "wb") as f:
                    f.write(image_bytes)

                # Reset per image: if the upload fails, the metadata must not hit
                # an unbound name or silently reuse the previous image's URL.
                public_url = ""
                description = None
                try:
                    public_url = upload_to_imgbb(local_path)
                    description = descriptor(public_url)
                except Exception as e:
                    print(f"⚠️ Error processing image on page {i+1}: {e}")

                # Also covers a describer that returns nothing instead of raising.
                if not description:
                    description = "Image description unavailable due to error."

                # Add image document with description
                docs.append(Document(
                    page_content=description,
                    metadata={
                        "type": "Image",
                        "page_number": page_number,
                        "source": path,
                        "image_name": local_filename,
                        "image_ext": image_ext,
                        "image_base64": image_b64,
                        "imgbb_url": public_url
                    }
                ))

    print(f"✅ Extracted {len(docs)} text+image elements from {path}")
    return docs

In [13]:
def extract_tables_from_pdf(path):
    """Extract every table pdfplumber finds in the PDF as a ``"Table"`` document.

    Each non-empty row is rendered as its cells joined by ``", "`` (``None``
    cells become empty strings) and rows are joined by newlines. Tables whose
    rendered text is blank are skipped.

    Returns:
        list[Document]: One document per non-blank table, with the 1-based page
        number and source path in its metadata.
    """
    table_docs = []
    with pdfplumber.open(path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for table in page.extract_tables():
                rendered_rows = []
                for row in table:
                    if not row:
                        continue
                    cells = ["" if cell is None else cell for cell in row]
                    rendered_rows.append(", ".join(cells))

                table_text = "\n".join(rendered_rows)
                if not table_text.strip():
                    continue

                table_docs.append(Document(
                    page_content=table_text,
                    metadata={"type": "Table", "page_number": page_number, "source": path},
                ))
    print(f"✅ Extracted {len(table_docs)} table elements from {path}")
    return table_docs

In [14]:
def load_documents(directory):
    """Load every supported file found directly inside ``directory``.

    PDFs go through text/image and table extraction, ``.txt`` files through
    ``TextLoader`` and ``.md`` files through ``UnstructuredMarkdownLoader``.
    Entries with any other extension are ignored.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        list[Document]: All documents produced from the supported files.
    """
    all_docs = []
    # Sorted listing keeps ingestion order stable; os.listdir order is arbitrary.
    for file in sorted(os.listdir(directory)):
        path = os.path.join(directory, file)
        # Match extensions case-insensitively so "REPORT.PDF" is not skipped.
        name = file.lower()
        if name.endswith(".pdf"):
            all_docs += extract_text_images_from_pdf(path)
            all_docs += extract_tables_from_pdf(path)
        elif name.endswith(".txt"):
            all_docs += TextLoader(path).load()
        elif name.endswith(".md"):
            all_docs += UnstructuredMarkdownLoader(path).load()
    print(f"📚 Loaded total {len(all_docs)} documents from {directory}")
    return all_docs

In [15]:
def chunk_documents(docs, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, handed to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    print(f"🔗 Split into {len(chunks)} chunks")
    return chunks

In [16]:
def store_embeddings(chunks):
    """Embed ``chunks`` with the module-level ``embedder`` and persist them in Chroma.

    Args:
        chunks: Documents to embed and store.

    Returns:
        The ``Chroma`` store created from the chunks, or ``None`` when there is
        nothing to store.
    """
    if not chunks:
        # Nothing to embed; avoid handing Chroma an empty batch.
        print("⚠️ No chunks to store; skipping ChromaDB write.")
        return None

    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=CHROMA_PATH
    )
    print(f"✅ Stored {len(chunks)} chunks in ChromaDB at {CHROMA_PATH}")
    # Hand the store back so callers can query it without reopening it.
    return vectordb

#EXECUTE DATA INGESTION & CHUNKING

In [None]:
#https://drive.google.com/file/d/1sxgCImukvmc43QfruK6yWfNN4s6JY-3L/view?usp=drive_link
# if os.path.exists("data/who_guideline.pdf"):
#     print("✅ File exists. Proceeding to download existing vector data binary...")

#     folder_zip_id = "1sxgCImukvmc43QfruK6yWfNN4s6JY-3L"
#     !gdown --id {folder_zip_id} -O vectordb.zip

#     !unzip -o vectordb.zip -d /content/
# else:
print("Proceding for data ingestion pipeline...")
print("🚀 Starting ingestion pipeline (fitz + pdfplumber)...")
print(DOCS_PATH)
docs = load_documents(DOCS_PATH)
# Show a short preview instead of printing every Document: image documents
# carry their base64 payload in metadata and would flood the cell output.
for doc in docs[:3]:
    print(doc.metadata.get("type"), doc.metadata.get("page_number"), doc.page_content[:200])
chunks = chunk_documents(docs)
store_embeddings(chunks)


Proceding for data ingestion pipeline...
🚀 Starting ingestion pipeline (fitz + pdfplumber)...
./data/


#CHAT BOT

In [None]:
import gradio as gr
from PIL import Image
import requests
import base64
from io import BytesIO


# Open the persisted vector store once; query_rag_system reads this module-level handle.
vectordb = get_vectorstore()


def query_rag_system(query):
    """Answer ``query`` from the vector store and return the answer with its sources.

    Args:
        query: The user's question.

    Returns:
        dict: ``{"answer": str, "sources": list[dict]}``. Each source is the
        retrieved document's metadata plus a ``"content"`` key holding its page
        content. Any exception is reported in ``"answer"`` with empty sources,
        so the chat UI never sees a raised error.
    """
    try:
        # Treat None, "" and whitespace-only input alike.
        if not query or not query.strip():
            return {"answer": "⚠️ No query provided.", "sources": []}

        docs = context(query, vectordb)
        # Log a count only: image documents carry base64 payloads in metadata.
        print(f"📄 Retrieved {len(docs)} context docs")

        prompt = build_prompt(query, docs)

        answer = query_llm(prompt).strip()
        print("🤖 LLM Response:", answer)

        return {
            "answer": answer,
            "sources": [{
                **d.metadata,
                "content": d.page_content
            } for d in docs],
        }

    except Exception as e:
        print("❌ Exception:", str(e))
        return {
            "answer": f"❌ Exception: {str(e)}",
            "sources": []
        }

def decode_base64_image(base64_str):
    """Turn a base64 string into a PIL image.

    Returns ``None`` (after printing the error) when decoding the string or
    opening the image fails.
    """
    try:
        buffer = BytesIO(base64.b64decode(base64_str))
        image = Image.open(buffer)
    except Exception as err:
        print("Failed to decode base64 image:", err)
        return None
    return image

def rag_chatbot(user_message, chat_history):
    """Gradio handler: answer ``user_message`` and list the sources it was drawn from.

    Args:
        user_message: Text the user submitted.
        chat_history: List of ``(user, bot)`` tuples; the new exchange is appended to it.

    Returns:
        tuple: ``(chat_history, chat_history, images)`` where ``images`` is a
        list of ``(PIL image, caption)`` pairs for the retrieved image sources.
    """
    rag_result = query_rag_system(user_message)

    answer = rag_result.get("answer", "⚠️ No answer returned.")
    sources = rag_result.get("sources", [])

    answer_output = answer
    images = []

    if sources:
        answer_output += "\n\n📖 **Source Context:**"
        for i, src in enumerate(sources):
            answer_output += f"\n\n**Source {i+1}**"
            answer_output += f"\n- **Type:** `{src.get('type', 'Unknown')}`"
            answer_output += f"\n- **Page:** `{src.get('page_number', 'N/A')}`"
            answer_output += f"\n- **File:** `{src.get('source', 'N/A')}`"
            answer_output += f"\n- **Content:** `{src.get('content', 'N/A')}`"

            if src.get("type") != "Image":
                continue

            img = None
            if src.get("image_base64"):
                img = decode_base64_image(src["image_base64"])

            # Fall back to the hosted copy when there is no embedded image or
            # when decoding it failed.
            if img is None and src.get("imgbb_url"):
                try:
                    resp = requests.get(src["imgbb_url"], timeout=30)
                    resp.raise_for_status()
                    img = Image.open(BytesIO(resp.content))
                except Exception as e:
                    print("Failed to load image from URL:", e)

            if img is not None:
                images.append((img, f"Image from page {src.get('page_number')}"))

    chat_history.append((user_message, answer_output))
    return chat_history, chat_history, images


# Chat UI: a transcript, a question box and a gallery for images from the retrieved sources.
with gr.Blocks() as demo:
    gr.Markdown("## 📚 Chat with your Documents using RAG")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Ask something...", placeholder="e.g., What is covered on page 5?")
    # Conversation history handed to rag_chatbot and replaced by its return value on every submit.
    state = gr.State([])
    gallery = gr.Gallery(label="📷 Source Images", columns=2, height=300)

    # On submit: rag_chatbot(msg, state) -> (chatbot, state, gallery).
    msg.submit(rag_chatbot, [msg, state], [chatbot, state, gallery])

# NOTE(review): share=True requests a publicly reachable Gradio link — confirm that is intended.
demo.launch(share=True)