In [7]:
!pip install chromadb
!pip install chromadb google-generativeai python-dotenv tiktoken
!pip install pypdf python-dotenv

Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.2-py3-none-any.whl (329 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 329.1/329.1 kB 11.1 MB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-6.6.2


In [4]:
import chromadb
from google.colab import userdata

# Connect to the hosted Chroma database. The API key is read from Colab's
# secret store; tenant and database select the workspace to use.
client = chromadb.CloudClient(
    tenant="b48cc1cb-b913-4b47-86e2-08dec612ea3e",
    database="smartcloud",
    api_key=userdata.get("CHROMA_KEY"),
)

In [8]:
import os
import uuid
import chromadb
import google.generativeai as genai
from typing import List
from dotenv import load_dotenv
from pypdf import PdfReader


In [9]:
# Authenticate the Gemini SDK with the key stored under 'GOOGLE_API_KEY' in
# Colab's secret store (`userdata` is imported in the Chroma client cell).
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

In [11]:
# User identifiers. `userId2` is the default id written to chunk metadata at
# ingestion and used as the metadata filter at query time; `userId1` is not
# referenced by any code visible in this notebook.
userId1 = 'kalyan09876-2333'
userId2 = 'sufiya03294-2333'
# Gemini embedding model used for both document chunks and queries.
EMBEDDING_MODEL = "models/text-embedding-004"

In [12]:
# Open the collection that stores chunk text, embeddings and metadata,
# creating it if it does not exist yet.
# NOTE(review): "hnsw:space": "cosine" presumably selects cosine distance for
# the vector index — confirm against the Chroma documentation.
collection = client.get_or_create_collection(
    name="rag_context_embeddings",
    metadata={"hnsw:space": "cosine"}
)

In [29]:
def load_pdf(path: str) -> str:
    """Return the text of the PDF at ``path`` as one newline-joined string.

    Pages are read in document order; a page whose extraction yields nothing
    (``None`` or an empty string) is left out of the result.
    """
    page_texts = (page.extract_text() for page in PdfReader(path).pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Raw text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each made of words re-joined with single
        spaces. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a step of ``chunk_size - overlap <= 0`` never advances
            and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Stop once a chunk reaches the last word: any further window would
        # only repeat words already contained in this chunk.
        if end >= len(words):
            break

    return chunks

def build_contextual_chunks(text: str, user_id=None, source: str = "custom_document"):
    """
    Split ``text`` into chunks and pair each one with its neighbours.

    The text that gets embedded for a chunk is the chunk itself framed by the
    previous and next chunk, while the stored document text is the chunk alone.

    Args:
        text: Full document text, split with ``chunk_text`` defaults.
        user_id: Owner id written to each chunk's metadata. Defaults to the
            notebook-level ``userId2`` when omitted.
        source: Label written to each chunk's ``source`` metadata field.

    Returns:
        A list of dicts with the keys ``id``, ``text``,
        ``context_embedding_text`` and ``metadata`` (``chunk_index``,
        ``source``, ``userId``). Empty when ``text`` contains no words.
    """
    chunks = chunk_text(text)
    # Resolved at call time so a later change to userId2 is picked up.
    owner = userId2 if user_id is None else user_id
    contextual_chunks = []

    for i, chunk in enumerate(chunks):
        prev_chunk = chunks[i - 1] if i > 0 else ""
        next_chunk = chunks[i + 1] if i < len(chunks) - 1 else ""

        # Assemble the sections line by line so the indentation of this source
        # file does not end up inside the text sent to the embedding model.
        context_text = "\n".join([
            "PREVIOUS CONTEXT:",
            prev_chunk,
            "",
            "MAIN CHUNK:",
            chunk,
            "",
            "NEXT CONTEXT:",
            next_chunk,
        ]).strip()

        # Deterministic id: the same owner/source/position/content always maps
        # to the same id, so ingesting a document twice does not mint a second
        # set of ids for identical chunks.
        # NOTE(review): relies on the vector store skipping or rejecting ids it
        # already holds — confirm against the Chroma docs.
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{owner}/{source}/{i}/{chunk}"))

        contextual_chunks.append({
            "id": chunk_id,
            "text": chunk,
            "context_embedding_text": context_text,
            "metadata": {
                "chunk_index": i,
                "source": source,
                "userId": owner
            }
        })

    return contextual_chunks


def add_document(text: str):
    """Chunk ``text``, embed every chunk with its context, and store the result.

    The embedding is computed from each chunk's ``context_embedding_text``
    (the chunk plus its neighbours), while the stored document is the chunk
    text alone.

    Args:
        text: Full document text to ingest.

    Returns:
        None. Does nothing when ``text`` yields no chunks.
    """
    chunks = build_contextual_chunks(text)
    if not chunks:
        # Nothing to embed or store. Skipping also avoids handing empty
        # id/document lists to the collection.
        # NOTE(review): Chroma is expected to reject an empty add — confirm.
        return

    embeddings = embed_text(
        [c["context_embedding_text"] for c in chunks]
    )

    collection.add(
        ids=[c["id"] for c in chunks],
        documents=[c["text"] for c in chunks],
        embeddings=embeddings,
        metadatas=[c["metadata"] for c in chunks],
    )



def embed_text(texts: List[str], task_type: str = "retrieval_document") -> List[List[float]]:
    """Embed a batch of texts with the configured Gemini embedding model.

    Args:
        texts: Strings to embed, in the order the vectors should come back.
        task_type: Embedding task type passed to ``genai.embed_content``.
            Defaults to ``"retrieval_document"`` for text being indexed; pass
            ``"retrieval_query"`` when embedding a search query.

    Returns:
        One embedding vector per input text, taken from the ``"embedding"``
        field of the API response. An empty input returns ``[]`` without
        calling the API.
    """
    if len(texts) == 0:
        # Nothing to embed; skip the remote call entirely.
        return []

    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=texts,
        task_type=task_type,
    )
    return result["embedding"]

def retrieve(query: str, k: int = 4, user_id=None):
    """Return the stored chunks most similar to ``query`` for one user.

    Args:
        query: Natural-language search query.
        k: Maximum number of chunks to return.
        user_id: Only chunks whose ``userId`` metadata equals this value are
            searched. Defaults to the notebook-level ``userId2`` when omitted.

    Returns:
        A list of chunk texts for the single query issued; empty when the
        response carries no documents.
    """
    # Embed the query with the query-side task type; the indexed chunks use
    # the document-side type, and the two are meant to be paired.
    query_embedding = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=query,
        task_type="retrieval_query",
    )["embedding"]

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        where={
            "userId": userId2 if user_id is None else user_id
        }
    )

    # One query was sent, so the first (and only) result list is ours. Guard
    # against a missing or empty "documents" field instead of indexing blindly.
    documents = results.get("documents") or [[]]
    return documents[0]

def generate_answer(query: str, k: int = 4, model_name: str = "models/gemini-flash-lite-latest"):
    """Answer ``query`` using chunks retrieved from the vector store.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve and place in the prompt.
        model_name: Gemini model used to generate the answer.

    Returns:
        The ``text`` of the model response.
        NOTE(review): accessing ``response.text`` may raise when the model
        returns no usable candidate — confirm against the SDK docs.
    """
    retrieved_chunks = retrieve(query, k)

    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
    Use the context below to answer the question.
    If the answer is not in the context, say you don't know.

    CONTEXT:
    {context}

    QUESTION:
    {query}
    """

    model = genai.GenerativeModel(model_name)

    response = model.generate_content(prompt)

    return response.text

def ingest_pdf(pdf_path: str):
    """Extract the text of a PDF and ingest it into the collection.

    Chunking, embedding and storage are delegated to ``add_document`` so both
    ingestion entry points share a single implementation.

    Args:
        pdf_path: Path of the PDF file to ingest.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Ingesting {pdf_path}...")
    text = load_pdf(pdf_path)

    if not text.strip():
        # load_pdf found no text on any page, so there is nothing to chunk,
        # embed or store.
        print("No extractable text found; nothing ingested.\n")
        return

    add_document(text)

    print("Ingestion complete.\n")

In [32]:
if __name__ == "__main__":
    # Alternative sample documents; uncomment one to ingest it instead.
    # pdf_path = "/content/pdfs/Cloud_resources_modelling_using_smart_cloud_manage.pdf"
    # pdf_path = "/content/pdfs/jamaneurology_tanaka_2026_oi_250092_1768320727.58952.pdf"
    pdf_path = "/content/pdfs/A230-Final.pdf"
    # Every run of this cell calls ingest_pdf again for the selected file.
    ingest_pdf(pdf_path)



Ingesting /content/pdfs/A230-Final.pdf...
Ingestion complete.



In [39]:
# Ask one sample question and show it next to the generated answer.
question = "what is Baahubali"
answer = generate_answer(question)
print(f"Q: {question}", f"A: {answer}", sep="\n")

Q: what is Baahubali
A: Baahubali is an Indian bilingual epic historical fiction movie directed by S.S. Rajamouli, produced by Shobu Yarlagadda and Prasad Devineni, in two cinematic parts (Bahubali: The Beginning in 2015 and Bahubali: The Conclusion in 2017). It was originally shot in Telugu and Tamil and was also a hit in North India when released in Hindi by Dharma Production.


In [31]:
# "What is the main concept described in the document?"
