In [None]:
!pip install pypdf


In [None]:
# 2️⃣ Imports and basic setup

import os
import math
from typing import List, Tuple

from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
from pypdf import PdfReader

from google.colab import files  # For file upload in Colab
import textwrap
import getpass


In [None]:
# 2️⃣.1 Configure the Groq API key (hidden prompt in Colab)

# Prompt only when the variable is missing or blank; getpass keeps the
# pasted key off the screen.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ_API_KEY: ")

groq_api_key = os.environ["GROQ_API_KEY"]

# Groq client shared by the cells below
client = Groq(api_key=groq_api_key)

# Model used to generate answers (replace with another model id if desired)
GROQ_MODEL = "openai/gpt-oss-120b"  # example model name


## 🔤 3. Load the embedding model

We will use a popular, fast Sentence Transformer:

- `sentence-transformers/all-MiniLM-L6-v2`



In [None]:
# 3️⃣ Load embedding model

# Model identifier handed to SentenceTransformer below.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Module-level model instance; the indexing and retrieval helpers in later
# cells call `embedding_model.encode(...)` on it.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Get embedding dimension to build FAISS index later
EMBED_DIM = embedding_model.get_sentence_embedding_dimension()
print("Embedding dimension:", EMBED_DIM)


## 📄 4. Upload a document (PDF or TXT)

You can upload:
- A **PDF** file (e.g., book, report)
- A **TXT** file (plain text, article, notes)

We will extract text from the file for chunking and indexing.


In [None]:
# 4️⃣ Helper function: load text from uploaded file

def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of a PDF file using pypdf.

    Extraction is best-effort: a page whose text cannot be extracted is
    skipped, and a warning naming that page is emitted so that missing
    content does not go unnoticed.

    - file_path: path of the PDF file to read

    Returns: the text of all readable pages joined with newlines
    (pages without text contribute an empty string).
    """
    import warnings  # local import so this helper does not depend on another cell

    reader = PdfReader(file_path)
    pages_text = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            pages_text.append(page.extract_text() or "")
        except Exception as exc:
            # Deliberately broad: one unreadable page must not abort the
            # whole document, but the user should know it was dropped.
            warnings.warn(
                f"Skipping page {page_number} of {file_path}: "
                f"text extraction failed ({exc!r}).",
                stacklevel=2,
            )
    return "\n".join(pages_text)


def load_document_text() -> Tuple[str, str]:
    """
    Ask the user to upload a file through Colab and return its text.

    Only the first uploaded file is used. `.pdf` files are read from disk
    via `extract_text_from_pdf`; `.txt` files are decoded from the uploaded
    bytes as UTF-8, ignoring undecodable bytes.

    Returns: (file_name, text_content) with carriage returns and tabs
    replaced by spaces and surrounding whitespace stripped.

    Raises: ValueError if nothing was uploaded or the extension is not
    .pdf / .txt.
    """
    uploaded = files.upload()  # Colab file picker; maps file name -> bytes
    if not uploaded:
        raise ValueError("No file uploaded.")

    # Only the first file of the upload is considered
    file_name = next(iter(uploaded))
    lowered_name = file_name.lower()

    if not lowered_name.endswith((".pdf", ".txt")):
        raise ValueError("Unsupported file type. Please upload a .pdf or .txt file.")

    if lowered_name.endswith(".pdf"):
        # The upload is read back from disk under its own name
        # (the working directory, per the original author's note).
        text = extract_text_from_pdf(file_name)
    else:
        text = uploaded[file_name].decode("utf-8", errors="ignore")

    # Basic cleaning: carriage returns and tabs become single spaces
    cleaned = text.translate(str.maketrans("\r\t", "  "))
    return file_name, cleaned.strip()


# Hint shown after the helpers are defined (emoji restored from mis-encoded text).
print("⬆️ Run the next cell to upload your document.")


### 📥 Upload your document now


In [None]:
# Open the upload dialog; `raw_text` is the cleaned document text that the
# chunking cell further down reads.
file_name, raw_text = load_document_text()
print(f"Loaded file: {file_name}")
print("Total characters in document:", len(raw_text))
# Short preview so the extracted text can be checked by eye.
print("\nPreview (first 500 characters):\n")
print(raw_text[:500])


## ✂️ 5. Dynamic chunking

We want the **number of chunks to depend on the document size**:

- A small article → only a few chunks (e.g., 3–4)
- A big book → many chunks (e.g., hundreds)

We’ll:

1. Decide how many chunks we *roughly* want based on total characters  
2. Derive a chunk size from that  
3. Split the text accordingly with some overlap for continuity


In [None]:
# 5️⃣ Dynamic chunking utilities

def compute_dynamic_chunk_size(
    total_chars: int,
    base_target_chars_per_chunk: int = 900,
    min_chunks: int = 3,
    max_chunks: int = 400,
) -> Tuple[int, int]:
    """
    Pick a chunk size that scales with the length of the document.

    - `total_chars`: number of characters in the document
    - `base_target_chars_per_chunk`: preferred chunk size for large documents
    - `min_chunks` / `max_chunks`: bounds applied to the chunk count

    Returns: (chunk_size, estimated_num_chunks)

    Raises: ValueError if `total_chars` is zero or negative.
    """
    if total_chars <= 0:
        raise ValueError("Document is empty.")

    # Chunk count we would get at the preferred chunk size
    wanted = math.ceil(total_chars / base_target_chars_per_chunk)

    # Apply the upper bound first, then the lower bound, so that
    # `min_chunks` wins if the two bounds ever contradict each other.
    capped = min(max_chunks, wanted)
    num_chunks = max(min_chunks, capped)

    # Spread the document evenly over that many chunks
    size = math.ceil(total_chars / num_chunks)
    return size, num_chunks


def chunk_text(
    text: str,
    chunk_size: int,
    chunk_overlap: int = 150
) -> List[str]:
    """
    Split text into overlapping chunks.

    - text: the document text to split
    - chunk_size: max characters per chunk (must be positive)
    - chunk_overlap: characters overlapped between consecutive chunks
      (must not be negative)

    If `chunk_overlap` is not smaller than `chunk_size`, the window could
    never move forward; in that case the overlap falls back to half the
    chunk size so that short documents with small chunks still terminate.

    Returns: the whitespace-stripped, non-empty chunks in document order.

    Raises: ValueError if `chunk_size` is not positive or `chunk_overlap`
    is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must not be negative.")

    # An overlap >= chunk_size gives a step of zero (or less) and would loop
    # forever; overlaps below chunk_size are used unchanged.
    if chunk_overlap >= chunk_size:
        chunk_overlap = chunk_size // 2

    step = chunk_size - chunk_overlap  # always >= 1 here
    text_length = len(text)

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        # This window already reached the end of the text; another window
        # would only repeat the tail of the chunk just produced.
        if end >= text_length:
            break

    return chunks


In [None]:
# 5️⃣.1 Apply dynamic chunking to your document

total_chars = len(raw_text)

# NOTE: max_chunks=500 passed here overrides the function's default of 400.
dynamic_chunk_size, estimated = compute_dynamic_chunk_size(
    total_chars,
    base_target_chars_per_chunk=900,  # You can tweak this
    min_chunks=3,
    max_chunks=500,
)

print(f"Estimated number of chunks: ~{estimated}")
print(f"Dynamic chunk size: {dynamic_chunk_size} characters\n")

# Now actually create chunks. The estimate above is computed without the
# overlap, so the actual count printed below can differ from it.
chunks = chunk_text(raw_text, chunk_size=dynamic_chunk_size, chunk_overlap=150)

print(f"Actual number of chunks created: {len(chunks)}")
print("\nExample chunk (first one):\n")
# Show at most the first 600 characters of the first chunk, wrapped at 100 columns.
print(textwrap.fill(chunks[0][:600], width=100))


## 🧬 6. Build FAISS index from chunks

Steps:
1. Create embeddings for each chunk using Sentence Transformers  
2. Store them in a FAISS index for **fast similarity search**  
3. Keep a mapping from index → original chunk text


In [None]:
# 6️⃣ Create embeddings and FAISS index

def build_faiss_index(chunks: List[str]):
    """
    Embed the given text chunks and index them with FAISS.

    Uses the module-level `embedding_model` and `EMBED_DIM`.

    Returns a tuple of:
      - the FAISS inner-product index holding the chunk vectors
      - the embeddings array passed to the index (after normalization)
      - the `chunks` list itself, in the same order as the embeddings

    Raises: ValueError if `chunks` is empty.
    """
    if not chunks:
        raise ValueError("No chunks to index.")

    # One batched encode call for all chunks
    vectors = embedding_model.encode(
        chunks, convert_to_numpy=True, show_progress_bar=True
    )

    # Normalizing to unit length makes the inner product used by
    # IndexFlatIP equal to cosine similarity.
    faiss.normalize_L2(vectors)

    ip_index = faiss.IndexFlatIP(EMBED_DIM)  # IP = inner product
    ip_index.add(vectors)

    return ip_index, vectors, chunks


# Build the index once for the uploaded document. `faiss_index` and
# `doc_chunks` are read as module-level globals by the query helpers below.
faiss_index, doc_embeddings, doc_chunks = build_faiss_index(chunks)
print("FAISS index built with", len(doc_chunks), "chunks.")


## 🔍 7. Define a RAG query function

For each user question:

1. Convert the question into an embedding  
2. Use FAISS to find the **top-k most similar chunks**  
3. Pass those chunks + question as context to Groq LLM  
4. Get an answer grounded in the document


In [None]:
# 7️⃣ RAG query function

def retrieve_relevant_chunks(
    query: str,
    index,
    k: int = 4
) -> List[Tuple[str, float]]:
    """
    Look up the k chunks most similar to the query.

    - query: the user question
    - index: the FAISS index to search
    - k: number of neighbours requested from the index

    The chunk texts are taken from the module-level `doc_chunks` list, so
    `index` is expected to have been built from that same list.

    Returns: list of (chunk_text, score) in the order FAISS returns them.
    """
    # Embed the question the same way as the chunks: encode, then L2-normalize
    query_vector = embedding_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_vector)

    scores, ids = index.search(query_vector, k)

    # Row 0 belongs to the single query; ids equal to -1 are skipped
    # (presumably FAISS's placeholder when fewer than k results exist).
    return [
        (doc_chunks[chunk_id], float(score))
        for chunk_id, score in zip(ids[0], scores[0])
        if chunk_id != -1
    ]


def build_context_from_chunks(chunks_with_scores: List[Tuple[str, float]]) -> str:
    """
    Join the retrieved chunks into one context string.

    Each chunk is preceded by a blank line and a header of the form
    `[Chunk <n> | score=<score to 3 decimals>]`, numbered from 1.
    Returns an empty string when no chunks are given.
    """
    return "".join(
        f"\n\n[Chunk {position} | score={score:.3f}]\n" + chunk
        for position, (chunk, score) in enumerate(chunks_with_scores, start=1)
    )


def ask_groq_llm(question: str, context: str) -> str:
    """
    Send the question + context to Groq LLM and get an answer.

    - question: the user question
    - context: the retrieved document text the answer must be based on

    Uses the module-level `client` and `GROQ_MODEL`.

    Returns: the text of the first completion choice with surrounding
    whitespace stripped, or an empty string if the response has no text.
    """
    system_prompt = (
        "You are a helpful assistant using the provided context from a document. "
        "Answer the question ONLY using the context. If the answer is not in the "
        "context, say you don't know."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": (
                f"Context:\n{context}\n\n"
                f"Question: {question}\n\n"
                "Answer based only on the above context."
            ),
        },
    ]

    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=messages,
        temperature=0.2,
    )

    # `content` can be None in chat-completion responses; calling .strip()
    # on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()


def rag_answer(question: str, top_k: int = 4, show_context: bool = False) -> None:
    """
    High-level helper that answers one question about the indexed document.

    1. Retrieve relevant chunks from the module-level `faiss_index`
    2. Build context
    3. Ask Groq LLM
    4. Print results

    - question: the user question
    - top_k: number of chunks to retrieve
    - show_context: if True, also print the retrieved context, shortened
      to at most 1200 characters

    Returns: None; the question and the answer are printed.
    """
    # Emoji in the printed strings below were mis-encoded in the file and
    # are restored here.
    print(f"🔎 Question: {question}\n")

    # 1. Retrieve
    top_chunks = retrieve_relevant_chunks(question, faiss_index, k=top_k)

    # 2. Build context
    context = build_context_from_chunks(top_chunks)

    if show_context:
        print("📚 Retrieved context (truncated):\n")
        # textwrap.shorten also collapses whitespace, so this is a preview only
        print(textwrap.shorten(context, width=1200, placeholder="..."))
        print("\n" + "=" * 80 + "\n")

    # 3. Ask LLM
    answer = ask_groq_llm(question, context)

    # 4. Print results
    print("🧠 Answer:\n")
    print(answer)


## 🧪 8. Try asking questions!

Now you can ask questions about the uploaded document.  
The system will:

- Dynamically chunk the document  
- Retrieve the most relevant chunks  
- Use Groq LLM to generate an answer based **only** on the document


In [None]:
# 8️⃣ Ask your first question
# The sample question is written in Hinglish; roughly:
# "How much money did this person receive in December?"

rag_answer("Iss insaan ko December me kitne paise mile?", top_k=4, show_context=False)


In [None]:
# 8️⃣.1 Ask more questions (you can edit this cell and re-run)
# Pass show_context=True to also print the retrieved chunks.

rag_answer("Summarize the key ideas from the document.", top_k=5, show_context=False)
