In [None]:
!pip install qdrant-client sentence-transformers accelerate transformers --quiet
!pip install llama-cpp-python --upgrade --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from google.colab import files
import os
import uuid
import fitz  # PyMuPDF
import torch
import time
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

# Upload documents. `files.upload()` returns a dict keyed by the uploaded file names, so
# only the files from this upload are indexed -- not every matching file that happens to
# be lying around in the working directory from an earlier run.
uploaded = files.upload()
docs = [name for name in uploaded if name.endswith(('.pdf', '.txt'))]
print("Uploaded documents:", docs)

# Initialize embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # or try intfloat/e5-small-v2

# Initialize Qdrant (in-memory)
qdrant = QdrantClient(":memory:")
collection_name = "rag_docs"

# Always start from an empty collection.
if qdrant.collection_exists(collection_name):
    qdrant.delete_collection(collection_name)

# Take the vector size from the model instead of hard-coding 384, so swapping the
# embedding model cannot silently mismatch the collection schema.
qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=embedding_model.get_sentence_embedding_dimension(),
        distance=Distance.COSINE,
    ),
)

# Chunking
def chunk_text(text, max_tokens=300):
    """Split ``text`` into chunks of whole sentences.

    Sentences are delimited by ``.``. Sentences are packed greedily into a chunk
    until adding the next one would bring the chunk to ``max_tokens`` or more
    whitespace-separated words (the budget is counted in words, not model tokens).
    A single sentence longer than the budget is kept intact as its own chunk.

    Args:
        text: Raw document text.
        max_tokens: Word budget per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text`` contains
        no sentence content.
    """
    chunks = []
    current_sentences = []
    current_words = 0

    for raw_sentence in text.split('.'):
        sentence = raw_sentence.strip()
        if not sentence:
            # Skip the empty pieces produced by a trailing '.', "..", or blank text;
            # otherwise they turn into dangling " ." fragments or a bare "." chunk.
            continue

        sentence_words = len(sentence.split())
        # Only flush when there is something to flush, so an over-long first
        # sentence never produces an empty chunk.
        if current_sentences and current_words + sentence_words >= max_tokens:
            chunks.append(" ".join(current_sentences))
            current_sentences, current_words = [], 0

        current_sentences.append(sentence + ".")
        current_words += sentence_words

    if current_sentences:
        chunks.append(" ".join(current_sentences))
    return chunks

# Process and embed documents
all_chunks = []
for doc in docs:
    # Extract plain text; files with any other extension contribute no text.
    text = ""
    if doc.endswith(".pdf"):
        with fitz.open(doc) as pdf:
            text = " ".join(page.get_text() for page in pdf)
    elif doc.endswith(".txt"):
        with open(doc, "r", encoding="utf-8") as f:
            text = f.read()

    # Never embed blank chunks: they carry no content and only pollute search results.
    chunks = [chunk for chunk in chunk_text(text) if chunk.strip()]
    if not chunks:
        continue

    # Encode the whole document in one batch and write it with a single upsert
    # instead of one model call and one store call per chunk.
    vectors = embedding_model.encode(chunks)
    qdrant.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector.tolist(),
                payload={"text": chunk, "source": doc}
            )
            for chunk, vector in zip(chunks, vectors)
        ]
    )
    all_chunks.extend(chunks)

print(f"✅ {len(all_chunks)} chunks embedded and stored in Qdrant.")

Uploaded documents: ['Absolutely.pdf']
✅ 3 chunks embedded and stored in Qdrant.


In [None]:
from getpass import getpass
from huggingface_hub import login
# Ask for the token interactively so it is never written into the notebook source,
# then authenticate this session with the Hugging Face Hub.
access_token = getpass(prompt="Enter your Hugging Face access token:")
login(token=access_token)

Enter your Hugging Face access token:··········


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import gradio as gr
import torch
import time

# Load the instruction-tuned Gemma checkpoint (swap in any causal LM you have access to).
model_name = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # automatic device placement
    torch_dtype=torch.float16,  # load the weights in half precision
)

# Text streamer configured to skip the prompt and special tokens.
# NOTE(review): not referenced by the code visible in this section -- confirm whether it is still needed.
streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

# 🔍 Chunk Retriever
def retrieve_relevant_chunks(query, top_k=3):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: The user's natural-language question.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        A ``(texts, sources)`` tuple of parallel lists: the chunk texts and the
        file name each chunk came from. A missing payload field becomes ``""``.
    """
    # Embed the query exactly the way the chunks were embedded at indexing time
    # (raw text). The former "query: " prefix is an E5-model convention; the
    # chunks were stored without a matching prefix, so it only added noise.
    query_embedding = embedding_model.encode(query).tolist()

    # `query_points` replaces the deprecated `search` method, which emits a
    # deprecation warning on current qdrant-client releases.
    response = qdrant.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=top_k
    )

    texts = []
    sources = []
    for hit in response.points:
        payload = hit.payload or {}
        texts.append(payload.get("text", ""))
        sources.append(payload.get("source", ""))
    return texts, sources

# 🧠 Streaming-like Answer Generator using Gemma
def generate_streaming_answer(query):
    """Answer ``query`` from retrieved context and yield the reply incrementally.

    The full answer is generated in one ``model.generate`` call; it is then
    replayed word by word with a short delay so the UI appears to stream.

    Args:
        query: The user's question.

    Yields:
        Markdown strings containing the answer so far followed by the list of
        source documents the context was drawn from.
    """
    import re  # local import: only needed for whitespace-preserving splitting below

    contexts, sources = retrieve_relevant_chunks(query)
    context_str = "\n".join(contexts)

    prompt = (
        f"You are a helpful AI assistant. Based on the context below, answer the user's question clearly and concisely.\n\n"
        f"---\nContext:\n{context_str}\n---\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Splitting the full decode on
    # "Answer:" picks the wrong span whenever the retrieved context or the
    # model's own reply contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(
        generated_ids[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Loop-invariant: build the source list once, de-duplicated in retrieval order
    # (a plain set has no defined order).
    formatted_sources = "\n".join(f"• {s}" for s in dict.fromkeys(sources))

    def render(answer_text):
        return f"📘 **Answer:**\n{answer_text}\n\n📄 **Sources:**\n{formatted_sources}"

    if not answer:
        # Still emit one frame so the UI shows the sources instead of staying blank.
        yield render("_No answer was generated._")
        return

    # Replay word by word, keeping each word's trailing whitespace so line breaks
    # (paragraphs, bullet lists) in the model's Markdown survive.
    partial = ""
    for piece in re.findall(r"\S+\s*", answer):
        partial += piece
        yield render(partial.rstrip())
        time.sleep(0.03)


# 🎛 Gradio Interface
# Build and launch the web UI. Because `generate_streaming_answer` is a generator,
# each yielded Markdown string replaces the output as it arrives.
# `debug=True` keeps the cell running so errors and logs stay visible in the notebook.
gr.Interface(
    fn=generate_streaming_answer,
    inputs=gr.Textbox(lines=2, placeholder="Ask your question here..."),
    outputs=gr.Markdown(),
    title="RAG Chatbot",
    # Name the embedding model that is actually loaded (all-MiniLM-L6-v2), not E5.
    description="Real-time RAG with Google Gemma + MiniLM + Qdrant",
    allow_flagging="never"
).launch(debug=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3583862db8752511ed.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  results = qdrant.search(


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3583862db8752511ed.gradio.live


