#Library Instatllation

In [10]:
!pip install fastapi
!pip install uvicorn[standard]
!pip install python-multipart
!pip install qdrant-client
!pip install sentence-transformers
!pip install tqdm
!pip install PyMuPDF
!pip install pillow
!pip install pytesseract
!pip install sqlalchemy
!pip install aiofiles
!pip install python-dotenv
!pip install transformers
!pip install nest_asyncio
!pip install pyngrok
!pip install rank_bm25




In [11]:
!pip install langchain



#Qdrant Based Vector Database Inilization

In [12]:
from qdrant_client import QdrantClient
qdrant = QdrantClient(
    url="https://bc7fde05-251b-4940-b11a-683326ab9396.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.CTZmywE4q_bNllGjFY5_ILpWDulZxyvrINCL15v7LVM"
)


# FastAPI Endpoints

In [13]:
%%writefile app.py
from fastapi import FastAPI, UploadFile, File, Query
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models
import fitz
import pytesseract
from PIL import Image
import io, uuid, numpy as np, nest_asyncio

app = FastAPI()
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
qdrant = QdrantClient(path="./qdrant_data")

collection = "docs"
if collection not in [c.name for c in qdrant.get_collections().collections]:
    qdrant.create_collection(
        collection_name=collection,
        vectors_config=models.VectorParams(size=model.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
    )

@app.post("/upload")
async def upload(file: UploadFile = File(...)):
    content = await file.read()
    text = ""
    if file.filename.lower().endswith(".pdf"):
        pdf = fitz.open(stream=content, filetype="pdf")
        for page in pdf:
            text += page.get_text("text")
    elif file.filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image = Image.open(io.BytesIO(content))
        text = pytesseract.image_to_string(image)
    else:
        text = content.decode("utf-8", errors="ignore")

    vector = model.encode(text)
    qdrant.upsert(
        collection_name=collection,
        points=[models.PointStruct(id=str(uuid.uuid4()), vector=vector.tolist(), payload={"text": text, "filename": file.filename})],
    )
    return {"status": "ok", "filename": file.filename}

@app.get("/query")
async def query(q: str = Query(...)):
    vector = model.encode(q)
    search = qdrant.search(collection_name=collection, query_vector=vector.tolist(), limit=5)
    return {"query": q, "results": [{"score": s.score, "filename": s.payload["filename"], "text": s.payload["text"][:200]} for s in search]}

@app.get("/health")
async def health(): return {"status": "ok"}


Overwriting app.py


#Ngrok Tunnel for lacal testing

In [14]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import threading

# Allow nested event loops (required in Colab/Jupyter)
nest_asyncio.apply()

# --- ngrok setup ---
NGROK_AUTH_TOKEN = "34jTqLuLDdCeLTpYzUn3IgY5Yik_64EUmUL8dbsUxF7imD8EF"  # paste your token here
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Create public tunnel
public_url = ngrok.connect(8000)
print("Public URL:", public_url)

# --- Run Uvicorn in a thread ---
def run_app():
    uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")

# Run in a background thread
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

Public URL: NgrokTunnel: "https://heavily-tubbier-tajuana.ngrok-free.dev" -> "http://localhost:8000"


#Directory

In [15]:
!mkdir -p modules data/uploads data/processed


INFO:     Started server process [1294]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
ERROR:    [Errno 98] error while attempting to bind on address ('0.0.0.0', 8000): address already in use
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


#Retrival (Sparse, Dense, Hybrid)

In [16]:
%%writefile modules/query.py
import numpy as np
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import MinMaxScaler

# --- Initialize models ---
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# --- Connect to persistent Qdrant Cloud ---
qdrant = QdrantClient(
    url="https://bc7fde05-251b-4940-b11a-683326ab9396.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.CTZmywE4q_bNllGjFY5_ILpWDulZxyvrINCL15v7LVM"
)

collection_name = "documents"

# --- Sparse Retriever (BM25) ---
bm25_index = None
bm25_texts = []

def initialize_bm25():
    """
    Initialize BM25 dynamically from stored text chunks in Qdrant.
    """
    global bm25_index, bm25_texts

    print(" Loading text chunks from Qdrant for BM25...")
    points, _ = qdrant.scroll(collection_name=collection_name, limit=5000)
    bm25_texts = [p.payload.get("text", "") for p in points if p.payload.get("text")]

    if not bm25_texts:
        print(" No text chunks found in Qdrant for BM25 initialization.")
        return

    tokenized_corpus = [text.split() for text in bm25_texts]
    bm25_index = BM25Okapi(tokenized_corpus)
    print(f"BM25 index initialized with {len(bm25_texts)} text chunks.")


# --- Dense Retrieval ---
def dense_search(query_text, top_k=5):
    """Semantic dense retrieval using embeddings from Qdrant."""
    query_vector = embedder.encode(query_text).tolist()
    search_result = qdrant.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

    dense_results = [
        {
            "text": hit.payload.get("text", ""),
            "file_path": hit.payload.get("file_path"),
            "file_type": hit.payload.get("file_type"),
            "score": float(hit.score),
            "type": "dense"
        }
        for hit in search_result
    ]
    return dense_results


# --- Sparse Retrieval ---
def sparse_search(query_text, top_k=5):
    """Keyword-based BM25 retrieval over stored chunks."""
    global bm25_index

    if bm25_index is None:
        initialize_bm25()

    if bm25_index is None:
        return []

    tokenized_query = query_text.split()
    scores = bm25_index.get_scores(tokenized_query)
    top_indices = np.argsort(scores)[::-1][:top_k]

    sparse_results = [
        {
            "text": bm25_texts[i],
            "file_path": None,
            "score": float(scores[i]),
            "type": "sparse"
        }
        for i in top_indices
    ]
    return sparse_results


# --- Hybrid Retrieval ---
def hybrid_search(query_text, top_k=5, alpha=0.5):
    """
    Combine dense (semantic) and sparse (keyword) retrieval.
    alpha balances weight: 0.5 = equal.
    """
    dense_results = dense_search(query_text, top_k)
    sparse_results = sparse_search(query_text, top_k)

    combined = dense_results + sparse_results
    if not combined:
        return []

    # Normalize and combine scores
    scores = np.array([r["score"] for r in combined]).reshape(-1, 1)
    normalized = MinMaxScaler().fit_transform(scores).flatten()

    for i, r in enumerate(combined):
        r["hybrid_score"] = (
            normalized[i] * (alpha if r["type"] == "dense" else (1 - alpha))
        )

    sorted_results = sorted(combined, key=lambda x: x["hybrid_score"], reverse=True)
    return sorted_results[:top_k]



Writing modules/query.py


#Ingest and Chunking

In [17]:
%%writefile modules/ingest.py
import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from datetime import datetime
from modules.query import initialize_bm25
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid

# --- Initialize models ---
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# --- Initialize Qdrant ---
qdrant = QdrantClient(
    url="https://bc7fde05-251b-4940-b11a-683326ab9396.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.CTZmywE4q_bNllGjFY5_ILpWDulZxyvrINCL15v7LVM"
)
collection_name = "documents"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config={"size": 768, "distance": "Cosine"},  # 768 for mpnet
)


def smart_chunk_text(text):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=["\n\n", ".", "!", "?", "\n", " "]
    )
    chunks = splitter.split_text(text)
    print(f" Created {len(chunks)} semantic chunks.")
    return chunks


def extract_text_from_image(image_path):
    """Extract text from an image using OCR."""
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        return f"OCR failed: {str(e)}"

def extract_from_pdf(pdf_path):
    """Extract text and OCR from all pages and images in a PDF."""
    text_content = []
    pdf_document = fitz.open(pdf_path)

    for page_index, page in enumerate(pdf_document):
        # --- Extract visible text ---
        page_text = page.get_text("text")
        if page_text.strip():
            text_content.append(f"[Page {page_index+1} Text]\n{page_text}")

        # --- Extract and OCR images ---
        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                os.makedirs("data/processed", exist_ok=True)
                image_filename = f"{uuid.uuid4()}.{image_ext}"
                image_path = os.path.join("data/processed", image_filename)
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                # OCR on image
                ocr_text = extract_text_from_image(image_path)
                if ocr_text.strip():
                    text_content.append(f"[Page {page_index+1} Image {img_index+1} OCR]\n{ocr_text}")

            except Exception as e:
                print(f" Error extracting image {img_index} on page {page_index}: {e}")

    full_text = "\n".join(text_content)
    return full_text.strip()


"""Main ingestion function for any supported file type."""

def process_file(file_path, file_type):
    if file_type.lower() in ["txt", "text"]:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    elif file_type.lower() in ["jpg", "jpeg", "png"]:
        content = extract_text_from_image(file_path)
    elif file_type.lower() == "pdf":
        content = extract_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file type")


    # --- Create semantic chunks ---
    chunks = smart_chunk_text(content)

    all_chunks = []
    for idx, chunk in enumerate(chunks):
        embedding = embedder.encode(chunk).tolist()
        all_chunks.append({"text": chunk})

        qdrant.upsert(
            collection_name=collection_name,
            points=[
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=embedding,
                    payload={
                        "text": chunk,
                        "file_path": file_path,
                        "file_type": file_type,
                        "chunk_index": idx,
                        "timestamp": datetime.now().isoformat(),
                    },
                )
            ],
        )

    print(f"Ingested {len(chunks)} chunks into Qdrant.")
    initialize_bm25(all_chunks)  # build BM25 for hybrid search
    return {"status": "success", "chunks": len(chunks), "path": file_path}

Writing modules/ingest.py


In [18]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models # Import models
import numpy as np
import uuid
import fitz
import pytesseract
from PIL import Image
import io
import os
import torch # Import torch

# --- Initialize models ---
# Explicitly set the device during initialization as a workaround for the meta tensor error
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

# --- Initialize Qdrant ---
# Use a new in-memory client for this cell's execution
qdrant = QdrantClient(
    url="https://bc7fde05-251b-4940-b11a-683326ab9396.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.CTZmywE4q_bNllGjFY5_ILpWDulZxyvrINCL15v7LVM"
)

collection_name = "documents"
if collection_name not in [c.name for c in qdrant.get_collections().collections]:
    qdrant.recreate_collection( # Use recreate_collection for simplicity in this example
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=embedder.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
    )

def chunk_text(text, max_chars=500, overlap=100):
    """Split text into overlapping chunks for better retrieval."""
    text = text.replace("\n", " ").strip()
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chars
        chunks.append({"text": text[start:end]}) # Return as list of dicts for BM25
        start += max_chars - overlap
    return chunks


# --- Re-ingest the document for this client instance ---

file_name = "IIT Patna & IIIT Ranchi Joint Degree MCA syllabi.pdf" # Or the name of the file you uploaded earlier
file_path = file_name # Assuming it's in the current directory
file_type = file_name.split(".")[-1]

text = ""
try:
    if file_type.lower() == "pdf":
        pdf = fitz.open(file_path)
        for page in pdf:
            text += page.get_text("text")
    elif file_type.lower() in ["png", "jpg", "jpeg"]:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    else:
        # Assuming it's a text file
        with open(file_path, "r") as f:
            text = f.read()
except Exception as e:
    print(f"Error during text extraction: {e}")
    text = "" # Ensure text is empty string on error


# Add this line to print the extracted text
print("--- Extracted Text ---")
print(text)
print("----------------------")


#  Chunk + Embed + Store ---
if text.strip(): # Only chunk, embed, and store if text is not empty
    chunks = chunk_text(text)
    for idx, chunk in enumerate(chunks):
        vector = embedder.encode(chunk["text"]).tolist()
        qdrant.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload={
                        "text": chunk["text"],
                        "filename": file_name,
                        "file_type": file_type,
                        "chunk_index": idx,
                    },
                )
            ],
        )

    print(f" Ingested {len(chunks)} chunks from {file_name}.")
else:
    print(f" No text extracted from {file_name}. Skipping ingestion.")


#  Query Function ---
def query_documents(qdrant_client, query_text, top_k=3):
    """Search Qdrant for documents similar to a text query."""
    query_embedding = embedder.encode(query_text).tolist()

    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
    )

    results = []
    for hit in search_result:
        payload = hit.payload
        score = float(hit.score)
        results.append(
            {
                "text_snippet": payload.get("text", "")[:300],
                "file_type": payload.get("file_type"),
                "file_path": payload.get("filename"),
                "score": score,
                "chunk_index": payload.get("chunk_index"),
            }
        )
    return results

# ---  Run a Query ---
query = "What is the name of course?"
# Corrected function call: pass top_k as a positional argument
results = query_documents(qdrant, query, 3)

print("\n--- Query Results ---")
for r in results:
    print(f"Score: {r['score']:.3f}")
    print(f"Type: {r['file_type']}")
    print(f"Snippet: {r['text_snippet']}")
    print(f"Chunk: {r['chunk_index']}")
    print("------")

Using device: cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
disagreeing, Initiating, Summarizing and Attaining the Objective.  
Interview& Presentation Skills: Interviewer and Interviewee– in-depth perspectives. 
Before, During and After the Interview.  
Tips for Success: Types, Content, Audience Analysis, Essential Tips – Before, During and 
After, Overcoming Nervousness.  
 
4  
  
Non-Verbal Communication & Personality Development  
Importance and Elements; Body Language. Concept, Essentials, Tips, Meaning, Nature, 
Features, Stages, Models; Learning Skills; Adaptability Skills.  
 
 
5  
  
Business Etiquette & Team Work  
Concept of Teams; Building effective teams; Concept of Leadership and honing Leadership 
skills. Meaning, Nature, Features, Stages, Models; Learning Skills; Adaptability Skills.  
Reference Book  
1. Managing Soft Skills for Personality Development – edited by B.N. Ghosh, McGraw Hill India, 
2012.  
2. Effective Communication and Soft Skills, Nitin Bhatnagar

  search_result = qdrant_client.search(



--- Query Results ---
Score: 0.329
Type: pdf
Snippet: .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Database Management System  
3  
1  
-  
4  
4  
3  
MCA20-103  
Computer Organization and Architecture  

Chunk: 1
------
Score: 0.329
Type: pdf
Snippet: .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Database Management System  
3  
1  
-  
4  
4  
3  
MCA20-103  
Computer Organization and Architecture  

Chunk: 1
------
Score: 0.329
Type: pdf
Snippet: .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Database Management System  
3  
1  
-  
4  
4  
3  


# Google Flan T5 LLM Integration

In [19]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models # Import models
import torch # Import torch

# --- Initialize models ---
# Explicitly set the device during initialization as a workaround for the meta tensor error
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

# --- Initialize Qdrant ---
# Use a new in-memory client for this cell's execution
qdrant = QdrantClient(
    url="https://bc7fde05-251b-4940-b11a-683326ab9396.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.CTZmywE4q_bNllGjFY5_ILpWDulZxyvrINCL15v7LVM"
)

collection_name = "documents"
# Check if collection exists before recreating (optional, but safer)
if collection_name not in [c.name for c in qdrant.get_collections().collections]:
    qdrant.recreate_collection( # Use recreate_collection for simplicity in this example
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=embedder.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
    )


#  Query Function
def query_documents(qdrant_client, query_text, top_k=3):
    """Search Qdrant for documents similar to a text query."""
    query_embedding = embedder.encode(query_text).tolist()

    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
    )

    results = []
    for hit in search_result:
        payload = hit.payload
        score = float(hit.score)
        results.append(
            {
                "text_snippet": payload.get("text", "")[:300],
                "file_type": payload.get("file_type"),
                "file_path": payload.get("filename"),
                "score": score,
                "chunk_index": payload.get("chunk_index"),
            }
        )
    return results


#  Load a lightweight generation model ---
qa_model = pipeline("text2text-generation", model="google/flan-t5-large")

def generate_answer(question, top_k=3):
    # Step 1: Retrieve context from Qdrant
    retrieved_docs = query_documents(qdrant, question, top_k=top_k)
    context = " ".join([doc['text_snippet'] for doc in retrieved_docs])

    # Step 2: Build a prompt for the LLM
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    # Step 3: Generate answer using LLM
    output = qa_model(prompt, max_new_tokens=200, do_sample=False)
    answer = output[0]['generated_text']

    return answer, retrieved_docs

# - Test the full RAG flow ---
query = "What is the name of course?"
answer, context = generate_answer(query)

print("\n---  Generated Answer ---")
print(answer)
print("\n--- 🔎 Retrieved Context ---")
for c in context:
    print(f"• {c['text_snippet'][:200]} ...")

Using device: cuda


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  search_result = qdrant_client.search(



---  Generated Answer ---
MCA20-101 Programming in Python 3 1 - 4 4 2 MCA20-102 Relational Database Management System 3 1 - 4 4 3 MCA20-103 Computer Organization and Architecture

--- 🔎 Retrieved Context ---
• .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Databa ...
• .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Databa ...
• .  
  
Course Code  
  
Course Name  
Contact Hours / Week  
 
Credit  
L  
T  
P  
Total  
 
THEORY  
 
 
1  
MCA20-101  
Programming in Python  
3  
1  
-  
4  
4  
2  
MCA20-102  
Relational Databa ...


In [20]:
!pip install fastapi uvicorn[standard] python-multipart aiofiles pyngrok nest_asyncio




#FastAPI Building

In [3]:
!pip install fastapi uvicorn[standard] python-multipart aiofiles pyngrok nest_asyncio qdrant-client sentence-transformers transformers pytesseract PyMuPDF pillow

import nest_asyncio, uvicorn
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from datetime import datetime
from pyngrok import ngrok
import os, uuid, fitz, pytesseract
from PIL import Image
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models
from transformers import pipeline
import asyncio # Import asyncio

# --- Allow async event loop reuse in Colab ---
nest_asyncio.apply()

# --- Initialize components ---
app = FastAPI(title="Multimodal RAG API")
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
qdrant = QdrantClient(":memory:")
collection_name = "documents"

# Create collection if missing
if collection_name not in [c.name for c in qdrant.get_collections().collections]:
    qdrant.recreate_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=embedder.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
    )

# Load small LLM (Flan-T5-base)
rag_model = pipeline("text2text-generation", model="google/flan-t5-base")

# --- Helper functions ---
def extract_text(file_path, file_type):
    text = ""
    try:
        if file_type == "pdf":
            pdf = fitz.open(file_path)
            for page in pdf:
                text += page.get_text("text")
        elif file_type in ["png", "jpg", "jpeg"]:
            image = Image.open(file_path)
            text = pytesseract.image_to_string(image)
        else:
            with open(file_path, "r") as f:
                text = f.read()
    except Exception as e:
        print(f"Error during text extraction: {e}")
        text = ""
    return text

async def ingest_file(file_path, filename):
    file_type = filename.split(".")[-1].lower()
    text = extract_text(file_path, file_type)
    if not text.strip():
        print(f"Warning: No text extracted from {filename}. Cannot ingest.")
        return None

    vector = embedder.encode(text).tolist()
    payload = {
        "text": text[:5000],
        "filename": filename,
        "file_type": file_type,
        "timestamp": datetime.now().isoformat()
    }
    # Using upsert with a single point
    qdrant.upsert(collection_name=collection_name, points=[models.PointStruct(id=str(uuid.uuid4()), vector=vector, payload=payload)])
    print(f"Ingested document {filename}.")
    return {"filename": filename, "status": "ingested"}


def query_documents(query, top_k=3):
    query_vector = embedder.encode(query).tolist()
    search_result = qdrant.search(collection_name=collection_name, query_vector=query_vector, limit=top_k)
    return [{"text": hit.payload["text"], "score": hit.score, "filename": hit.payload["filename"]} for hit in search_result]

def generate_answer(question, search_results):
    """Generates an answer based on retrieved search results."""
    context = " ".join([r["text"] for r in search_results])
    if not context.strip():
        return "I cannot answer this question based on the provided documents."

    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    answer = rag_model(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
    return answer


# --- Pydantic Models for Request Body ---
from pydantic import BaseModel

class QueryRequest(BaseModel):
    query: str
    top_k: int = 5


# --- Endpoints ---
@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    file_path = file.filename
    # Save the file temporarily
    with open(file_path, "wb") as f:
        content = await file.read()
        f.write(content)

    # Process and ingest the file
    result = await ingest_file(file_path, file.filename)

    # Clean up the temporary file
    os.remove(file_path)

    if result:
        return JSONResponse({"status": "success", "message": f"Successfully ingested {result['filename']}"})
    else:
         return JSONResponse({"status": "failed", "message": f"Failed to ingest document {file.filename}. No text extracted."}, status_code=400)


@app.post("/query")
async def query_api(request: QueryRequest):
    results = query_documents(request.query, top_k=request.top_k)
    answer = generate_answer(request.query, results)
    return JSONResponse({"question": request.query, "answer": answer, "sources": results})


# --- Run with ngrok and Uvicorn ---
async def main():

    from google.colab import userdata
    try:
        ngrok_auth_token = userdata.get("NGROK_AUTH_TOKEN")
        if not ngrok_auth_token:
             raise ValueError("NGROK_AUTH_TOKEN secret not found.")
        ngrok.set_auth_token(ngrok_auth_token)
        # Create public tunnel -
        public_url = ngrok.connect(8000)
        print("Public URL:", public_url)
        ngrok_tunnel = public_url
    except Exception as e:
        print(f"Error starting ngrok tunnel: {e}")
        print("Please ensure port 8000 is not in use and your ngrok auth token is correct and set as a Colab secret named NGROK_AUTH_TOKEN.")
        ngrok_tunnel = None
        public_url = None


    # Use uvicorn.Server to run within the existing event loop
    if public_url:
        config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
        server = uvicorn.Server(config)
        print("Starting Uvicorn server on port 8000...")
        await server.serve()
    else:
        print("Skipping Uvicorn server start due to ngrok tunnel failure.")






The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  qdrant.recreate_collection(
Device set to use cuda:0


In [None]:
# --- Run with ngrok ---
public_url = ngrok.connect(8000)
print("Public URL:", public_url)

# Use uvicorn.Server to run within the existing event loop
config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
server = uvicorn.Server(config)
asyncio.run(server.serve())

Public URL: NgrokTunnel: "https://heavily-tubbier-tajuana.ngrok-free.dev" -> "http://localhost:8000"


INFO:     Started server process [6413]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     122.162.102.31:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     122.162.102.31:0 - "GET /openapi.json HTTP/1.1" 200 OK
Ingested document IIT Patna & IIIT Ranchi Joint Degree MCA syllabi.pdf.
INFO:     122.162.102.31:0 - "POST /upload HTTP/1.1" 200 OK


  search_result = qdrant.search(collection_name=collection_name, query_vector=query_vector, limit=top_k)
Token indices sequence length is longer than the specified maximum sequence length for this model (1302 > 512). Running this sequence through the model will result in indexing errors


INFO:     122.162.102.31:0 - "POST /query HTTP/1.1" 200 OK
Ingested document MCA Sem1 TimeTable.png.
INFO:     122.162.102.31:0 - "POST /upload HTTP/1.1" 200 OK


  search_result = qdrant.search(collection_name=collection_name, query_vector=query_vector, limit=top_k)


INFO:     122.162.102.31:0 - "POST /query HTTP/1.1" 200 OK
