In [1]:
!pip install qdrant_client PyPDF2 pypdfium2 rank_bm25 langdetect gradio
!pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
!huggingface-cli login --token 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

Collecting qdrant_client
  Downloading qdrant_client-1.13.3-py3-none-any.whl.metadata (10 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pypdfium2
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant_client)
  Downloading grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.

In [None]:
import os
import warnings
import json
import torch
import numpy as np
import PyPDF2
import pypdfium2 as pdfium
import nltk
from PIL import Image
from transformers import pipeline, MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from langdetect import detect, DetectorFactory
import gradio as gr

# Ensure reproducible language detection: fix langdetect's seed before any detect() call.
DetectorFactory.seed = 0

# Global list to record rendered image paths.
# It is appended to by process_specific_page() and cleared by search_query().
img_list = []

# Download the NLTK tokenizer data used by nltk.sent_tokenize / word_tokenize.
nltk.download('punkt_tab')
nltk.download('punkt')

# Use Qdrant persistent storage: an embedded, on-disk instance under ./qdrant_storage1.
# Every function below reads these two module-level names.
qdrant_client = QdrantClient(path="qdrant_storage1")
collection_name = "pdf_embeddings1"

# Load the sentence-embedding model.
# NOTE(review): the original comment said "on CPU", but no `device=` argument is passed
# here, so the device is whatever SentenceTransformer picks by default — confirm.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model repo;
# consider pinning a revision.
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
# Vector size used when the Qdrant collection is created.
embedding_dim = embedding_model.get_sentence_embedding_dimension()

def ensure_qdrant_collection():
    """
    Create the collection named by the module-level `collection_name` if it is missing.

    The collection is created with vectors of size `embedding_dim` and cosine
    distance. A one-line status message is printed in either case.
    """
    existing_names = {info.name for info in qdrant_client.get_collections().collections}
    if collection_name in existing_names:
        print(f"Collection '{collection_name}' already exists.")
        return

    vector_settings = VectorParams(size=embedding_dim, distance=Distance.COSINE)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
    print(f"Created collection '{collection_name}'.")

ensure_qdrant_collection()

# Cache for the translation tokenizer/model so they are loaded once per kernel
# session instead of on every query.
_DE_EN_TRANSLATOR_CACHE = {}


def _get_de_en_translator(model_name="Helsinki-NLP/opus-mt-de-en"):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _DE_EN_TRANSLATOR_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)  # no device is requested here
        _DE_EN_TRANSLATOR_CACHE[model_name] = (tokenizer, model)
    return _DE_EN_TRANSLATOR_CACHE[model_name]


def translate_german_to_english(text):
    """
    Translate a German string to English with the Helsinki-NLP opus-mt-de-en model.

    Args:
        text: The German text to translate.

    Returns:
        The decoded translation of the first generated sequence, without special tokens.
    """
    tokenizer, model = _get_de_en_translator()
    # truncation=True keeps over-long inputs within the tokenizer's maximum length
    # instead of handing the model a sequence it cannot process.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

def extract_text_from_pdf(pdf_path):
    """
    Split the text of every page of a PDF into sentences.

    Pages without extractable text and whitespace-only sentences are skipped.
    The language of each sentence is detected individually; when detection
    raises, the language is recorded as "unknown".

    Returns:
        A list of dicts with the keys "text", "page" (1-based), "sentence_id"
        (index of the sentence within its page) and "language".
    """
    entries = []
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page_no, pdf_page in enumerate(pdf_reader.pages, start=1):
        raw_text = pdf_page.extract_text()
        if not raw_text:
            continue
        for position, sentence in enumerate(nltk.sent_tokenize(raw_text)):
            if not sentence.strip():
                continue
            try:
                detected = detect(sentence)
            except Exception:
                detected = "unknown"
            entries.append(
                {
                    "text": sentence,
                    "page": page_no,
                    "sentence_id": position,
                    "language": detected,
                }
            )
    return entries

def store_pdf_embeddings(pdf_path, context_window=5):
    """
    Extracts text, generates embeddings, and stores them in Qdrant.

    One point is stored per sentence. The vector is the embedding of the sentence
    itself; the payload additionally carries "full_text", the sentence joined with
    its same-page neighbours whose sentence_id differs by at most `context_window`.
    Skips processing if points for this PDF are already stored, or if no text
    could be extracted from it.

    Args:
        pdf_path: Path to the PDF file.
        context_window: Maximum sentence_id distance for neighbouring sentences.

    Note:
        Documents are identified by file basename only, so two PDFs with the same
        file name in different folders are treated as the same document.
    """
    import uuid
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue

    doc_name = os.path.basename(pdf_path)

    # Check if embeddings for this document already exist.
    filter_condition = Filter(
        must=[FieldCondition(key="document", match=MatchValue(value=doc_name))]
    )
    count_result = qdrant_client.count(collection_name=collection_name, count_filter=filter_condition)
    if count_result.count > 0:
        print(f"Embeddings for {doc_name} already exist. Skipping.")
        return

    pdf_entries = extract_text_from_pdf(pdf_path)
    if not pdf_entries:
        # Nothing to embed (e.g. a scanned PDF without a text layer); avoid
        # encoding and upserting an empty batch.
        print(f"No extractable text found in {doc_name}. Skipping.")
        return

    texts = [entry["text"] for entry in pdf_entries]
    embeddings = embedding_model.encode(texts, batch_size=32).tolist()

    # Build a lookup for entries by page, sorted once per page rather than once
    # per sentence.
    entries_by_page = {}
    for entry in pdf_entries:
        entries_by_page.setdefault(entry["page"], []).append(entry)
    for same_page_entries in entries_by_page.values():
        same_page_entries.sort(key=lambda e: e["sentence_id"])

    points = []
    for entry, vector in zip(pdf_entries, embeddings):
        page = entry["page"]
        sid = entry["sentence_id"]
        # Retrieve neighboring sentences based on context_window.
        context_entries = [
            e for e in entries_by_page[page] if abs(e["sentence_id"] - sid) <= context_window
        ]
        full_text = " ".join(e["text"] for e in context_entries)

        uid = f"{doc_name}_{page}_{sid}"
        # Derive a deterministic UUID from the uid. The built-in hash() of a str
        # is salted per interpreter process and can be negative, so it is neither
        # reproducible across sessions nor a documented Qdrant point-ID type.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, uid))
        points.append(PointStruct(
            id=point_id,
            vector=vector,
            payload={
                "text": entry["text"],
                "full_text": full_text,
                "page": page,
                "document": doc_name,
                "language": entry.get("language", "unknown"),
                "sentence_id": sid
            }
        ))
    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"Stored embeddings for {doc_name}.")

def hybrid_search(query, top_k=3):
    """
    Performs hybrid search combining dense retrieval (from Qdrant) and sparse retrieval (using BM25).

    The query is embedded and the `top_k * 10` nearest points are fetched from
    Qdrant. Those candidates are re-scored with BM25 over their "full_text"
    payload; both score sets are divided by their maximum (when positive) and
    blended as 0.7 * dense + 0.3 * sparse. At most one result is kept per
    (document, page) pair.

    Args:
        query: The search string.
        top_k: Maximum number of results to return.

    Returns:
        A list (possibly empty) of dicts with the keys "text", "page",
        "document" and "score", best score first.
    """
    query_embedding = embedding_model.encode([query]).tolist()[0]

    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k * 10,
        with_payload=True
    )

    # Drop payload-less hits up front so candidate texts, dense scores and the
    # indices used below all refer to the same list.
    hits = [hit for hit in search_results if hit.payload]
    if not hits:
        # Empty collection or no usable hits: nothing to rank.
        return []

    candidate_texts = [hit.payload.get("full_text", hit.payload.get("text", "")) for hit in hits]
    tokenized_docs = [word_tokenize(text.lower()) for text in candidate_texts]
    if any(tokenized_docs):
        tokenized_query = word_tokenize(query.lower())
        sparse_scores = np.array(BM25Okapi(tokenized_docs).get_scores(tokenized_query))
    else:
        # No candidate has any token, so there is nothing for BM25 to score.
        sparse_scores = np.zeros(len(hits))

    dense_scores = np.array([hit.score for hit in hits])
    max_dense = dense_scores.max()
    if max_dense > 0:
        dense_scores = dense_scores / max_dense

    max_sparse = sparse_scores.max()
    if max_sparse > 0:
        sparse_scores = sparse_scores / max_sparse

    hybrid_scores = 0.7 * dense_scores + 0.3 * sparse_scores
    sorted_indices = np.argsort(hybrid_scores)[::-1]

    best_results = []
    seen = set()
    for i in sorted_indices:
        hit = hits[i]
        doc_page_pair = (hit.payload["document"], hit.payload["page"])
        if doc_page_pair in seen:
            continue
        seen.add(doc_page_pair)
        best_results.append({
            "text": hit.payload.get("full_text", hit.payload.get("text", "")),
            "page": hit.payload["page"],
            "document": hit.payload["document"],
            # Plain float keeps the result JSON-friendly.
            "score": float(hybrid_scores[i])
        })
        if len(best_results) >= top_k:
            break
    return best_results

def hybrid_search_dual(query, top_k=5):
    """
    Search with the original query and, for German queries, its English translation.

    If the query is detected as German it is translated to English and hybrid
    search runs on both variants; otherwise a single search runs. Results are
    merged per (document, page): a page returned by both searches gets the mean
    of its two scores. The merged list is sorted by score, best first.

    Args:
        query: The search string.
        top_k: Results requested per search and maximum number returned.

    Returns:
        Up to `top_k` result dicts as produced by hybrid_search().
    """
    try:
        query_lang = detect(query)
    except Exception:
        # Language detection is best-effort; fall back to searching as-is.
        query_lang = None

    queries = [query]
    if query_lang == "de":
        query_en = translate_german_to_english(query)
        # Searching the same string twice would only repeat the work and average
        # each score with itself, so the second pass is skipped in that case.
        if query_en != query:
            queries.append(query_en)

    combined = {}
    for current_query in queries:
        for res in hybrid_search(current_query, top_k=top_k):
            key = (res["document"], res["page"])
            if key in combined:
                combined[key]["score"] = (combined[key]["score"] + res["score"]) / 2
            else:
                combined[key] = res

    combined_results = sorted(combined.values(), key=lambda x: x["score"], reverse=True)
    return combined_results[:top_k]

# PDF Rendering Functions
def get_total_pages(pdf_path):
    """Count the pages of the PDF at `pdf_path` using PyPDF2."""
    page_collection = PyPDF2.PdfReader(pdf_path).pages
    return len(page_collection)

def render_page(pdf_path, page_number, scale=1.0, output_path=None):
    """
    Render a specified page using pypdfium2.

    Args:
        pdf_path: Path to the PDF file.
        page_number: Zero-based page index passed to `PdfDocument.get_page`.
        scale: Render scale factor passed to `page.render`.
        output_path: If given, the rendered image is also saved to this path.

    Returns:
        The rendered page as a PIL image.

    The page and document handles are closed even when rendering or saving
    raises, so a failure does not leak them.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page = pdf.get_page(page_number)
        try:
            bitmap = page.render(scale=scale)
            pil_image = bitmap.to_pil()
            if output_path:
                pil_image.save(output_path)
                print(f"Saved rendered page {page_number + 1} to {output_path}")
            return pil_image
        finally:
            page.close()
    finally:
        pdf.close()

def process_specific_page(pdf_path, page_number, scale=7.5, output_dir="rendered_page"):
    """
    Render one page of a PDF to a PNG file and remember where it was written.

    `page_number` is zero-based. When it is outside the document a message is
    printed and None is returned. Otherwise `output_dir` is created if needed,
    the absolute image path (with forward slashes) is added to the global
    `img_list` unless already present, and the page is rendered through
    render_page(), whose PIL image is returned.
    """
    total_pages = get_total_pages(pdf_path)
    if not (0 <= page_number < total_pages):
        print(f"Page number {page_number + 1} out of range. Total pages: {total_pages}")
        return None

    os.makedirs(output_dir, exist_ok=True)
    doc_name, _extension = os.path.splitext(os.path.basename(pdf_path))
    output_path = os.path.join(output_dir, f"{doc_name}_page_{page_number + 1}.png")

    # Record the absolute, forward-slash form of the path exactly once.
    recorded_path = os.path.abspath(output_path).replace('\\', '/')
    if recorded_path not in img_list:
        img_list.append(recorded_path)

    return render_page(pdf_path, page_number, scale=scale, output_path=output_path)

def get_pdf_paths(root_folder):
    """
    Walk `root_folder` and collect the path of every file ending in ".pdf".

    The extension check is case-insensitive. Paths are returned in the order
    os.walk yields them; a folder that cannot be walked yields an empty list.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(root_folder)
        for filename in filenames
        if filename.lower().endswith(".pdf")
    ]

# Setup the Gemma pipeline (image + text in, text out). The model is placed on the
# CUDA device in bfloat16, so this statement requires a GPU runtime.
pipe = pipeline("image-text-to-text", model="google/gemma-3-4b-it", device="cuda", torch_dtype=torch.bfloat16)

# Set the folder containing your PDFs (change as needed).
# NOTE(review): this is a hard-coded absolute path; adjust it when running elsewhere.
root_directory = "/content/paper"
# `pdf_files` is also read later by search_query() to map a document name back to its path.
pdf_files = get_pdf_paths(root_directory)
# Index every PDF; store_pdf_embeddings() skips documents that are already stored.
for pdf_file in pdf_files:
    store_pdf_embeddings(pdf_file, context_window=5)

# --- Gradio UI Functions ---

def search_query(query):
    """
    Gradio callback for the search step.

    Clears the global `img_list`, runs hybrid_search_dual() for five results and
    renders the page of every result whose document is found in `pdf_files`.

    Returns:
        A tuple of (human-readable summary string, list of PIL page images,
        JSON string of the result dicts). When nothing is found the summary is
        "No results found." and the image list is empty.
    """
    img_list.clear()
    best_results = hybrid_search_dual(query, top_k=5)
    if not best_results:
        return "No results found.", [], json.dumps(best_results)

    # One four-line section per result, each closed by a dashed rule.
    summary_sections = []
    for res in best_results:
        summary_sections.append(
            f"Document: {res['document']}, Page: {res['page']}\n"
            f"Score: {res['score']:.4f}\n"
            f"Text: {res['text']}\n"
            + "-"*50 + "\n"
        )
    results_text = "".join(summary_sections)

    page_images = []
    for res in best_results:
        # Map the stored document name back to the first PDF path with that basename.
        source_pdf = next(
            (candidate for candidate in pdf_files if os.path.basename(candidate) == res["document"]),
            None,
        )
        if not source_pdf:
            continue
        rendered = process_specific_page(source_pdf, res["page"] - 1, scale=10.0)
        if rendered:
            page_images.append(rendered)

    return results_text, page_images, json.dumps(best_results)

def generate_text_response(additional_prompt, best_results_json, selected_image):
    """
    Build a chat conversation from the prompt, the retrieved texts and an optional
    image, run it through the Gemma pipeline and return the assistant's reply.

    Args:
        additional_prompt: Free-text instruction from the user (may be empty).
        best_results_json: JSON string of search results as produced by
            search_query(); an empty value is treated as "no results".
        selected_image: Path of a rendered page image to attach, or an empty value.

    Returns:
        The concatenated text of all assistant messages in the pipeline output.
    """
    # The hidden textbox is empty until a search has been run.
    best_results = json.loads(best_results_json) if best_results_json else []
    # Combine the retrieved text from the search results.
    combined_text = "\n".join(res["text"] for res in best_results)
    if additional_prompt:
        combined_prompt = additional_prompt + "\n" + combined_text
    else:
        combined_prompt = combined_text
    system_prompt_text = "You are an advanced technical agent."

    # The retrieved text is sent in both cases; the image, when selected, is
    # added in front of it rather than replacing it.
    user_content = []
    if selected_image:
        user_content.append({"type": "image", "url": selected_image})
    user_content.append({"type": "text", "text": combined_prompt})

    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt_text}]
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]
    output = pipe(text=conversation, max_new_tokens=1800)
    generated = output[0]['generated_text']
    if isinstance(generated, str):
        return generated

    # Chat-style output is a list of messages; the reply lives under "content"
    # of the assistant message(s), either as a string or as a list of parts.
    answer_parts = []
    for entry in generated:
        if entry.get('role') != 'assistant':
            continue
        content = entry.get('content')
        if isinstance(content, str):
            answer_parts.append(content)
        elif isinstance(content, list):
            answer_parts.extend(
                part.get("text", "")
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
    return "".join(answer_parts)

# --- Build Gradio Interface (Sequential Flow) ---

# Two-step UI: (1) search and show retrieved pages, (2) ask the LLM about them.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Search and LLM Query Interface")
    gr.Markdown("### Step 1: Enter your search query")
    with gr.Row():
        query_input = gr.Textbox(label="Search Query", placeholder="Enter search query here")
        search_btn = gr.Button("Search")
    # Receives the text summary returned by search_query().
    search_results = gr.Textbox(label="Search Results", lines=10)
    gr.Markdown("### Retrieved Page Images")
    # Receives the list of PIL images returned by search_query().
    gallery = gr.Gallery(label="Rendered Images", height="auto")
    # Hidden component to store best results JSON between steps.
    hidden_best_results = gr.Textbox(visible=False)

    gr.Markdown("### Step 2: Generate Text Response")
    # NOTE(review): the instruction below says the file path is "displayed below", but no
    # component in this layout shows the rendered image paths — confirm the intended flow.
    gr.Markdown("Select an image by copying its file path (displayed below) and paste it into the box.")
    selected_image_input = gr.Textbox(label="Selected Image (file path)", placeholder="Paste selected image path here")
    additional_prompt_input = gr.Textbox(label="Additional Prompt", placeholder="Enter additional text prompt here")
    generate_btn = gr.Button("Generate Response")
    generated_text = gr.Textbox(label="Generated Text", lines=10)

    # When search is clicked, update search results, gallery and store best_results.
    search_btn.click(
        fn=search_query,
        inputs=query_input,
        outputs=[search_results, gallery, hidden_best_results]
    )

    # When generate is clicked, generate text response using the additional prompt and selected image.
    # The input order matches generate_text_response's parameter order.
    generate_btn.click(
        fn=generate_text_response,
        inputs=[additional_prompt_input, hidden_best_results, selected_image_input],
        outputs=generated_text
    )

# Start the Gradio app.
demo.launch()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Created collection 'pdf_embeddings1'.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Device set to use cuda


Stored embeddings for Scientometric-visualization-of-data-on-artificial-intelligence-_2024_Science.pdf.
Stored embeddings for Quantum simulation of battery materials using ionic pseudopotentials.pdf.
Stored embeddings for An-overview-of-skew-distributions-in-model-based-clustering--s_2024_Science-.pdf.
Stored embeddings for MobileQuant.pdf.
Stored embeddings for Short-beam-in-plane-bending-behaviour-of-a-hybrid-CFRP-titani_2024_Science-T.pdf.
Stored embeddings for Structural-health-monitoring-of-reinforced-concrete-beams-und_2024_Science-T.pdf.
Stored embeddings for Quantum Embedding Search for QuantumML.pdf.
Stored embeddings for Characterization-of-high-protein-anisotropic-structures-using-Rh_2024_Scienc.pdf.
Stored embeddings for Effect-of-porosities-on-mechanical-behavior-and-structural-integri_2024_Scie.pdf.
Stored embeddings for Fracture-cracks-localization-in-machined-H13-tool-steel-using_2024_Science-T.pdf.
Stored embeddings for qft.pdf.
Stored embeddings for Rapid-assessment-s



#Final

In [2]:
import os
import warnings
import json
import torch
import numpy as np
import PyPDF2
import pypdfium2 as pdfium
import nltk
from PIL import Image
from transformers import pipeline, MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from langdetect import detect, DetectorFactory
import gradio as gr

# Ensure reproducible language detection: fix langdetect's seed before any detect() call.
DetectorFactory.seed = 0

# Global list to record rendered image paths.
# It is appended to by process_specific_page() and cleared by search_query().
img_list = []

# Download the NLTK tokenizer data used by nltk.sent_tokenize / word_tokenize.
nltk.download('punkt_tab')
nltk.download('punkt')

# Use Qdrant persistent storage: an embedded, on-disk instance under ./qdrant_storage1.
# Every function below reads these two module-level names.
qdrant_client = QdrantClient(path="qdrant_storage1")
collection_name = "pdf_embeddings1"

# Load the sentence-embedding model.
# NOTE(review): the original comment said "on CPU", but no `device=` argument is passed
# here, so the device is whatever SentenceTransformer picks by default — confirm.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model repo;
# consider pinning a revision.
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
# Vector size used when the Qdrant collection is created.
embedding_dim = embedding_model.get_sentence_embedding_dimension()

def ensure_qdrant_collection():
    """
    Create the collection named by the module-level `collection_name` if it is missing.

    The collection is created with vectors of size `embedding_dim` and cosine
    distance. A one-line status message is printed in either case.
    """
    existing_names = {info.name for info in qdrant_client.get_collections().collections}
    if collection_name in existing_names:
        print(f"Collection '{collection_name}' already exists.")
        return

    vector_settings = VectorParams(size=embedding_dim, distance=Distance.COSINE)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
    print(f"Created collection '{collection_name}'.")

ensure_qdrant_collection()

# Cache for the translation tokenizer/model so they are loaded once per kernel
# session instead of on every query.
_DE_EN_TRANSLATOR_CACHE = {}


def _get_de_en_translator(model_name="Helsinki-NLP/opus-mt-de-en"):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _DE_EN_TRANSLATOR_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)  # no device is requested here
        _DE_EN_TRANSLATOR_CACHE[model_name] = (tokenizer, model)
    return _DE_EN_TRANSLATOR_CACHE[model_name]


def translate_german_to_english(text):
    """
    Translate a German string to English with the Helsinki-NLP opus-mt-de-en model.

    Args:
        text: The German text to translate.

    Returns:
        The decoded translation of the first generated sequence, without special tokens.
    """
    tokenizer, model = _get_de_en_translator()
    # truncation=True keeps over-long inputs within the tokenizer's maximum length
    # instead of handing the model a sequence it cannot process.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

def extract_text_from_pdf(pdf_path):
    """
    Split the text of every page of a PDF into sentences.

    Pages without extractable text and whitespace-only sentences are skipped.
    The language of each sentence is detected individually; when detection
    raises, the language is recorded as "unknown".

    Returns:
        A list of dicts with the keys "text", "page" (1-based), "sentence_id"
        (index of the sentence within its page) and "language".
    """
    entries = []
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page_no, pdf_page in enumerate(pdf_reader.pages, start=1):
        raw_text = pdf_page.extract_text()
        if not raw_text:
            continue
        for position, sentence in enumerate(nltk.sent_tokenize(raw_text)):
            if not sentence.strip():
                continue
            try:
                detected = detect(sentence)
            except Exception:
                detected = "unknown"
            entries.append(
                {
                    "text": sentence,
                    "page": page_no,
                    "sentence_id": position,
                    "language": detected,
                }
            )
    return entries

def store_pdf_embeddings(pdf_path, context_window=5):
    """
    Extracts text, generates embeddings, and stores them in Qdrant.

    One point is stored per sentence. The vector is the embedding of the sentence
    itself; the payload additionally carries "full_text", the sentence joined with
    its same-page neighbours whose sentence_id differs by at most `context_window`.
    Skips processing if points for this PDF are already stored, or if no text
    could be extracted from it.

    Args:
        pdf_path: Path to the PDF file.
        context_window: Maximum sentence_id distance for neighbouring sentences.

    Note:
        Documents are identified by file basename only, so two PDFs with the same
        file name in different folders are treated as the same document.
    """
    import uuid
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue

    doc_name = os.path.basename(pdf_path)

    # Check if embeddings for this document already exist.
    filter_condition = Filter(
        must=[FieldCondition(key="document", match=MatchValue(value=doc_name))]
    )
    count_result = qdrant_client.count(collection_name=collection_name, count_filter=filter_condition)
    if count_result.count > 0:
        print(f"Embeddings for {doc_name} already exist. Skipping.")
        return

    pdf_entries = extract_text_from_pdf(pdf_path)
    if not pdf_entries:
        # Nothing to embed (e.g. a scanned PDF without a text layer); avoid
        # encoding and upserting an empty batch.
        print(f"No extractable text found in {doc_name}. Skipping.")
        return

    texts = [entry["text"] for entry in pdf_entries]
    embeddings = embedding_model.encode(texts, batch_size=32).tolist()

    # Build a lookup for entries by page, sorted once per page rather than once
    # per sentence.
    entries_by_page = {}
    for entry in pdf_entries:
        entries_by_page.setdefault(entry["page"], []).append(entry)
    for same_page_entries in entries_by_page.values():
        same_page_entries.sort(key=lambda e: e["sentence_id"])

    points = []
    for entry, vector in zip(pdf_entries, embeddings):
        page = entry["page"]
        sid = entry["sentence_id"]
        # Retrieve neighboring sentences based on context_window.
        context_entries = [
            e for e in entries_by_page[page] if abs(e["sentence_id"] - sid) <= context_window
        ]
        full_text = " ".join(e["text"] for e in context_entries)

        uid = f"{doc_name}_{page}_{sid}"
        # Derive a deterministic UUID from the uid. The built-in hash() of a str
        # is salted per interpreter process and can be negative, so it is neither
        # reproducible across sessions nor a documented Qdrant point-ID type.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, uid))
        points.append(PointStruct(
            id=point_id,
            vector=vector,
            payload={
                "text": entry["text"],
                "full_text": full_text,
                "page": page,
                "document": doc_name,
                "language": entry.get("language", "unknown"),
                "sentence_id": sid
            }
        ))
    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"Stored embeddings for {doc_name}.")

def hybrid_search(query, top_k=3):
    """
    Performs hybrid search combining dense retrieval (from Qdrant) and sparse retrieval (using BM25).

    The query is embedded and the `top_k * 10` nearest points are fetched from
    Qdrant. Those candidates are re-scored with BM25 over their "full_text"
    payload; both score sets are divided by their maximum (when positive) and
    blended as 0.7 * dense + 0.3 * sparse. At most one result is kept per
    (document, page) pair.

    Args:
        query: The search string.
        top_k: Maximum number of results to return.

    Returns:
        A list (possibly empty) of dicts with the keys "text", "page",
        "document" and "score", best score first.
    """
    query_embedding = embedding_model.encode([query]).tolist()[0]

    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k * 10,
        with_payload=True
    )

    # Drop payload-less hits up front so candidate texts, dense scores and the
    # indices used below all refer to the same list.
    hits = [hit for hit in search_results if hit.payload]
    if not hits:
        # Empty collection or no usable hits: nothing to rank.
        return []

    candidate_texts = [hit.payload.get("full_text", hit.payload.get("text", "")) for hit in hits]
    tokenized_docs = [word_tokenize(text.lower()) for text in candidate_texts]
    if any(tokenized_docs):
        tokenized_query = word_tokenize(query.lower())
        sparse_scores = np.array(BM25Okapi(tokenized_docs).get_scores(tokenized_query))
    else:
        # No candidate has any token, so there is nothing for BM25 to score.
        sparse_scores = np.zeros(len(hits))

    dense_scores = np.array([hit.score for hit in hits])
    max_dense = dense_scores.max()
    if max_dense > 0:
        dense_scores = dense_scores / max_dense

    max_sparse = sparse_scores.max()
    if max_sparse > 0:
        sparse_scores = sparse_scores / max_sparse

    hybrid_scores = 0.7 * dense_scores + 0.3 * sparse_scores
    sorted_indices = np.argsort(hybrid_scores)[::-1]

    best_results = []
    seen = set()
    for i in sorted_indices:
        hit = hits[i]
        doc_page_pair = (hit.payload["document"], hit.payload["page"])
        if doc_page_pair in seen:
            continue
        seen.add(doc_page_pair)
        best_results.append({
            "text": hit.payload.get("full_text", hit.payload.get("text", "")),
            "page": hit.payload["page"],
            "document": hit.payload["document"],
            # Plain float keeps the result JSON-friendly.
            "score": float(hybrid_scores[i])
        })
        if len(best_results) >= top_k:
            break
    return best_results

def hybrid_search_dual(query, top_k=5):
    """
    Search with the original query and, for German queries, its English translation.

    If the query is detected as German it is translated to English and hybrid
    search runs on both variants; otherwise a single search runs. Results are
    merged per (document, page): a page returned by both searches gets the mean
    of its two scores. The merged list is sorted by score, best first.

    Args:
        query: The search string.
        top_k: Results requested per search and maximum number returned.

    Returns:
        Up to `top_k` result dicts as produced by hybrid_search().
    """
    try:
        query_lang = detect(query)
    except Exception:
        # Language detection is best-effort; fall back to searching as-is.
        query_lang = None

    queries = [query]
    if query_lang == "de":
        query_en = translate_german_to_english(query)
        # Searching the same string twice would only repeat the work and average
        # each score with itself, so the second pass is skipped in that case.
        if query_en != query:
            queries.append(query_en)

    combined = {}
    for current_query in queries:
        for res in hybrid_search(current_query, top_k=top_k):
            key = (res["document"], res["page"])
            if key in combined:
                combined[key]["score"] = (combined[key]["score"] + res["score"]) / 2
            else:
                combined[key] = res

    combined_results = sorted(combined.values(), key=lambda x: x["score"], reverse=True)
    return combined_results[:top_k]

# PDF Rendering Functions
def get_total_pages(pdf_path):
    """Count the pages of the PDF at `pdf_path` using PyPDF2."""
    page_collection = PyPDF2.PdfReader(pdf_path).pages
    return len(page_collection)

def render_page(pdf_path, page_number, scale=1.0, output_path=None):
    """
    Render a specified page using pypdfium2.

    Args:
        pdf_path: Path to the PDF file.
        page_number: Zero-based page index passed to `PdfDocument.get_page`.
        scale: Render scale factor passed to `page.render`.
        output_path: If given, the rendered image is also saved to this path.

    Returns:
        The rendered page as a PIL image.

    The page and document handles are closed even when rendering or saving
    raises, so a failure does not leak them.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page = pdf.get_page(page_number)
        try:
            bitmap = page.render(scale=scale)
            pil_image = bitmap.to_pil()
            if output_path:
                pil_image.save(output_path)
                print(f"Saved rendered page {page_number + 1} to {output_path}")
            return pil_image
        finally:
            page.close()
    finally:
        pdf.close()

def process_specific_page(pdf_path, page_number, scale=7.5, output_dir="rendered_page"):
    """
    Render one page of a PDF to a PNG file and remember where it was written.

    `page_number` is zero-based. When it is outside the document a message is
    printed and None is returned. Otherwise `output_dir` is created if needed,
    the absolute image path (with forward slashes) is added to the global
    `img_list` unless already present, and the page is rendered through
    render_page(), whose PIL image is returned.
    """
    total_pages = get_total_pages(pdf_path)
    if not (0 <= page_number < total_pages):
        print(f"Page number {page_number + 1} out of range. Total pages: {total_pages}")
        return None

    os.makedirs(output_dir, exist_ok=True)
    doc_name, _extension = os.path.splitext(os.path.basename(pdf_path))
    output_path = os.path.join(output_dir, f"{doc_name}_page_{page_number + 1}.png")

    # Record the absolute, forward-slash form of the path exactly once.
    recorded_path = os.path.abspath(output_path).replace('\\', '/')
    if recorded_path not in img_list:
        img_list.append(recorded_path)

    return render_page(pdf_path, page_number, scale=scale, output_path=output_path)

def get_rendered_image_paths(rendered_dir="rendered_page"):
    """
    Walk `rendered_dir` and collect the path of every file ending in ".png".

    The extension check is case-insensitive. Paths are returned in the order
    os.walk yields them; a directory that cannot be walked yields an empty list.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(rendered_dir)
        for filename in filenames
        if filename.lower().endswith(".png")
    ]

def get_pdf_paths(root_folder):
    """
    Walk `root_folder` and collect the path of every file ending in ".pdf".

    The extension check is case-insensitive. Paths are returned in the order
    os.walk yields them; a folder that cannot be walked yields an empty list.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(root_folder)
        for filename in filenames
        if filename.lower().endswith(".pdf")
    ]

# Setup the Gemma pipeline (image + text in, text out). The model is placed on the
# CUDA device in bfloat16, so this statement requires a GPU runtime.
pipe = pipeline("image-text-to-text", model="google/gemma-3-4b-it", device="cuda", torch_dtype=torch.bfloat16)

# Set the folder containing your PDFs (change as needed).
# NOTE(review): this is a hard-coded absolute path; adjust it when running elsewhere.
root_directory = "/content/paper"
# `pdf_files` is also read later by search_query() to map a document name back to its path.
pdf_files = get_pdf_paths(root_directory)
# Index every PDF; store_pdf_embeddings() skips documents that are already stored.
for pdf_file in pdf_files:
    store_pdf_embeddings(pdf_file, context_window=5)

# --- Helper Function to Extract Assistant's Text Response ---
def extract_assistant_text(generated_text_list):
    """
    Extracts and concatenates text from the assistant role in the generated text list.

    Args:
        generated_text_list: A list of chat messages (dicts with "role" and
            "content"), or a plain string, which is returned unchanged.

    Returns:
        The text of all assistant messages joined together, in order. A message's
        content may be a string or a list of parts; for a list, the "text" of
        every part whose "type" is "text" is used. Returns "" when there is no
        assistant text.
    """
    if isinstance(generated_text_list, str):
        return generated_text_list

    collected = []
    for message in generated_text_list:
        if message.get("role") != "assistant":
            continue
        content = message.get("content")
        if isinstance(content, str):
            collected.append(content)
        elif isinstance(content, list):
            # Structured content: keep only the textual parts.
            collected.extend(
                part.get("text", "")
                for part in content
                if isinstance(part, dict) and part.get("type") == "text"
            )
        # A missing/None content contributes nothing instead of being returned as None.
    return "".join(collected)
# --- Gradio UI Functions ---

def search_query(query):
    """
    Run the dual-hybrid search for ``query`` and prepare everything the UI shows.

    The global img_list is emptied first, so it only ever describes the pages
    rendered for the latest search. For each hit whose document name matches
    the base name of a path in pdf_files, the corresponding page is rendered.

    Returns a 3-tuple: a human-readable summary string, the list of rendered
    page images, and the search hits serialised as JSON (kept in a hidden
    output so the generation step can reuse them).
    """
    global img_list
    img_list.clear()

    hits = hybrid_search_dual(query, top_k=5)
    if not hits:
        return "No results found.", [], json.dumps(hits)

    # One formatted entry per hit, each terminated by a dashed separator line.
    summary_parts = []
    for hit in hits:
        summary_parts.append(f"Document: {hit['document']}, Page: {hit['page']}\n")
        summary_parts.append(f"Score: {hit['score']:.4f}\n")
        summary_parts.append(f"Text: {hit['text']}\n")
        summary_parts.append("-"*50 + "\n")
    results_text = "".join(summary_parts)

    images = []
    for hit in hits:
        # Locate the first known PDF whose file name equals the hit's document.
        matching_pdf = None
        for candidate in pdf_files:
            if os.path.basename(candidate) == hit["document"]:
                matching_pdf = candidate
                break
        if not matching_pdf:
            continue
        # Result pages are 1-based; the renderer expects a 0-based page index.
        rendered = process_specific_page(matching_pdf, hit["page"] - 1, scale=10.0)
        if rendered:
            images.append(rendered)

    # Return the best results JSON in a hidden output so it can be used later.
    return results_text, images, json.dumps(hits)

def generate_text_response(additional_prompt, best_results_json, selected_index):
    """
    Ask the Gemma pipeline for an answer based on the retrieved text and,
    optionally, one rendered page image.

    Args:
        additional_prompt: Free-form user instruction; may be empty or None.
        best_results_json: JSON string produced by search_query (a list of
            result dicts with a "text" key). An empty value or JSON null is
            treated as "no retrieved results".
        selected_index: Position in the global img_list of the image to send.
            A value that cannot be converted to an integer falls back to 0;
            an out-of-range index means no image is attached.

    Returns:
        str: Only the assistant's text response, or a short notice when there
        is neither text nor an image to send to the model.
    """
    # The hidden textbox is empty until a search has run, and a search with no
    # hits may serialise to "null"; both mean there is no retrieved text.
    has_payload = bool(best_results_json and best_results_json.strip())
    best_results = (json.loads(best_results_json) if has_payload else None) or []

    # Combine the retrieved text from the search results.
    combined_text = "\n".join(res["text"] for res in best_results)
    if additional_prompt:
        combined_prompt = additional_prompt + "\n" + combined_text
    else:
        combined_prompt = combined_text
    system_prompt_text = "using the retrieved text and image only. Respond in English.\nYou are an advanced technical agent."

    # Convert the provided index to an integer and select the image from the global list.
    try:
        idx = int(selected_index)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN or infinity: default to the first image.
        idx = 0
    selected_image = img_list[idx] if 0 <= idx < len(img_list) else ""

    if not selected_image and not combined_prompt.strip():
        return "Nothing to send: run a search or enter a prompt first."

    # Build the user turn. The retrieved text is always included so that the
    # model sees it alongside the image, as the system prompt promises.
    user_content = []
    if selected_image:
        user_content.append({"type": "image", "url": selected_image})
    user_content.append({"type": "text", "text": combined_prompt})

    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt_text}],
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]

    # Call the Gemma pipeline
    output = pipe(text=conversation, max_new_tokens=1800)
    return extract_assistant_text(output[0]['generated_text'])

# --- Build Gradio Interface (Sequential Flow) ---
# Components are declared in the order they should appear on the page:
# step 1 (search box, results text, image gallery) followed by step 2
# (image index, extra prompt, generated answer).
with gr.Blocks() as demo:
    gr.Markdown("# PDF Search and LLM Query Interface")
    gr.Markdown("### Step 1: Enter your search query")
    # Query box and its button share one row.
    with gr.Row():
        query_input = gr.Textbox(label="Search Query", placeholder="Enter search query here")
        search_btn = gr.Button("Search")
    # Receives the formatted summary string returned by search_query.
    search_results = gr.Textbox(label="Search Results", lines=10)
    gr.Markdown("### Retrieved Page Images")
    # Receives the list of rendered page images returned by search_query.
    gallery = gr.Gallery(label="Rendered Images", height="auto")

    gr.Markdown("### Step 2: Generate Text Response")
    gr.Markdown("Enter an additional prompt and select the rendered image by entering its index (starting at 0).")
    # Instead of a dropdown, we now use a number input for the image index.
    # The value is passed to generate_text_response, which converts it to an int
    # and uses it to index the global img_list.
    selected_image_index = gr.Number(label="Image Index", value=0)
    additional_prompt_input = gr.Textbox(label="Additional Prompt", placeholder="Enter additional text prompt here")
    # Hidden textbox to store the best_results JSON from the search query.
    # It is not visible to the user; it only carries data from step 1 to step 2.
    hidden_best_results = gr.Textbox(label="Hidden Best Results", visible=False)
    generate_btn = gr.Button("Generate Response")
    generated_text = gr.Textbox(label="Generated Text", lines=10)

    # When search is clicked, update search results, gallery, and the hidden best_results.
    # The three outputs correspond, in order, to the 3-tuple returned by search_query.
    search_btn.click(
        fn=search_query,
        inputs=query_input,
        outputs=[search_results, gallery, hidden_best_results]
    )

    # When generate is clicked, generate text response using the additional prompt,
    # the hidden best_results JSON, and the image index.
    # The inputs are listed in the same order as generate_text_response's parameters.
    generate_btn.click(
        fn=generate_text_response,
        inputs=[additional_prompt_input, hidden_best_results, selected_image_index],
        outputs=generated_text
    )

# Start the Gradio app for the interface defined above.
demo.launch()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/103k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Created collection 'pdf_embeddings1'.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Device set to use cuda


Stored embeddings for Quantum simulation of battery materials using ionic pseudopotentials.pdf.
Stored embeddings for Quantum Embedding Search for QuantumML.pdf.
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://26e11ac1d297c32f93.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'generate_text_response' is not defined