In [1]:
!pip install -U pinecone google-generativeai PyMuPDF pdfplumber


Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
!pip install -q tqdm

In [4]:
from pinecone import Pinecone
from getpass import getpass
from uuid import uuid4

# Step 1: API key
PINECONE_API_KEY = getpass("🔐 Pinecone API Key: ")

# Step 2: Connect
pc = Pinecone(api_key=PINECONE_API_KEY)

# Step 3: Create index for server-side embedding
index_name = "sherlock"

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"multilingual-e5-large",
            "field_map":{"text": "chunk_text"}
        }
    )



🔐 Pinecone API Key: ··········


In [5]:
import os
from google.colab import files


if not os.path.exists("docs"):
    os.makedirs("docs")


uploaded = files.upload()


for filename in uploaded.keys():
    os.rename(filename, f"docs/{filename}")


Saving A Study in Scarlet.pdf to A Study in Scarlet.pdf


In [6]:
import os
from uuid import uuid4
from getpass import getpass
from pinecone import Pinecone
import PyPDF2
import google.generativeai as genai

def extract_chunks_from_pdf(file_path):
    chunks = []
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                text = text.replace("\n", "\n ")
                for para in text.split("\n\n"):
                    clean_para = para.strip()
                    if len(clean_para) > 100:
                        chunks.append(f"[Page {page_num+1}]\n{clean_para}")
    return chunks


In [7]:
records = []
for filename in os.listdir("docs"):
    filepath = os.path.join("docs", filename)
    chunks = extract_chunks_from_pdf(filepath)
    for chunk in chunks:
        records.append({
            "_id": str(uuid4()),
            "chunk_text": chunk
        })

In [25]:
with open("records1.txt", "w") as f:
    for item in records:
        f.write(f"{item}\n")


In [8]:
import time
# Target the index
dense_index = pc.Index(index_name)

# Upsert the records into a namespace

def batched_upsert(index, records, namespace=None, batch_size=90):
    from itertools import islice

    def chunks(iterable, size):
        it = iter(iterable)
        while chunk := list(islice(it, size)):
            yield chunk

    for i, batch in enumerate(chunks(records, batch_size), start=1):
        index.upsert_records("a-study-in-scarlet", batch)

        if i % 3 == 0:
          print(f"⏸️  Sleeping 61 seconds after batch {i} to respect token limit...")
          time.sleep(61)

# 🚀 Upsert in batches with namespace
batched_upsert(index=dense_index, records=records, namespace="ns1")


print(f"✅ Upserted {len(records)} records into Pinecone index '{index_name}'")

✅ Upserted 55 records into Pinecone index 'sherlock'


In [10]:
dense_index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'a-study-in-scarlet': {'vector_count': 55},
                'ns1': {'vector_count': 806}},
 'total_vector_count': 861,
 'vector_type': 'dense'}

In [28]:
import os
import time
from getpass import getpass
import google.generativeai as genai
GEMMA_API_KEY = getpass("🔐 Gemma API Key: ")

genai.configure(api_key=GEMMA_API_KEY)

model = genai.GenerativeModel("gemma-3-12b-it")



🔐 Gemma API Key: ··········


In [35]:
# Helper to extract text from search results
def extract_context_from_results(results):
    # if not results or not results.matches:
    #     return "No context found."
    return "\n\n".join([match.get("chunk_text", "") for match in results.matches])



In [38]:

#Query
while True:
    query = input("\n Ask a question (or type 'q' to quit): ").strip()
    if query.lower() == "q":
        break



    results = dense_index.search(
        namespace="ns1",
        query={"inputs": {"text": query}, "top_k": 10},
    )

    context = extract_context_from_results(results)
    prompt = f"""Answer the following question using the provided context below. If the answer is not in the context, say so.

### Question:
{query}

### Context:
{context}
"""

    response = model.generate_content(prompt)
    print("\n💡 Answer:")
    print(response.text)


 Ask a question (or type 'q' to quit): doctor where degree

Debug - Raw Results: {'result': {'hits': [{'_id': 'd81687a1-135f-4206-8ce9-c315cf6cef76',
                      '_score': 0.8162242770195007,
                      'fields': {'chunk_text': '[Page 13]\n'
                                               'A Study InScarlet\n'
                                               ' CHAPTER I.\n'
                                               ' Mr. Sherlock Holmes\n'
                                               ' In the year 1878 I took my '
                                               'degree of\n'
                                               ' Doctor of Medicine of the '
                                               'University of\n'
                                               ' London, and proceeded to '
                                               'Netley to go\n'
                                               ' through the course prescribed '
                              