In [1]:
%pip install --upgrade pymupdf langchain openai pinecone-client tiktoken python-dotenv


Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/ed/5c/5c0be747261e1f8129b875fa3bfea736bc5fe17652f9d5e15ca118571b6f/langchain-0.3.25-py3-none-any.whl.metadata
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/3c/4c/3889bc332a6c743751eb78a4bada5761e50a8a847ff0e46c1bd23ce12362/openai-1.78.1-py3-none-any.whl.metadata
  Downloading openai-1.78.1-py3-none-any.whl.metadata (25 kB)
Collecting pinecone-client
  Obtaining dependency information for pinecone-client from https://files.pythonhosted.org/packages/5a/e4/7780cd631dc6dad0172a245e958b41b28a70779594c0790fa08b952aa97f/pinecone_client-6.0.0-py3-none-any.whl.metadata
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting python-dotenv
  Obtaining dependency information for python-dotenv from https://files.pythonhosted.org/pack

In [3]:
import os
from dotenv import load_dotenv
load_dotenv()                           # expects .env in the repo root

OPENAI_API_KEY  = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV     = os.getenv("PINECONE_ENV")  # e.g. "gcp-starter"


In [4]:
from pathlib import Path
import fitz  # PyMuPDF

def load_pdf(path: Path) -> str:
    doc = fitz.open(path)
    text = []
    for page in doc:
        text.append(page.get_text("text"))
    return "\n".join(text)

raw_text = load_pdf(Path("/Users/rajeev/Downloads/doc2.pdf"))
print(raw_text[:1000])   # sanity‑check first 1 000 chars


OPERATION AND
MAINTENANCE MANUAL
Bench Lathe
Art. T999/230V – T999/230V3A
Art. T999/400V - T999/400V3A
TRANSLATION OF THE ORIGINAL INSTRUCTIONS
fervi.com

MACHINES AND
ACCESSORIES
Page 2 of 84
PREFACE
Please ensure you have read this manual before operation
TRANSLATION OF THE ORIGINAL INSTRUCTIONS
It is compulsory to read this instruction manual before starting operation. The guarantee of
smooth operation and full performance of the machine is highly dependent on the application
of all the instructions contained in this manual.
Operator qualifications
The workers responsible for the use of this machine must have all the necessary
information and instruction and should be given adequate training in relation to safety
regarding:
a)
Conditions of use for the equipment;
b)
Foreseeable abnormal situations, pursuant to Article 73 of Legislative Decree
81/08.
We guarantee the Machine complies with the specifications and technical instructions
described in the Manual on the date of issuance an

In [7]:
%pip install tabula-py jpype1

Collecting tabula-py
  Obtaining dependency information for tabula-py from https://files.pythonhosted.org/packages/2f/80/10bc6f303054d1a06eb8628f90e5997f4b1272956a477230f3fa95637c28/tabula_py-2.10.0-py3-none-any.whl.metadata
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting jpype1
  Obtaining dependency information for jpype1 from https://files.pythonhosted.org/packages/35/a0/638186a75026a02286041e4a0449b1dff799a3914dc1c0716ef9b9367b73/jpype1-1.5.2-cp311-cp311-macosx_10_9_universal2.whl.metadata
  Downloading jpype1-1.5.2-cp311-cp311-macosx_10_9_universal2.whl.metadata (4.9 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jpype1-1.5.2-cp311-cp311-macosx_10_9_universal2.whl (584 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.5/584.5 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00

In [11]:
import os, base64
from pathlib import Path
from tqdm import tqdm

import fitz                         # PyMuPDF 2.0+
import tabula                       # needs Java;  pip install tabula-py jpype1
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [5]:
def create_directories(base_dir: str = "data"):
    for leaf in ["images", "text", "tables", "page_images"]:
        os.makedirs(Path(base_dir) / leaf, exist_ok=True)


In [6]:
def process_tables(filepath: str, doc, page_num: int, base_dir: str, items: list):
    """Extract all tables on a page with Tabula → save as txt."""
    try:
        tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
        if not tables:
            return
        for t_idx, df in enumerate(tables):
            table_text = "\n".join([" | ".join(map(str, row)) for row in df.values])
            fname = Path(base_dir) / "tables" / f"{Path(filepath).stem}_table_{page_num}_{t_idx}.txt"
            fname.write_text(table_text, encoding="utf-8")
            items.append({"page": page_num, "type": "table", "text": table_text, "path": str(fname)})
    except Exception as e:
        print(f"[warn] page {page_num}: table extract failed ➜ {e}")

def process_text_chunks(text: str, splitter, page_num: int, base_dir: str, filepath: str, items: list):
    for i, chunk in enumerate(splitter.split_text(text)):
        fname = Path(base_dir) / "text" / f"{Path(filepath).stem}_text_{page_num}_{i}.txt"
        fname.write_text(chunk, encoding="utf-8")
        items.append({"page": page_num, "type": "text", "text": chunk, "path": str(fname)})

def process_images(doc, page, page_num: int, base_dir: str, filepath: str, items: list):
    for idx, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)
        # handle CMYK → RGB
        if pix.alpha or pix.colorspace.n > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        fname = Path(base_dir) / "images" / f"{Path(filepath).stem}_image_{page_num}_{idx}_{xref}.png"
        pix.save(fname)
        items.append({
            "page": page_num, "type": "image", "path": str(fname),
            "image": base64.b64encode(fname.read_bytes()).decode("utf-8")
        })

def process_page_image(page, page_num: int, base_dir: str, items: list):
    pix = page.get_pixmap(matrix=fitz.Matrix(2,2))   # 2× for crispness
    fname = Path(base_dir) / "page_images" / f"page_{page_num:03d}.png"
    pix.save(fname)
    items.append({
        "page": page_num, "type": "page", "path": str(fname),
        "image": base64.b64encode(fname.read_bytes()).decode("utf-8")
    })


In [12]:
filepath = "/Users/rajeev/Downloads/doc2.pdf"          # ← update if needed
base_dir = "data"              # root output folder

create_directories(base_dir)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700, chunk_overlap=200, length_function=len
)

items = []

doc = fitz.open(filepath)
for page_num in tqdm(range(len(doc)), desc="Processing PDF pages"):
    page = doc[page_num]
    text = page.get_text("text")

    process_tables(filepath, doc, page_num, base_dir, items)
    process_text_chunks(text, splitter, page_num, base_dir, filepath, items)
    process_images(doc, page, page_num, base_dir, filepath, items)
    process_page_image(page, page_num, base_dir, items)

print(f"✅  finished – extracted {len(items)} items")


Processing PDF pages: 100%|██████████| 84/84 [00:30<00:00,  2.74it/s]

✅  finished – extracted 750 items





In [14]:
# First text chunk – preview 200 chars
first_text = next(obj for obj in items if obj["type"] == "text")
print(first_text["text"][:200])

# First table – preview 200 chars
first_table = next(obj for obj in items if obj["type"] == "table")
print(first_table["text"][:200])

# First image – show path (or the base64 string length, etc.)
first_img = next(obj for obj in items if obj["type"] == "image")
print(first_img["path"])
# print(len(first_img["image"]))   # if you want to confirm it’s base64


OPERATION AND
MAINTENANCE MANUAL
Bench Lathe
Art. T999/230V – T999/230V3A
Art. T999/400V - T999/400V3A
TRANSLATION OF THE ORIGINAL INSTRUCTIONS
fervi.com
2.2 | Safety rules for electrical machine equipment ..............................................................9
2.3 | Technical Assistance .........................................................
data/images/doc2_image_0_0_515.png


In [14]:
%pip install --upgrade openai tqdm  pillow

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip uninstall pinecone-client pinecone -y


[0mFound existing installation: pinecone 6.0.2
Uninstalling pinecone-6.0.2:
  Successfully uninstalled pinecone-6.0.2
Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install pinecone


Collecting pinecone
  Obtaining dependency information for pinecone from https://files.pythonhosted.org/packages/5b/c7/2bc1210aa51528b9ba75aede1f169998f50942cc47cdd82dd2dbcba4faa5/pinecone-6.0.2-py3-none-any.whl.metadata
  Using cached pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Using cached pinecone-6.0.2-py3-none-any.whl (421 kB)
Installing collected packages: pinecone
Successfully installed pinecone-6.0.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
pip show pinecone


Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: 
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /Users/rajeev/anaconda3/lib/python3.11/site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: langchain-pinecone
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pinecone import Pinecone, ServerlessSpec


In [None]:
#probably not needed

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "multimodal-manual"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # for `text-embedding-4-small`
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",          # or "gcp"
            region="us-east-1"    # or any supported region
        )
    )

index = pc.Index(index_name)


In [25]:
# !pip install --upgrade openai tqdm  pillow  # pillow only for quick mime-type sniffing
from pathlib import Path
import base64, mimetypes, uuid, os, time
from tqdm import tqdm

import openai                     # v1.14+
from langchain_openai import OpenAIEmbeddings
import pinecone

openai.api_key      = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY    = os.getenv("PINECONE_API_KEY")

# ── embedding & chat models (adjust if your account differs) ────────────
EMBED_MODEL = "text-embedding-3-large"    # 1536-D, GPT-4 family
CHAT_MODEL  = "gpt-4o-mini"               # vision-capable

embeddings = OpenAIEmbeddings(model=EMBED_MODEL)

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # keep this

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "multimodal-manual"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # or embeddings.embedding_dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)


In [26]:
def img_to_data_uri(path: Path) -> str:
    mime, _ = mimetypes.guess_type(path)
    b64 = base64.b64encode(path.read_bytes()).decode()
    return f"data:{mime or 'image/png'};base64,{b64}"

def caption_image(path: Path, retry: int = 3) -> str:
    """Return a single-sentence literal caption for the image."""
    data_uri = img_to_data_uri(path)
    for attempt in range(retry):
        try:
            resp = openai.chat.completions.create(
                model=CHAT_MODEL,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "text",
                         "text": "Describe this image in one concise sentence, no interpretation."},
                        {"type": "image_url", "image_url": {"url": data_uri}}
                    ]
                }],
                max_tokens=60
            )
            return resp.choices[0].message.content.strip()
        except openai.RateLimitError:
            wait = 2 ** attempt
            print(f"rate-limited, retrying in {wait}s"); time.sleep(wait)
    return "unavailable caption"


In [None]:
docs, meta, ids = [], [], []

for obj in tqdm(items, desc="Preparing docs"):          # ← items list from extractor
    if obj["type"] in {"text", "table"}:
        text_repr = obj["text"]

    elif obj["type"] in {"image", "page"}:
        caption = caption_image(Path(obj["path"]))
        text_repr = f"[IMAGE] {caption}"
        obj["caption"] = caption                      # keep for traceability

    else:
        continue                                      # skip unknown types

    uid = str(uuid.uuid4())
    ids.append(uid)
    docs.append(text_repr)
    meta.append({"type": obj["type"],
                 "page": obj["page"],
                 "path": obj["path"]})

# ── embed in batches ───────────────────────────────────────────
batch = 100
vectors = []
for i in tqdm(range(0, len(docs), batch), desc="Embedding"):
    vecs = embeddings.embed_documents(docs[i:i+batch])
    vectors.extend(vecs)

# ── upsert to Pinecone ────────────────────────────────────────
to_upsert = [(ids[i], vectors[i], meta[i]) for i in range(len(ids))]
index.upsert(vectors=to_upsert, namespace="v1")

print(f"✅  Upserted {len(to_upsert)} multimodal items to namespace 'v1'")


In [39]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# 🔁 Delete the old 1536-D index
pc.delete_index("multimodal-manual")

# 🆕 Create a 3072-D index to match text-embedding-3-large
pc.create_index(
    name="multimodal-manual",
    dimension=3072,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

# 🔗 Then connect to it
index = pc.Index("multimodal-manual")


In [51]:
import sys
import json

def get_payload_size(batch):
    return len(json.dumps(batch).encode("utf-8"))

MAX_PAYLOAD = 3_500_000
batch = []

print("🔄 Resuming safe batch upsert...")

for vec in to_upsert:
    batch.append(vec)
    if get_payload_size(batch) >= MAX_PAYLOAD:
        index.upsert(batch, namespace="v1")
        print(f"✅ Upserted batch of {len(batch)} items (approx {get_payload_size(batch)//1024} KB)")
        batch = []

# Final leftovers
if batch:
    index.upsert(batch, namespace="v1")
    print(f"✅ Upserted final batch of {len(batch)} items (approx {get_payload_size(batch)//1024} KB)")


🔄 Resuming safe batch upsert...


In [52]:
# Final leftovers
if batch:
    index.upsert(batch, namespace="v1")
    print(f"✅ Upserted final batch of {len(batch)} items (approx {get_payload_size(batch)//1024} KB)")


In [50]:
# REBUILD to_upsert cleanly with proper metadata
to_upsert = [
    (ids[i], vectors[i], {**meta[i], "text": docs[i]})
    for i in range(len(docs))
]


In [53]:
index.describe_index_stats(namespace="v1")


{'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'v1': {'vector_count': 750}},
 'total_vector_count': 750,
 'vector_type': 'dense'}

In [42]:
query = "How to adjust spindle speed?"

query_embedding = embeddings.embed_query(query)

res = index.query(
    vector=query_embedding,
    top_k=3,
    namespace="v1",
    include_metadata=True
)

for match in res['matches']:
    print(f"\nScore: {match['score']:.4f}")
    print(f"Type: {match['metadata'].get('type')}")
    print(f"Path: {match['metadata'].get('path')}")



Score: 0.6529
Type: text
Path: data/text/doc2_text_28_2.txt

Score: 0.6057
Type: text
Path: data/text/doc2_text_13_4.txt

Score: 0.5967
Type: text
Path: data/text/doc2_text_28_3.txt


In [43]:
image_matches = [m for m in res['matches'] if m['metadata'].get('type') == 'image']
for img in image_matches:
    print(f"🔍 Image: {img['metadata'].get('path')} → score: {img['score']:.3f}")


In [45]:
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore(
    index      = index,          # your Pinecone index from earlier
    embedding  = embeddings,     # same OpenAIEmbeddings object
    namespace  = "v1",           # must match your upsert namespace
    text_key   = "text"
)


In [54]:
query = "How to lubricate the spindle?"
query_vector = embeddings.embed_query(query)

res = index.query(
    vector=query_vector,
    top_k=3,
    namespace="v1",
    include_metadata=True
)

for match in res['matches']:
    print(f"\nScore: {match['score']:.3f}")
    print(f"Type: {match['metadata'].get('type')}")
    print(f"Page: {match['metadata'].get('page')}")
    print(f"Path: {match['metadata'].get('path')}")
    print(f"Text: {match['metadata'].get('text', '')[:150]}...")




Score: 0.567
Type: text
Page: 51.0
Path: data/text/doc2_text_51_1.txt
Text: ...

Score: 0.550
Type: text
Page: 54.0
Path: data/text/doc2_text_54_1.txt
Text: ...

Score: 0.536
Type: text
Page: 51.0
Path: data/text/doc2_text_51_0.txt
Text: ...


In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("Where is the lubrication diagram?")

for doc in docs:
    print(f"\nType: {doc.metadata.get('type')}")
    print(f"Page: {doc.metadata.get('page')}")
    print(f"Path: {doc.metadata.get('path')}")
    print(f"Content preview: {doc.page_content[:200]}...")


Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.
Found document with no `text` key. Skipping.


Task exception was never retrieved
future: <Task finished name='Task-2799' coro=<caption_image_async() done, defined at /var/folders/3x/zwytj0k10b1_d1jp5zlhvt7w0000gn/T/ipykernel_12320/159809286.py:35> exception=APIConnectionError('Connection error.')>
Traceback (most recent call last):
  File "/Users/rajeev/anaconda3/lib/python3.11/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions
    yield
  File "/Users/rajeev/anaconda3/lib/python3.11/site-packages/httpx/_transports/default.py", line 377, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rajeev/anaconda3/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 256, in handle_async_request
    raise exc from None
  File "/Users/rajeev/anaconda3/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 236, in handle_async_request
    response = await connection.handle_async_request