# Install

In [None]:
!pip install langchain-community sentence-transformers faiss-cpu pdf2image Pillow python-dotenv pypdf pymupdf

Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0

# Upload .env and PDF

In [None]:
from google.colab import files
from dotenv import load_dotenv
import os

# Upload .env
print("Upload your .env file:")
files.upload()
load_dotenv()
pplx_key = os.getenv("PERPLEXITY_API_KEY")
if not pplx_key:
    raise ValueError("PERPLEXITY_API_KEY not found in .env")
print("PERPLEXITY_API_KEY Loaded:", bool(pplx_key))

# Upload PDF
print("Upload your PDF file:")
pdf_upload = files.upload()
# Fail with a clear message (instead of an IndexError) when nothing was uploaded.
if not pdf_upload:
    raise ValueError("No PDF file was uploaded")
# The upload result is keyed by file name; the first key is the PDF to process.
pdf_filename = next(iter(pdf_upload))
print(f"Uploaded PDF: {pdf_filename}")


Upload your .env file:


Saving .env to .env
PERPLEXITY_API_KEY Loaded: True
Upload your PDF file:


Saving ijoc.2021.1107.pdf to ijoc.2021.1107.pdf
Uploaded PDF: ijoc.2021.1107.pdf


# Load PDF

In [None]:
# PyPDFLoader lives in langchain_community; the `langchain.document_loaders`
# path is a deprecated re-export.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(pdf_filename)
documents = loader.load()
# Stop early with a readable error rather than an IndexError in the preview below.
if not documents:
    raise ValueError(f"No pages could be loaded from {pdf_filename}")
print(f"Loaded {len(documents)} pages from PDF")
print("Preview first page:\n", documents[0].page_content[:500])


Loaded 22 pages from PDF
Preview first page:
 This article was downloaded by: [14.139.108.35] On: 09 April 2023, At: 15:59
Publisher: Institute for Operations Research and the Management Sciences (INFORMS)
INFORMS is located in Maryland, USA
INFORMS Journal on Computing
Publication details, including instructions for authors and subscription information:
http://pubsonline.informs.org
Iterative Prediction-and-Optimization for E-Logistics
Distribution Network Design
Junming Liu, Weiwei Chen, Jingyuan Yang, Hui Xiong, Can Chen
To cite this art


# Chunk text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Chunking parameters, kept together so they are easy to spot and tune.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [None]:
# Split every loaded page into chunks using the splitter configured above.
text_chunks = text_splitter.split_documents(documents)
chunk_total = len(text_chunks)
print(f"Total text chunks: {chunk_total}")

Total text chunks: 130


# Extract images from PDF

In [None]:
import fitz

# Image records are collected here by the extraction loop.
image_items = list()
# Open the uploaded PDF so its pages can be scanned for embedded images.
doc = fitz.open(pdf_filename)

In [None]:
import io
from PIL import Image # Import the Image class

# Start from an empty list so that re-running this cell does not append duplicates.
image_items = []

# Open the PDF under its own name and close it when done. This keeps the cell
# independent of the global `doc`, which a later loop reuses for text chunks.
with fitz.open(pdf_filename) as pdf_doc:
    # page_num is 1-based, matching the "page" value stored with each image.
    for page_num, page in enumerate(pdf_doc, start=1):
        for img_info in page.get_images(full=True):
            xref = img_info[0]
            base_image = pdf_doc.extract_image(xref)
            img_bytes = base_image["image"]
            # convert() returns a fully decoded image, so it outlives the byte buffer.
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
            image_items.append({"type": "image", "content": img, "page": page_num})

In [None]:
# Report how many image records the extraction step produced.
image_total = len(image_items)
print(f"Extracted {image_total} images from PDF.")

Extracted 11 images from PDF.


# Create embeddings for text and images

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json


In [None]:
from transformers import CLIPModel, CLIPProcessor
import numpy as np
import torch



# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model and its processor are loaded from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
text_embeddings = []
# The loop variable is `chunk`, not `doc`, so it does not overwrite the PyMuPDF
# document handle that was bound to `doc` earlier in the notebook.
for chunk in text_chunks:
    # NOTE(review): truncation=True clips each chunk at the tokenizer's maximum
    # length, so text beyond that limit does not influence the embedding —
    # confirm the chunk size suits the model.
    inputs = clip_processor(text=[chunk.page_content], return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**inputs)
        # L2-normalise so the inner-product FAISS index ranks by cosine
        # similarity instead of favouring vectors with a large norm.
        text_features = torch.nn.functional.normalize(text_features, dim=-1)
    text_embeddings.append(text_features.cpu().numpy()[0])


In [None]:
# Stack the per-chunk vectors into one float32 matrix (the dtype FAISS is fed below).
text_embeddings = np.array(text_embeddings, dtype=np.float32)


In [None]:
import io
from PIL import Image

image_embeddings = []
# Iterate over image_items and access the 'content' key
for img_item in image_items:
    img = img_item['content']
    inputs = clip_processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
        # L2-normalise so the inner-product FAISS index ranks by cosine
        # similarity instead of favouring vectors with a large norm.
        image_features = torch.nn.functional.normalize(image_features, dim=-1)
    image_embeddings.append(image_features.cpu().numpy()[0])

In [None]:
# Stack the per-image vectors into one float32 array.
image_embeddings = np.array(image_embeddings, dtype=np.float32)

In [None]:
# Show the shape of both embedding arrays as a quick sanity check.
text_shape, image_shape = text_embeddings.shape, image_embeddings.shape
print("Created embeddings:", text_shape, image_shape)

Created embeddings: (130, 512) (11, 512)


# Build FAISS indexes

In [None]:
# Vector width, read from the second axis of the text embedding matrix.
embedding_shape = text_embeddings.shape
dim = embedding_shape[1]

In [None]:
# Text index
text_index = faiss.IndexFlatIP(dim)
text_index.add(text_embeddings)
print(f"Text FAISS index created with {text_index.ntotal} embeddings.")

Text FAISS index created with 130 embeddings.


In [None]:
# Image index
image_index = faiss.IndexFlatIP(dim)
image_index.add(image_embeddings)

print(f"Image FAISS index created with {image_index.ntotal} embeddings.")


Image FAISS index created with 11 embeddings.


In [None]:
from langchain_community.vectorstores import FAISS

In [None]:
class CLIPTextEmbeddingsCallable:
    """Adapter that turns text into embedding vectors with a CLIP-style model.

    The instance can be called directly with a string, and also exposes
    ``embed_query`` / ``embed_documents``.

    Args:
        model: object providing ``get_text_features(**inputs)``.
        processor: callable that tokenises text and returns a batch with ``.to(device)``.
        device: device the tokenised batch is moved to before encoding.
    """

    def __init__(self, model, processor, device):
        self.model = model
        self.processor = processor
        self.device = device

    def __call__(self, text: str):
        """Embed ``text``; equivalent to :meth:`embed_query`."""
        return self.embed_query(text)

    def embed_query(self, text: str):
        """Return the embedding of one string as a flat list of floats."""
        # truncation=True matches how the chunk embeddings are produced and keeps
        # an over-long query within the text encoder's input limit.
        inputs = self.processor(
            text=[text], return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        with torch.no_grad():
            embeddings = self.model.get_text_features(**inputs)
        return embeddings.cpu().numpy().flatten().tolist()

    def embed_documents(self, texts: list[str]):
        """Embed each string in ``texts`` and return the vectors in the same order."""
        return [self.embed_query(t) for t in texts]


class CLIPImageEmbeddingsCallable:
    """Adapter that turns images into embedding vectors with a CLIP-style model.

    Calling the instance, ``embed_query`` and ``embed_image`` all embed a single
    image; ``embed_documents`` embeds a sequence of images.
    """

    def __init__(self, model, processor, device):
        self.model, self.processor, self.device = model, processor, device

    def embed_image(self, image):
        """Return the embedding of ``image`` as a flat list of floats."""
        batch = self.processor(images=image, return_tensors="pt")
        batch = batch.to(self.device)
        with torch.no_grad():
            features = self.model.get_image_features(**batch)
        flat = features.cpu().numpy().flatten()
        return flat.tolist()

    def embed_query(self, image):
        """Embed a single image; same result as :meth:`embed_image`."""
        return self.embed_image(image)

    def __call__(self, image):
        """Embed a single image; same result as :meth:`embed_image`."""
        return self.embed_image(image)

    def embed_documents(self, images):
        """Embed every image in ``images``, preserving order."""
        return list(map(self.embed_image, images))


In [None]:
# Both adapters share the same model, processor and device.
clip_parts = (clip_model, clip_processor, device)
clip_text_embeddings_callable = CLIPTextEmbeddingsCallable(*clip_parts)
clip_image_embeddings_callable = CLIPImageEmbeddingsCallable(*clip_parts)

# Create retrievers

In [None]:
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document

In [None]:
# FAISS row position -> docstore id (the position rendered as a string).
text_index_to_id = {position: str(position) for position, _ in enumerate(text_chunks)}
# Docstore id -> the text chunk stored at that position.
text_docstore = InMemoryDocstore(
    {doc_id: text_chunks[position] for position, doc_id in text_index_to_id.items()}
)

In [None]:
# Wrap the prebuilt text index in a LangChain vector store, then expose it as a
# retriever that is asked for 5 results per query.
text_vectorstore = FAISS(
    index=text_index,
    docstore=text_docstore,
    index_to_docstore_id=text_index_to_id,
    embedding_function=clip_text_embeddings_callable,
)
text_retriever = text_vectorstore.as_retriever(search_kwargs={"k": 5})



In [None]:
# One placeholder Document per extracted image. The metadata carries the page
# recorded at extraction time (1-based) so a retrieved image can be traced back
# to its place in the PDF.
image_docs = [
    Document(
        page_content=f"image_{i}.png",
        metadata={"type": "image", "page_number": item["page"]},
    )
    for i, item in enumerate(image_items)
]
image_docstore = InMemoryDocstore({str(i): image_doc for i, image_doc in enumerate(image_docs)})
image_index_to_id = {i: str(i) for i in range(len(image_docs))}

In [None]:
# Wrap the prebuilt image index in a LangChain vector store, then expose it as a
# retriever that is asked for 5 results per query.
image_vectorstore = FAISS(
    index=image_index,
    docstore=image_docstore,
    index_to_docstore_id=image_index_to_id,
    embedding_function=clip_image_embeddings_callable,
)
image_retriever = image_vectorstore.as_retriever(search_kwargs={"k": 5})



In [None]:
from langchain.schema import BaseRetriever, Document
from typing import List, Union
from PIL import Image
import os
from pydantic import BaseModel, Field

class MultiRetriever(BaseRetriever):
    """Route a query to the text or the image retriever based on its type.

    * A PIL image, or a string naming an existing file, goes to ``image_retriever``.
    * Any other string goes to ``text_retriever``.
    """

    text_retriever: BaseRetriever
    image_retriever: BaseRetriever

    def _get_relevant_documents(self, query: Union[str, Image.Image], **kwargs) -> List[Document]:
        """Return the documents found by the retriever that matches ``query``.

        Args:
            query: free text, a path to an image file, or a PIL image.
            **kwargs: accepted for compatibility with the base class; unused.

        Returns:
            The documents returned by the selected retriever.

        Raises:
            ValueError: if ``query`` is neither a string nor a PIL image.
        """
        # Already a PIL image
        if isinstance(query, Image.Image):
            return list(self.image_retriever.invoke(query))

        if not isinstance(query, str):
            raise ValueError("Unsupported query type for MultiRetriever.")

        # Image file path. isfile() is checked once; unlike exists(), it does not
        # mistake a question that happens to match a directory name for an image.
        if os.path.isfile(query):
            with Image.open(query) as img:
                return list(self.image_retriever.invoke(img))

        # Text query
        return list(self.text_retriever.invoke(query))


In [None]:
# Bundle the two single-modality retrievers behind one retriever object.
retriever_parts = {
    "text_retriever": text_retriever,
    "image_retriever": image_retriever,
}
combined_retriever = MultiRetriever(**retriever_parts)

# Save embeddings + metadata to JSON

In [None]:
# Records to be written to the JSON file; filled by the cells that follow.
embeddings_data = list()

In [None]:
# Append one record per text chunk: id, type, a 200-character preview and its vector.
embeddings_data.extend(
    {
        "id": f"text_{position}",
        "type": "text",
        "content": chunk.page_content[:200],  # preview
        "embedding": vector.tolist(),
    }
    for position, (chunk, vector) in enumerate(zip(text_chunks, text_embeddings))
)

In [None]:
# Append one record per image: id, type, a placeholder file name and its vector.
# zip() is kept so the pairing stops at the shorter of the two sequences.
embeddings_data.extend(
    {
        "id": f"image_{position}",
        "type": "image",
        "content": f"image_{position}.png",
        "embedding": vector.tolist(),
    }
    for position, (_, vector) in enumerate(zip(image_items, image_embeddings))
)


In [None]:
# Write all collected records as indented JSON.
with open("embeddings.json", "w") as out_file:
    out_file.write(json.dumps(embeddings_data, indent=2))

In [None]:
# Confirm how many records were written.
saved_count = len(embeddings_data)
print(f"Saved {saved_count} embeddings into embeddings.json")

Saved 141 embeddings into embeddings.json


In [None]:
# invoke() replaces the deprecated get_relevant_documents() call, which emitted
# a deprecation warning here.
docs = combined_retriever.invoke("tell what algorithm used in this pdf")
docs

  docs = combined_retriever.get_relevant_documents("tell what algorithm used in this pdf")


[Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'PyPDF', 'creationdate': '2023-04-09T15:59:06-07:00', 'keywords': 'facility location optimization,artificial neural network,heuristic,demand prediction,e-logistics', 'moddate': '2023-04-09T15:59:07-07:00', 'subject': 'INFORMS Journal on Computing 2022.34:769-789', 'author': 'Junming Liu, Weiwei Chen, Jingyuan Yang, Hui Xiong, and Can Chen', 'title': 'Iterative Prediction-and-Optimization for E-Logistics Distribution Network Design', 'source': 'ijoc.2021.1107.pdf', 'total_pages': 22, 'page': 8, 'page_label': '9'}, page_content='∑\nj∈VQ\nyij ≤\n∑\nk∈K\ndkgi, ∀i ∈ VP,( 7 )\n∑\ni∈VP\nyij ≤\n∑\nk∈K\ndkgj, ∀j ∈VQ,( 8 )\n∑\nk∈K\nzjk ≤\n∑\nk∈K\ndkgj, ∀j ∈VQ,( 9 )\n∑\ni′∈VP\\{i}\nτii′ ≤\n∑\nk∈K\ndkgi, ∀i ∈VP, (10)\n∑\ni∈VP\\{i′}\nτii′ ≤\n∑\nk∈K\ndkgi′, ∀i′∈ VP, (11)\n∑\ns∈S\nxsi +\n∑\ni′∈VP\\{i}\nτi′i /equals\n∑\nj∈VQ\nyij +\n∑\ni′∈VP\\{i}\nτii′ ∀i∈VP, (12)\n∑\ni∈VP\nyij /equals\n∑\nk∈K\nzjk, ∀j∈VQ: (13)\nFigure 4.(Color online)

In [None]:
!pip install langchain-openai


Collecting langchain-openai
  Downloading langchain_openai-0.3.34-py3-none-any.whl.metadata (2.4 kB)
Downloading langchain_openai-0.3.34-py3-none-any.whl (75 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.34


In [None]:
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

In [None]:
# output_key="answer" tells the memory which chain output to store. The chain
# built later also returns "source_documents", and without an explicit key the
# memory cannot choose between the two outputs.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [None]:
# Template for the answering step: the retrieved context first, then the question.
qa_template = """
You are an AI assistant. Answer the user's question based on the retrieved documents below.

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate(input_variables=["context", "question"], template=qa_template)


In [None]:
import os

# Kept from the original cell: expose the key under the name used in the .env file.
os.environ["PERPLEXITY_API_KEY"] = pplx_key

# Pass the key to the client explicitly. ChatPerplexity's documented environment
# variable is PPLX_API_KEY, so the PERPLEXITY_API_KEY variable alone is not
# picked up by the client.
# NOTE(review): confirm that "sonar-medium-online" is still an available model.
llm = ChatPerplexity(
    model="sonar-medium-online",
    temperature=0.7,
    max_tokens=1000,
    pplx_api_key=pplx_key,
)

In [None]:
# Settings for the conversational chain: the chat model, the combined text+image
# retriever, the chat memory, the answering prompt, and a flag asking for the
# retrieved documents to be returned with the answer.
chain_options = dict(
    llm=llm,
    retriever=combined_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt},
    return_source_documents=True,
)
qa = ConversationalRetrievalChain.from_llm(**chain_options)

In [None]:
response = qa.invoke("tell about this pdf")
# The chain returns a dict, not a message object; the reply text is stored under
# the "answer" key.
print(response["answer"])