In [1]:
%%capture
import sys

!{sys.executable} -m pip install --upgrade openai pymilvus
!{sys.executable} -m pip install PyPDF2


In [3]:
from PyPDF2 import PdfReader

pdf_filepath = "/Users/mlstudio/Documents/papers/RAG/VisRAG.pdf"

def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text()
    return text

text = extract_text_from_pdf(pdf_filepath)
# clean text
text = text.replace(".\n", ".NEWLINE")
text = text.replace("\n", " ")
text = text.replace(".NEWLINE", ".\n")
print(len(text), len(text.split()))

78808 11755


In [35]:
def split_text(text, chunk_size, overlap):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start: end])
        start += chunk_size - overlap
    return chunks

chunks = split_text(text, 2000, 500)
print(len(chunks), len(chunks[0].split()))

53 280


In [36]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')
    
# Generate embeddings
embeddings = [model.encode([chunk]) for chunk in chunks]




In [37]:
embeddings[0].shape

(1, 384)

In [None]:
from pymilvus import MilvusClient
milvus_client = MilvusClient(uri="milvus_openai_RAG.db")

# Create a collection
COLLECTION_NAME = "visRAG_paper"
DIMENSION = 384
if milvus_client.has_collection(collection_name=COLLECTION_NAME):
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)

milvus_client.create_collection(
    collection_name=COLLECTION_NAME, dimension=DIMENSION
)

In [38]:
# Insert data
data = [
    {
        "id": i, "vector": embeddings[i][0].tolist(),
        "text": chunks[i], "subject": "VisRAG"
    }
    for i in range(len(chunks))
]
res = milvus_client.insert(
    collection_name=COLLECTION_NAME,
    data=data
)
res["insert_count"]

53

In [44]:
query = "In VisRAG-retrieval, how the final embedding is generated?"

query_vector = model.encode([query])[0].tolist()

retrieved = milvus_client.search(
    collection_name=COLLECTION_NAME,
    data=[query_vector],
    limit=2,
    output_fields=["text"]
)
print(len(retrieved))

print("Query:", query)
for j, ret in enumerate(retrieved[0]):
    print(f"\n{j}: chunk_id={ret['id']} dist={ret['distance']:.3f}")
    print(ret['entity']['text'][:50])
print("\n")

1
Query: In VisRAG-retrieval, how the final embedding is generated?

0: chunk_id=10 dist=0.625
en q. We follow the dual-encoder paradigm in text-

1: chunk_id=24 dist=0.581
Across the six evaluation datasets, VisRAG shows a




In [46]:
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [55]:
PROMPT = "Answer the question about the VisRAG paper:\n"

# get natural language response (without retrieved context)
completion = openai_client.chat.completions.create(
    model="gpt-4o",
    store=True,
    messages=[
        {"role": "user", "content": PROMPT + query},
    ]
).to_dict()

print(completion["choices"][0]["message"]["content"])

In VisRAG-retrieval, the final embedding is generated by combining visual and text features to enhance retrieval performance. The approach involves processing images and associated text using specialized models. Typically, a visual backbone model extracts features from images, while a text encoder processes associated textual data. These features are then combined or fused to produce a joint embedding that captures information from both modalities. This multimodal embedding is used to improve retrieval performance by leveraging the synergy between visual and textual information. The specific details and techniques used for feature extraction, fusion, and embedding generation might vary, and for exact details, reviewing the VisRAG paper would provide precise methodologies applied.


In [56]:
PROMPT = (
    "Answer the question about the VisRAG paper "
    "based on the following context:\n"
)
context = "\n".join([r["entity"]["text"] for r in retrieved[0]])
prompt = PROMPT + context + "\nQUESTION:" + query

# get natural language response
completion = openai_client.chat.completions.create(
    model="gpt-4o",
    store=True,
    messages=[
        {"role": "user", "content": prompt}
    ]
).to_dict()

print(completion["choices"][0]["message"]["content"])

In VisRAG-retrieval, the final embedding is generated by employing a position-weighted mean pooling over the last-layer hidden states of a VLM (Vision-Language Model). This process involves separately encoding the query and page as text and image in the VLM, resulting in a sequence of hidden states. The position-weighted mean pooling gives higher weights to the later tokens in the sequence. The formula for deriving the final embedding \( v \) is:

\[ 
v = \sum_{i=1}^{S} w_i h_i 
\]

where \( h_i \) is the i-th hidden state, \( S \) is the sequence length, and \( w_i \) is the i-th weight calculated as 

\[ 
w_i = \frac{i}{\sum_{j=1}^{S} j} 
\]

The position-weighted pooling approach leverages the causal attention mechanism of generative VLMs, emphasizing the importance of later tokens in forming the final embedding.
