In [1]:
import os
import uuid
import base64
import io
from pathlib import Path
from dotenv import load_dotenv
from PIL import Image
from minio import Minio
from minio.error import S3Error
from unstructured.partition.pdf import partition_pdf
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
OPENAI_API_TOKEN = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

In [3]:
MINIO_ENDPOINT = "192.168.190.214:9000"
ACCESS_KEY = "admin"
SECRET_KEY = "stackmax"
BUCKET_NAME = "mmrag"
DOCUMENT_PATH = "attention.pdf"

In [4]:
minio_client = Minio(MINIO_ENDPOINT, access_key=ACCESS_KEY, secret_key=SECRET_KEY, secure=False)
if not minio_client.bucket_exists(BUCKET_NAME):
    minio_client.make_bucket(BUCKET_NAME)

In [5]:
elements = partition_pdf(filename=DOCUMENT_PATH, strategy='hi_res', extract_images_in_pdf=True,
                         extract_image_block_types=["Image", "Table"], extract_image_block_to_payload=True)

In [6]:
text_elements, table_elements, image_elements = [], [], []
Path("output/images").mkdir(parents=True, exist_ok=True)
Path("output/tables").mkdir(parents=True, exist_ok=True)

In [7]:
for element in elements:
    doc_id = str(uuid.uuid4())
    element_dict = {"id": doc_id, "metadata": element.metadata.to_dict()}

    if element.category in ['NarrativeText', 'Text']:
        element_dict["content"] = str(element)
        text_elements.append(element_dict)

    elif element.category == 'Table':
        if element.metadata.image_base64:
            img_file = Path("output/tables") / f"{doc_id}.png"
            img_file.write_bytes(base64.b64decode(element.metadata.image_base64.replace('\n', '')))
            element_dict["image_path"] = str(img_file)

        table_elements.append(element_dict)

    elif element.category == 'Image' and element.metadata.image_base64:
        img_file = Path("output/images") / f"{doc_id}.png"
        img_file.write_bytes(base64.b64decode(element.metadata.image_base64.replace('\n', '')))
        element_dict["image_path"] = str(img_file)
        image_elements.append(element_dict)

In [8]:
def compress_and_encode_image(image_path, resize_width=512):
    img = Image.open(image_path)
    if img.width > resize_width:
        ratio = resize_width / img.width
        img = img.resize((resize_width, int(img.height * ratio)))
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [9]:
chat = ChatOpenAI(model="gpt-4o", max_tokens=1024)

In [10]:
prompt = """You are an assistant tasked with summarizing images for retrieval. \
These summaries will be embedded and used to retrieve the raw image. \
Give a concise summary of the image that is well optimized for retrieval."""

In [11]:
from langchain.schema import Document

In [12]:
summaries = []
for element in image_elements + table_elements:
    if "image_path" in element:
        img_base64 = compress_and_encode_image(element["image_path"])
        summary = chat.invoke([HumanMessage(content=[{"type": "text", "text": prompt},
                                                  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}}])]).content
        doc = Document(page_content=summary,
        metadata={
                "id": element['id'],
                "image_path": element["image_path"],
                "image_base64": element['metadata']['image_base64'],
                "page_number": element['metadata']['page_number'],
                "last_modified":element['metadata']['last_modified'],
                "filename": element['metadata']['filename'],
                "image_mime_type": element['metadata']['image_mime_type']
        }
        )
        summaries.append(doc)
                

In [13]:
def upload_to_minio(file_path):
    try:
        object_name = Path(file_path).name
        minio_client.fput_object(bucket_name=BUCKET_NAME, object_name=object_name, file_path=file_path)
        return object_name
    except S3Error as e:
        print(f"MinIO upload error: {str(e)}")
        raise

In [14]:
for element in table_elements + image_elements:
    if "image_path" in element:
        element["minio_path"] = upload_to_minio(element["image_path"])
    if element.get("content") and element in table_elements:
        element["content_minio_path"] = upload_to_minio(element["content"])

In [15]:
Path("output/texts").mkdir(parents=True, exist_ok=True)
for element in text_elements:
    text_file = Path("output/texts") / f"{element['id']}.txt"
    text_file.write_text(element['content'], encoding='utf-8')
    # element["minio_path"] = upload_to_minio(str(text_file))
    doc = Document(page_content=element['content'],
        metadata={
                "id": element['id'],
                "page_number": element['metadata']['page_number'],
                "last_modified":element['metadata']['last_modified'],
                "filename": element['metadata']['filename']
        }
        )
    summaries.append(doc)

In [16]:
len(summaries)

82

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)

In [19]:
text_chunks=text_splitter.split_documents(summaries)

In [20]:
db_path = "mm_rag2"
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_TOKEN)

  embeddings = OpenAIEmbeddings(api_key=OPENAI_API_TOKEN)


In [21]:
texts = [doc.page_content for doc in text_chunks]
metadatas = [doc.metadata for doc in text_chunks]

In [22]:
db = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    persist_directory=db_path
)

In [23]:
db.persist()

  db.persist()


In [24]:
retriever = db.as_retriever()

In [25]:
query = "explain Scaled Dot-Product Attention"
context_docs = retriever.get_relevant_documents(query, k=10)

  context_docs = retriever.get_relevant_documents(query, k=10)


In [26]:
context_docs

[Document(metadata={'id': '4a9a97b9-c26e-4be1-a07f-26d4a4b58ece', 'image_path': 'output/images/4a9a97b9-c26e-4be1-a07f-26d4a4b58ece.png', 'filename': 'attention.pdf', 'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAIAAU4DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oA

In [27]:
context_text = "\n\n".join([doc.page_content for doc in context_docs])

In [28]:
context_text

'Diagram of scaled dot-product attention in a neural network, showing linear layers for Q, K, V inputs, followed by scaled attention, concatenation, and additional linear transformation.\n\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel.\n\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the\n\nThe two most commonly used attention functions are additive attention [2], and dot-product (multi- plicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor 1√ of . Additive attention computes the compatibility function using a feed-forward network with dk a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efﬁcient in practice, since it can be imple

In [29]:
final_prompt = PromptTemplate.from_template("""
    you are the AI assistant to summarize the answer relevant to the context based on user query.
    from the local document: {context}
    question: {query}
""")

In [30]:
chain = final_prompt | chat | StrOutputParser()
response_text = chain.invoke({"context": context_text, "query": query})
print("\nAnswer:\n", response_text)


Answer:
 Scaled Dot-Product Attention is an attention mechanism used in neural networks, particularly within Transformer models. It involves the following steps:

1. **Inputs:** It takes queries (Q), keys (K), and values (V) as inputs. The queries and keys have a dimension of \(d_k\), while the values have a dimension of \(d_v\).

2. **Dot-Product:** The attention mechanism computes the dot products between all queries and keys. This step essentially evaluates how much focus an element in the input sequence should receive in relation to other elements.

3. **Scaling:** The dot products are scaled by \(\frac{1}{\sqrt{d_k}}\). Scaling is crucial for stabilizing the gradients, especially when the dimension \(d_k\) is large, since the raw dot-product values become large, pushing the softmax function into regions with small gradients.

4. **Softmax:** After scaling, a softmax function is applied to obtain the attention weights. This normalizes the weights, effectively highlighting certain 

In [31]:
from datetime import timedelta
from IPython.display import Image as IPImage, display

In [32]:
def get_presigned_url(bucket, object_name, expiry_seconds=3600):
    return minio_client.presigned_get_object(
        bucket_name=bucket,
        object_name=object_name,
        expires=timedelta(seconds=expiry_seconds)
    )

In [35]:
matched_images = []

for doc in context_docs:
    doc_id = doc.metadata.get("id", "")
    image_file = f"{doc_id}.png"
    try:
        url = get_presigned_url(BUCKET_NAME, image_file)
        minio_client.stat_object(BUCKET_NAME, image_file)
        matched_images.append(url)
    except S3Error:
        continue

# 🧠 Display the answer
print("🧠 GPT-4o Answer:\n")
print(response_text)

if matched_images:
    print("\n🖼 Related Image(s):")
    for url in matched_images:
        try:
            display(IPImage(url=url))
        except Exception:
            continue 
else:
    print("\nNo matching images found.")

🧠 GPT-4o Answer:

Scaled Dot-Product Attention is an attention mechanism used in neural networks, particularly within Transformer models. It involves the following steps:

1. **Inputs:** It takes queries (Q), keys (K), and values (V) as inputs. The queries and keys have a dimension of \(d_k\), while the values have a dimension of \(d_v\).

2. **Dot-Product:** The attention mechanism computes the dot products between all queries and keys. This step essentially evaluates how much focus an element in the input sequence should receive in relation to other elements.

3. **Scaling:** The dot products are scaled by \(\frac{1}{\sqrt{d_k}}\). Scaling is crucial for stabilizing the gradients, especially when the dimension \(d_k\) is large, since the raw dot-product values become large, pushing the softmax function into regions with small gradients.

4. **Softmax:** After scaling, a softmax function is applied to obtain the attention weights. This normalizes the weights, effectively highlighting 