In [45]:
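# %pip install langchain langchain-community langchain-huggingface langchain-google-genai google-generativeai sentence-transformers chromadb tiktoken pillow pymupdf python-dotenv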

In [32]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os

from dotenv import load_dotenv
import google.generativeai as genai
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.llms import HuggingFaceHub
from sentence_transformers import SentenceTransformer
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint

# Load variables from a local .env file into the process environment; later
# cells read GOOGLE_API_KEY and HUGGINGFACEHUB_API_TOKEN through os.getenv.
load_dotenv()

True

In [2]:
# Accumulators filled by later cells: one {"response", "name"} record per PDF
# page (text_data) and per summarized image (img_data). Two distinct lists.
text_data, img_data = [], []

In [3]:
# Extract the text of every page and every embedded image from the survey PDF.
#
# `text_data` is rebuilt from scratch here: the list is created in an earlier
# cell, so appending to it directly would add a second copy of every page each
# time this cell is re-run.
text_data = []

with fitz.open('training_documents/Survey on Tabular Data.pdf') as pdf_file:
    # Directory that receives the extracted images; exist_ok replaces the
    # separate "does it exist?" check.
    os.makedirs("extracted_images", exist_ok=True)

    # Pages are numbered from 1 in both the stored metadata and the file names.
    for page_number, page in enumerate(pdf_file, start=1):
        # Page text, stripped of leading/trailing whitespace.
        text_data.append({"response": page.get_text().strip(), "name": page_number})

        # Images are numbered from 1 within each page.
        for image_index, img in enumerate(page.get_images(full=True), start=1):
            xref = img[0]  # first tuple item is the image's XREF
            base_image = pdf_file.extract_image(xref)
            image_ext = base_image["ext"]

            # Decode with PIL and write the file as
            # extracted_images/image_<page>_<index>.<ext>; the context manager
            # releases the decoder's resources once the file is written.
            with Image.open(io.BytesIO(base_image["image"])) as image:
                image.save(f"extracted_images/image_{page_number}_{image_index}.{image_ext}")
        

In [4]:
# Read the Google API key from the environment, hand it to the Gemini SDK,
# and create the model handle that the next cell uses to summarize images.
api_key = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=api_key)

model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
)

In [5]:
# Ask Gemini for a retrieval-oriented summary of every extracted image.
#
# `img_data` is rebuilt from scratch here: the list is created in an earlier
# cell, so appending to it directly would duplicate every summary each time
# this cell is re-run.
img_data = []

# Instruction sent alongside each image. The literal (including the
# continuation-line whitespace) is kept exactly as originally written so the
# model receives the same prompt.
image_summary_prompt = "You are an AI assistant helping build a retrieval system from academic papers. The input is a table or figure image extracted from a paper. \
                                            Summarize the image with reference to the core topic or claim being visualized. Include comparisons, axes, legends, and what this visual proves or supports in context of the paper. \
                                            Your summary will be embedded and must serve as a high-quality retrieval chunk. Be specific, concise, and factually grounded."

# os.listdir returns entries in arbitrary order; sorting makes the order of
# `img_data` reproducible between runs.
for img in sorted(os.listdir("extracted_images")):
    # The context manager closes the image file once the request has returned.
    with Image.open(f"extracted_images/{img}") as image:
        response = model.generate_content([image, image_summary_prompt])
    # Keep the file name so a retrieved summary can be traced back to its image.
    img_data.append({"response": response.text, "name": img})

In [6]:
# Embedding model used for both page text and image summaries.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# No explicit device: a hard-coded 'cuda' fails on machines without a GPU.
# With the key omitted, sentence-transformers picks a GPU when one is usable
# and falls back to CPU otherwise.
model_kwargs = {}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Wrap every record as a LangChain Document. "name" is the 1-based page number
# for text records and the image file name for image-summary records.
docs_list = [
    Document(page_content=page_rec['response'], metadata={"name": page_rec['name']})
    for page_rec in text_data
]
img_list = [
    Document(page_content=image_rec['response'], metadata={"name": image_rec['name']})
    for image_rec in img_data
]

# Split into chunks measured in tiktoken tokens.
# NOTE(review): the all-MiniLM-L6-v2 model card states inputs are truncated at
# 256 word pieces, so the tail of a 400-token chunk may be ignored when it is
# embedded -- confirm and consider a smaller chunk_size.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
img_splits = text_splitter.split_documents(img_list)

In [7]:
# Add to vectorstore
vectorstore = Chroma.from_documents(
    documents=doc_splits + img_splits, # adding the both text and image splits
    collection_name="multi_model_rag",
    embedding=embeddings,
)

retriever = vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={'k': 1}, # number of documents to retrieve
            )

In [8]:
# Question posed to the retriever (and, in a later cell, to the LLM).
# NOTE(review): "causual" looks like a typo for "causal"; it is left untouched
# because changing the text changes the query embedding.
query = (
    "Interpret Figure 5: The data generation process for causual LMs "
    "from Survey on Tabular Data what do the boxes and arrows represent, "
    "and how do they illustrate knowledge transfer from the source to the target domain?"
)

# Fetch the chunk(s) most similar to the question.
docs = retriever.invoke(query)

In [9]:
# Show the best-matching chunk and the metadata that identifies its source.
top_hit = docs[0]
print(top_hit.page_content)
print(top_hit.metadata)

Here is a summary of the image:

This image illustrates a process of generating textual descriptions from structured table data using preconditioning prompts. The top table shows the original data with the columns: Age, Education, Job, and Marital. The "Preconditioning prompts" section indicates three example inputs which affect the generated textual description. The bottom table shows new data that is used to generate the output in the "Generation" section. The blue arrows indicate the flow of the process from Cell value extraction to Generation. The generated text from the bottom table follows the examples in the "Preconditioning prompts" section.
{'name': 'image_20_1.png'}


In [None]:
# Answer the question with an LLM, grounded only in the retrieved chunk(s).

# System instructions: stay within the retrieved evidence.
system = """You are an assistant for QA based on academic papers.
Always ground your answer strictly in the retrieved evidence below—no outside knowledge.
If evidence is from an image or table summary, reference the figure/table number if present."""

# {documents} receives the joined chunk text, {question} the user query.
prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved documents:\n\n<docs>{documents}</docs>\n\nUser question:\n{question}")
])

# Hosted Mistral-7B-Instruct endpoint; the token is read from the environment.
# NOTE(review): HuggingFaceEndpoint is a text-completion wrapper; chat models
# are usually wrapped in ChatHuggingFace so the model's chat template is
# applied -- confirm that task="conversational" is intended here.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
    task="conversational",
    temperature=0.05,  # near-deterministic answers
    max_new_tokens=256,
    timeout=300
)

# prompt -> LLM -> plain string
rag_chain = prompt | llm | StrOutputParser()

# Retrieve evidence for the query. `invoke` replaces the deprecated
# `get_relevant_documents` and matches how the retriever is called in the
# earlier retrieval cell.
docs = retriever.invoke(query)
docs_joined = "\n\n".join(doc.page_content for doc in docs)

# Run the chain
generation = rag_chain.invoke({
    "documents": docs_joined,
    "question": query
})

print(generation)




Assistant: Figure 5 illustrates a process for generating textual descriptions from structured table data using preconditioning prompts. The boxes represent different stages in the process, while the arrows indicate the flow of data between these stages.

1. **Cell value extraction**: This stage involves extracting specific values from the table data, such as Age, Education, Job, and Marital status. These values are represented by the boxes labeled with these categories.

2. **Preconditioning prompts**: This stage involves providing specific inputs or prompts that influence the generated textual description. The example inputs provided in the image are "A young person", "A highly educated individual", and "A married person". These prompts are represented by the boxes labeled with these descriptions.

3. **Generation**: This stage involves using the extracted cell values and the preconditioning prompts to generate a textual description. The generated text is represented by the boxes la