In [15]:
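# %pip install langchain langchain-community langchain-huggingface langchain-openai langchain-google-genai google-generativeai chromadb sentence-transformers tiktoken pillow pymupdf python-dotenv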

In [16]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os

from dotenv import load_dotenv
import google.generativeai as genai
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.llms import HuggingFaceHub
from sentence_transformers import SentenceTransformer
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_openai import ChatOpenAI

# Load key=value pairs from a local .env file into the process environment
# (python-dotenv). The API keys and model settings read via os.getenv() in
# later cells are expected to be provided this way.
load_dotenv()

True

In [17]:
# Accumulators filled by later cells:
#   text_data -> one {"response", "name"} record per PDF page
#   img_data  -> one {"response", "name"} record per extracted image
text_data, img_data = list(), list()

In [18]:
# Extract the text of every page into `text_data` and dump every embedded
# image to extracted_images/image_<page>_<index>.<ext> (both 1-based).
with fitz.open('training_documents/Survey on Tabular Data.pdf') as pdf_file:
    # exist_ok=True replaces the check-then-create pair and is safe on re-runs.
    os.makedirs("extracted_images", exist_ok=True)

    # Reset the accumulator so re-running this cell does not append every
    # page a second time (which would later duplicate chunks in the index).
    text_data.clear()

    # Walk the document page by page, numbering pages from 1.
    for page_number, page in enumerate(pdf_file, start=1):
        # Plain text of the page, stored together with its page number.
        text = page.get_text().strip()
        text_data.append({"response": text, "name": page_number})

        # full=True returns the complete image tuples; element 0 is the xref.
        images = page.get_images(full=True)

        for image_index, img in enumerate(images, start=1):
            xref = img[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]  # raw encoded image bytes
            image_ext = base_image["ext"]      # file extension reported by PyMuPDF

            # Decode with PIL and save; the context manager releases the
            # image handle as soon as the file has been written.
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(f"extracted_images/image_{page_number}_{image_index}.{image_ext}")
        

In [19]:
# Pull the Gemini key from the environment and hand it to the SDK.
api_key = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=api_key)

# Vision-capable model used below to summarise the extracted images.
model = genai.GenerativeModel("gemini-2.0-flash")

In [20]:
# Instruction sent to Gemini together with each extracted image.
# Built with implicit string concatenation: the previous backslash
# continuations inside the literal injected long runs of indentation
# spaces between the sentences of the prompt.
IMAGE_SUMMARY_PROMPT = (
    "You are an AI assistant helping build a retrieval system from academic papers. "
    "The input is a table or figure image extracted from a paper. "
    "Summarize the image with reference to the core topic or claim being visualized. "
    "Include comparisons, axes, legends, and what this visual proves or supports in context of the paper. "
    "Your summary will be embedded and must serve as a high-quality retrieval chunk. "
    "Be specific, concise, and factually grounded."
)

# Reset the accumulator so re-running this cell does not duplicate summaries.
img_data.clear()

# sorted() gives a deterministic processing order; os.listdir order is arbitrary.
for image_name in sorted(os.listdir("extracted_images")):
    # The context manager closes the image file once the request has been made.
    with Image.open(f"extracted_images/{image_name}") as image:
        response = model.generate_content([image, IMAGE_SUMMARY_PROMPT])
    # Keep the generated summary together with the file it describes.
    img_data.append({"response": response.text, "name": image_name})

In [21]:
# Embedding model used for both page text and image summaries.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# No explicit device: a hard-coded 'cuda' fails on machines without a GPU.
# With no device given, sentence-transformers checks whether a GPU can be
# used and otherwise falls back to the CPU.
model_kwargs = {}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Wrap the raw records as LangChain Documents; "name" is the page number for
# text records and the image file name for image-summary records.
docs_list = [
    Document(page_content=page['response'], metadata={"name": page['name']})
    for page in text_data
]
img_list = [
    Document(page_content=summary['response'], metadata={"name": summary['name']})
    for summary in img_data
]

# Split into chunks of at most 400 tokens (tiktoken-counted) with 50 overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
img_splits = text_splitter.split_documents(img_list)

In [22]:
# Index page-text chunks and image-summary chunks together in one collection.
vectorstore = Chroma.from_documents(
    documents=[*doc_splits, *img_splits],
    embedding=embeddings,
    collection_name="multi_model_rag",
)

# Plain similarity search that returns only the single best-matching chunk.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={'k': 1})

In [36]:
# Question posed to the retriever (adjacent literals concatenate into one str).
query = (
    "What are the main challenges and opportunities of applying large language models (LLMs) to tabular data, "
    "as discussed in the 'Survey on Tabular Data'? Please summarize the key techniques, typical preprocessing steps, "
    "and evaluation metrics used for LLMs working with tabular data."
)
# Guard against stray commas turning the parenthesised literal into a tuple.
# The pieces themselves must be joined; the previous `" ".join([query])`
# raised TypeError in exactly the case it was meant to handle.
if isinstance(query, tuple):
    query = " ".join(query)

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in current LangChain releases.
docs = retriever.invoke(query)

In [37]:
# Show the best-matching chunk followed by its metadata, one per line.
print(docs[0].page_content, docs[0].metadata, sep="\n")

2. The transformation of tabular data into LLM-readable natural language addresses the curse of
dimensionality (created by the one-hot encoding of categorical data).
3. The emergent capabilities, such as step-by-step reasoning through CoT, have transformed LM from
language modeling to a more general task-solving tool. Research is needed to test the limit of LLM’s
emergent abilities on tabular data modeling.
1.4
Contribution
The key contributions of this work are as follows:
1. A formal break down of key techniques for LLMs’ applications on tabular data We
split the application of LLM in tabular data to tabular data prediction, tabular data synthesis,
tabular data question answering and table understanding. We further extract key techniques that
can apply to all applications. We organize these key techniques in a taxonomy that researchers and
practitioners can leverage to describe their methods, find relevant techniques and understand the
difference between these techniques. We further 

In [40]:
# Alternative system prompt (stricter grounding), kept for reference:
# system = """You are an assistant for QA based on academic papers.
# Always ground your answer strictly in the retrieved evidence below—no outside knowledge.
# If evidence is from an image or table summary, reference the figure/table number if present."""

# prompt = ChatPromptTemplate.from_messages([
#     ("system", system),
#     ("human", "Retrieved documents:\n\n<docs>{documents}</docs>\n\nUser question:\n{question}")
# ])

# System prompt currently in use.
system = """
You are an assistant for QA on scientific papers. 
You must:
1. You are an expert assistant answering questions about academic papers.
2. If explaining a table or figure, mention the table/figure number (e.g., "Table 5") and page.
3. Explain what the table shows *and* how it connects to the paper's main argument.
Keep it concise .
"""

# The human turn carries the retrieved context and the user's question.
prompt = ChatPromptTemplate.from_messages([
  ("system", system),
  ("human", "Context:\n<docs>{documents}</docs>\n\nQuestion:\n{question}")
])


# LLM initialization: an OpenAI-compatible endpoint whose key, base URL and
# model name are all taken from the environment.
llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url=os.getenv("OPENROUTER_URL"),
    model=os.getenv("MODEL_NAME"),
    # Optional generation settings, left disabled:
    # temperature=0.05,
    # max_tokens=256,
    # streaming=True,
    # extra_body={'repetition_penalty': 1.03},
    # top_p=0.9,
)

# Build the RAG chain: fill the prompt -> call the model -> return plain text.
rag_chain = prompt | llm | StrOutputParser()

# Retrieval. `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
# Concatenate the retrieved chunks into a single context string.
docs_joined = "\n\n".join(doc.page_content for doc in docs)

# Run the chain on the retrieved context and print the answer.
generation = rag_chain.invoke({"documents": docs_joined, "question": query})
print(generation)


Based on the provided text snippet from a survey on tabular data and LLMs:

1.  **Main Challenges:**
    *   The high dimensionality issue caused by one-hot encoding categorical data in tables (the "curse of dimensionality").
    *   The need to represent structured tabular data in a way LLMs can process effectively (transforming tabular data into LLM-readable natural language).

2.  **Main Opportunities:**
    *   Leveraging LLMs' emergent capabilities like step-by-step reasoning (Chain-of-Thought, CoT) to move beyond language modeling and tackle more general, structured data tasks (tabular data modeling, prediction, etc.).

3.  **Key Techniques:**
    *   The survey breaks down LLM applications to specific tasks: tabular data prediction, synthesis, question answering, and table understanding.
    *   It organizes techniques used (like natural language description generation from tables) into a taxonomy, subdividing them into subsections for clarity, comparison, and benchmarking.

4. 