In [1]:
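# %pip install langchain langchain-community langchain-openai langchain-huggingface langchain-google-genai google-generativeai openai sentence-transformers chromadb tiktoken pillow pymupdf python-dotenv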

In [2]:
import fitz  # PyMuPDF
from PIL import Image
import io
import os
import os
from openai import OpenAI
from PIL import Image
import base64
from io import BytesIO

from dotenv import load_dotenv
import google.generativeai as genai
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.llms import HuggingFaceHub
from sentence_transformers import SentenceTransformer
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_openai import ChatOpenAI

# Load settings from a .env file into the process environment; later cells read
# API_KEY, OPENROUTER_API_KEY, OPENROUTER_URL and MODEL_NAME via os.getenv.
load_dotenv()




True

In [3]:
# Accumulators filled by later cells:
#   text_data -> one {"response": <page text>, "name": <page number>} record per PDF page
#   img_data  -> one {"response": ..., "name": <image file name>} record per extracted image
text_data, img_data = [], []

In [4]:
# Extract the text of every page and every embedded image from the source PDF.
#
# Side effects:
#   * rebinds `text_data` to a fresh list of {"response": <page text>, "name": <1-based page number>}
#   * writes each embedded image to extracted_images/image_<page>_<index>.<ext>

# Start from an empty list so that re-running this cell does not append the same
# pages a second time (which would put duplicate chunks into the vector store).
text_data = []

with fitz.open('training_documents/Overclocking LLM Reasoning - Monitoring and Controlling Thinking Path Lengths.pdf') as pdf_file:
    # Create the output directory; exist_ok makes the separate existence check unnecessary.
    os.makedirs("extracted_images", exist_ok=True)

    # Page numbers are 1-based in both the metadata and the image file names.
    for page_number, page in enumerate(pdf_file, start=1):
        # Get the text on the page
        text = page.get_text().strip()
        text_data.append({"response": text, "name": page_number})

        # Loop through all images referenced by the page
        for image_index, img in enumerate(page.get_images(full=True), start=1):
            xref = img[0]  # First item of the image entry is its XREF
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Decode with PIL and save under the extension reported by PyMuPDF.
            with Image.open(io.BytesIO(image_bytes)) as image:
                image.save(f"extracted_images/image_{page_number}_{image_index}.{image_ext}")
        

In [5]:
# OpenAI-compatible client pointed at OpenRouter; the key is read from the API_KEY
# environment variable.
api_key = os.getenv('API_KEY')
client = OpenAI(
    api_key=api_key,
    # OpenRouter exposes its OpenAI-compatible API under /api/v1 (not /v1).
    base_url="https://openrouter.ai/api/v1"
)

def image_to_base64(image_path, max_size=1024):
    """Return the image at `image_path` as a base64-encoded JPEG string.

    The image is shrunk in place (aspect ratio preserved by `thumbnail`) when its
    longest side exceeds `max_size`, converted to RGB when it is in any other
    mode, and then encoded as JPEG at quality 70.

    Args:
        image_path: Path of the image file to open with PIL.
        max_size: Upper bound, in pixels, for the longest side.

    Returns:
        The JPEG bytes, base64-encoded and decoded to `str`.
    """
    with Image.open(image_path) as source:
        longest_side = max(source.size)
        if longest_side > max_size:
            source.thumbnail((max_size, max_size))

        # JPEG is written from an RGB image; other modes are converted first.
        rgb_image = source if source.mode == "RGB" else source.convert("RGB")

        jpeg_buffer = BytesIO()
        rgb_image.save(jpeg_buffer, format="JPEG", quality=70, optimize=True)
        encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode()


# Ask a vision model for a retrieval-oriented summary of every extracted image.
# Result: `img_data` is a list of {"response": <summary text>, "name": <image file name>}.

# System prompt is identical for every image, so build it once outside the loop.
IMAGE_SUMMARY_PROMPT = """
                        You are an AI assistant helping build a retrieval system from academic papers. 
                        The input is a table or figure image extracted from a paper. Summarize the image with reference to the core topic or claim being visualized. 
                        Include comparisons, axes, legends, and what this visual proves or supports in context of the paper. 
                        Your summary will be embedded and must serve as a high-quality retrieval chunk. Be specific, concise, and factually grounded.
                        """

img_data = []
# os.listdir returns entries in arbitrary order; sort so runs are reproducible.
for img in sorted(os.listdir("extracted_images")):
    img_path = f"extracted_images/{img}"
    img_b64 = image_to_base64(img_path)
    response = client.chat.completions.create(
        model="google/gemini-2.0-flash-001",
        messages=[
            {"role": "system", "content": IMAGE_SUMMARY_PROMPT},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # image_to_base64 always encodes JPEG, so the data URL
                            # must declare image/jpeg rather than image/png.
                            "url": f"data:image/jpeg;base64,{img_b64}"
                        }
                    }
                ]
            }
        ]
    )
    # Keep only the generated text: downstream the value becomes
    # Document.page_content, which must be a string, not the response object.
    summary = response.choices[0].message.content
    if not summary:
        # An empty or missing completion would produce an unusable chunk.
        print(f"No summary returned for {img}; skipping")
        continue
    img_data.append({"response": summary, "name": img})

In [6]:
# Build the embedding model and split the page texts / image summaries into chunks.

# Local import: torch is installed alongside sentence-transformers, which this
# notebook already imports; it is only needed here to pick the device.
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Use the GPU when one is available, otherwise fall back to CPU instead of
# failing on machines without CUDA.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Wrap the raw records as Documents; "name" is the page number for text and the
# image file name for image summaries.
docs_list = [Document(page_content=text['response'], metadata={"name": text['name']}) for text in text_data]
img_list = [Document(page_content=img['response'], metadata={"name": img['name']}) for img in img_data]

# Split into chunks of 400 with an overlap of 50, measured by the tiktoken encoder.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)

doc_splits = text_splitter.split_documents(docs_list)
img_splits = text_splitter.split_documents(img_list)

In [7]:
# Index the text chunks and the image-summary chunks together in one Chroma collection.
all_splits = doc_splits + img_splits
vectorstore = Chroma.from_documents(
    documents=all_splits,
    collection_name="multi_model_rag",
    embedding=embeddings,
)

# Similarity-search retriever that returns only the single closest chunk (k=1).
retriever_search_kwargs = {'k': 1}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

In [8]:
# Question used for retrieval (and later for generation).
query = (
    """
            In the "Overclocking LLM Reasoning" paper, there is a set of bar graphs comparing prediction errors for different prompt styles and reasoning path lengths.

            Explain what these bar graphs show about the model’s ability to predict ideal progress across different settings.

            Specifically:

            What does the height of the bars represent?

            How do different prompt styles or bin lengths affect prediction error?

            What does this reveal about the robustness of the TPV (Thinking Progress Vector) model?
                
    """
)
# The parentheses above do not create a tuple, so `query` is normally already a str.
# If it is ever written as a tuple of fragments, join the fragments themselves;
# the previous `" ".join([query])` would raise TypeError on a tuple.
if isinstance(query, tuple):
    query = " ".join(query)

# `get_relevant_documents` is deprecated in current LangChain; `invoke` is the
# supported way to run a retriever and returns the same list of Documents.
docs = retriever.invoke(query)

  docs = retriever.get_relevant_documents(query)


In [9]:
# Show the best-matching chunk and the metadata it was indexed with.
top_doc = docs[0]
print(top_doc.page_content)
print(top_doc.metadata)

show that the relative position can be captured by projections that we term “progress vectors".
The extracted information is then used to create an interactive loading bar visualization, see Figure 1(a)
that depicts the model’s progress throughout the thinking phase, making the reasoning process more
transparent and easier for users to collaborate with.
The ability to extract progress information does not mean that the model employs it mechanistically,
unless an intervention analysis is performed. We thus manipulate the internal representation along
the progress vectors and achieve a clear modulation of the length of the thinking phase, showing
overclocking effects. The former is depicted in Figure 1(b). Reassuringly, this modulation does not
tend to be detrimental to the LLM’s performance. In fact, we show that overclocking can improve the
model’s performance by mitigating overthinking, enhancing computational efficiency, and tailoring
the model’s reasoning depth to each task’s comple

In [None]:
# Answer the query with an LLM, using the retrieved chunks as context.

# System instructions for the answering model.
system = """
You are an assistant for QA on scientific papers. 
You must:
1. You are an expert assistant answering questions about academic papers.
2. If explaining a table or figure, mention the table/figure number (e.g., "Table 5") and page.
3. Explain what the table shows *and* how it connects to the paper's main argument.
Keep it concise .
"""

# {documents} and {question} are filled in when the chain is invoked.
prompt = ChatPromptTemplate.from_messages([
  ("system", system),
  ("human", "Context:\n<docs>{documents}</docs>\n\nQuestion:\n{question}")
])


# LLM Initialization: key, endpoint and model name all come from the environment.
llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url=os.getenv("OPENROUTER_URL"),
    model=os.getenv("MODEL_NAME"),
    max_tokens=32000,
)

# Build the RAG chain: prompt -> model -> plain string
rag_chain = prompt | llm | StrOutputParser()

# Retrieval. `invoke` replaces the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
docs_joined = "\n\n".join(doc.page_content for doc in docs)

# Run
generation = rag_chain.invoke({"documents": docs_joined, "question": query})
print(generation)

In the "Overclocking LLM Reasoning" paper, the set of bar graphs comparing prediction errors illustrates how well the model can estimate its ideal progress during reasoning under various prompt styles and reasoning path lengths. 

1. **Height of the Bars**: The height of the bars in the graphs represents the level of prediction error, with lower bars indicating better accuracy in the model's estimations of its progress within the thinking phase. 

2. **Effects of Different Prompt Styles and Bin Lengths**: Variations in prompt styles or bin lengths influence prediction error, where certain styles may lead to more accurate progress estimations, resulting in shorter bars (lower prediction errors). For example, more structured or specific prompts could guide the model toward its ideal reasoning trajectory more effectively than vague prompts. Similarly, bin lengths that better align with the model's reasoning process may yield lower prediction errors.

3. **Robustness of the TPV Model**: Th