In [1]:
import os
import instructor
from instructor import Mode
import nest_asyncio
from openai import OpenAI

# Patch asyncio so nested event-loop use is allowed; presumably needed because
# the Jupyter kernel already runs an event loop -- TODO confirm it is still required.
nest_asyncio.apply()

In [2]:
import ssl
import certifi
import urllib.request

# Verify HTTPS certificates against certifi's CA bundle instead of switching
# verification off globally: an unverified default context leaves every HTTPS
# download made through the standard library open to tampering.
def _certifi_https_context(*args, **kwargs):
    """Build the default HTTPS context, trusting certifi's CA bundle unless a CA file is given."""
    kwargs.setdefault("cafile", certifi.where())
    return ssl.create_default_context(*args, **kwargs)


ssl._create_default_https_context = _certifi_https_context


In [3]:
# --- LLM compatibility helpers ---
import asyncio
from typing import Any

def run_llm_sync(llm: Any, prompt: str, max_new_tokens: int = 256, model: str = "llama3.1:8b") -> str:
    """
    Run a single prompt synchronously against several kinds of LLM object.

    Supported objects, checked in this order:
      - OpenAI-style synchronous clients exposing ``chat.completions.create``
        (detected by duck typing, so wrapped/patched clients are accepted too)
      - wrappers with a ``.complete()`` method (e.g. the llama_index Ollama LLM)
      - HF pipeline-like callables returning a list/dict with 'generated_text'
      - generic callables, whose result is stringified

    Args:
        llm: The LLM object to call.
        prompt: The user prompt.
        max_new_tokens: Generation limit. Forwarded as ``max_tokens`` to
            OpenAI-style clients and as ``max_new_tokens`` to callables;
            not forwarded to ``.complete()``.
        model: Model name sent to OpenAI-style clients only.

    Returns:
        The generated text with surrounding whitespace removed ("" when an
        OpenAI-style client returns no content).

    Raises:
        RuntimeError: If ``llm`` matches none of the supported shapes.
    """
    # OpenAI-style client. A bound ``chat`` *method* (as on llama_index LLMs)
    # has no ``completions`` attribute, so it falls through to the next branch.
    chat = getattr(llm, "chat", None)
    completions = getattr(chat, "completions", None)
    create = getattr(completions, "create", None)
    if callable(create):
        response = create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_new_tokens,
        )
        # The message content may be None; treat that as empty text.
        content = response.choices[0].message.content
        return (content or "").strip()

    # Synchronous completion wrapper (llama_index.llms.ollama.Ollama provides .complete())
    complete = getattr(llm, "complete", None)
    if callable(complete):
        resp = complete(prompt)
        # Prefer the response's .text when it has one
        return str(getattr(resp, "text", resp)).strip()

    # HF pipeline or any other callable
    if callable(llm):
        out = llm(prompt, max_new_tokens=max_new_tokens)
        # HF pipeline returns list[{"generated_text": "..."}]
        if isinstance(out, list) and out and isinstance(out[0], dict):
            return str(out[0].get("generated_text") or "").strip()
        # Some pipelines return a dict
        if isinstance(out, dict) and "generated_text" in out:
            return str(out["generated_text"] or "").strip()
        # Fallback: stringify whatever came back
        return str(out).strip()

    # Nothing callable was found, so there is no way to run the prompt.
    raise RuntimeError(
        f"LLM sync call failed: unsupported LLM object of type {type(llm).__name__!r}"
    )

async def run_llm_async(llm: Any, prompt: str, max_new_tokens: int = 512) -> str:
    """
    Run a single prompt without blocking the event loop.

    Uses ``llm.acomplete()`` when the object provides it; otherwise runs
    ``run_llm_sync`` in the default executor.

    Args:
        llm: The LLM object to call.
        prompt: The user prompt.
        max_new_tokens: Generation limit, forwarded to ``run_llm_sync`` on the
            fallback path only (``acomplete`` is called with the prompt alone).

    Returns:
        The generated text with surrounding whitespace removed.
    """
    # Native async completion (e.g. the llama_index Ollama wrapper's .acomplete)
    acomplete = getattr(llm, "acomplete", None)
    if callable(acomplete):
        resp = await acomplete(prompt)
        return str(getattr(resp, "text", resp)).strip()

    # Fallback: run the synchronous helper in the default executor.
    # get_running_loop() is the supported way to obtain the loop inside a coroutine.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, run_llm_sync, llm, prompt, max_new_tokens)


In [4]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# Default LLM for LlamaIndex components (the query engine built later uses it).
# NOTE(review): the run recorded at the end of this notebook failed with
# "model requires more system memory (20.3 GiB) than is available"; presumably
# the model's full context window was being requested. Capping context_window
# bounds that footprint -- tune the value to the machine and confirm.
# The request timeout is raised because individual calls in the recorded run
# took several minutes.
Settings.llm = Ollama(
    model="llama3.1:8b",
    context_window=8192,
    request_timeout=300.0,
)



## Download arXiv papers by topic

In [5]:
# Search terms; each one is used as the query of a separate arXiv search.
research_paper_topics = ["RAG", "Agent"]

In [6]:
import arxiv

from pathlib import Path

def download_papers(client, topics, num_results_per_topic, dirpath="./"):
    """Download the most recently submitted arXiv papers for each topic.

    Args:
        client: An ``arxiv.Client`` used to execute the searches.
        topics: Iterable of query strings, one search per topic.
        num_results_per_topic: Maximum number of results fetched per topic.
        dirpath: Directory the PDFs are written to (created if missing).
            Defaults to the current working directory.

    Returns:
        A list with the value returned by ``download_pdf`` for each paper.
    """
    target_dir = Path(dirpath)
    target_dir.mkdir(parents=True, exist_ok=True)

    downloaded = []
    for topic in topics:
        # Newest submissions first, capped at the requested number of results
        search = arxiv.Search(
            query=topic,
            max_results=num_results_per_topic,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

        # Fetch each result and save its PDF into the target directory
        for result in client.results(search):
            downloaded.append(result.download_pdf(dirpath=str(target_dir)))

    return downloaded

def list_pdf_files(directory):
    """Return the file names of the PDF files directly inside ``directory``.

    Only regular files matching ``*.pdf`` are listed (no recursion), and the
    names are sorted so repeated runs process the papers in the same order.
    """
    return sorted(
        entry.name
        for entry in Path(directory).glob('*.pdf')
        if entry.is_file()
    )

In [7]:
# Create the arXiv API client under its own name: the global `client` is
# rebound to the instructor-patched OpenAI client in a later cell, so sharing
# the name would make this cell and that one clobber each other on re-runs.
arxiv_client = arxiv.Client()

download_papers(arxiv_client, research_paper_topics, 3)

## Parsing the documents locally with PDFReader

In [8]:
from llama_index.readers.file import PDFReader

def parse_files(pdf_files):
    """Read each PDF with a local PDFReader and collect every resulting document in one flat list.

    A file that raises during loading is reported on stdout and skipped, so one
    bad PDF does not abort the whole batch.
    """
    pdf_reader = PDFReader()
    collected = []

    for path in pdf_files:
        try:
            loaded = pdf_reader.load_data(path)

            # Normalise the reader output to a list: tuples are converted,
            # any other non-list value is wrapped.
            if isinstance(loaded, tuple):
                loaded = list(loaded)
            elif not isinstance(loaded, list):
                loaded = [loaded]

            collected += loaded
            print(f"Parsed: {path}")
        except Exception as err:
            print(f"Failed to parse {path}: {err}")

    return collected

In [9]:
# Look for PDFs in the current working directory.
directory = './'
pdf_files = list_pdf_files(directory)

# Parse every PDF found into a flat list of documents.
documents = parse_files(pdf_files)

Parsed: 2510.11483v1.Uncertainty_Quantification_for_Retrieval_Augmented_Reasoning.pdf
Parsed: 2510.11541v1.Query_Specific_GNN__A_Comprehensive_Graph_Representation_Learning_Method_for_Retrieval_Augmented_Generation.pdf
Parsed: 2510.11654v1.FinVet__A_Collaborative_Framework_of_RAG_and_External_Fact_Checking_Agents_for_Financial_Misinformation_Detection.pdf
Parsed: 2510.11694v1.Operand_Quant__A_Single_Agent_Architecture_for_Autonomous_Machine_Learning_Engineering.pdf
Parsed: 2510.11695v1.When_Agents_Trade__Live_Multi_Market_Trading_Benchmark_for_LLM_Agents.pdf
Parsed: 2510.11701v1.Demystifying_Reinforcement_Learning_in_Agentic_Reasoning.pdf


## Local LlamaIndex pipeline (replaces the LlamaCloud pipeline)
Embedding and indexing run locally to avoid subscription fees for paid embedding services.

In [10]:
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Local embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Local chunking configuration
transform = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

# Split the parsed documents into chunk-level nodes
all_nodes = transform.get_nodes_from_documents(documents)

# Build the local vector index directly from the nodes.
# (`from_documents` expects Document objects and applies its own
# transformations, so it is the wrong entry point for pre-chunked nodes.)
index = VectorStoreIndex(nodes=all_nodes, embed_model=embed_model)

# Create a query engine from this index
query_engine = index.as_query_engine(similarity_top_k=10)


2025-10-14 17:56:56,951 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-10-14 17:57:01,326 - INFO - 1 prompt is loaded, with the key: query
2025-10-14 17:57:48,440 - INFO - HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"


## Utils
Here, we define some utilities to help us extract metadata from each document.

* Metadata - Pydantic model to extract metadata of author names, companies and general AI tags.
* get_papers_metadata - Extracts the metadata information from the research paper.

In [11]:
from pydantic import BaseModel, Field
from typing import List
from llama_index.core.prompts import PromptTemplate
from llama_index.core.async_utils import run_jobs

# Structured-output schema passed as `response_model` in get_papers_metadata.
# NOTE(review): the docstring and Field descriptions below are presumably
# emitted into the schema shown to the model, so treat them as prompt text
# rather than plain documentation -- confirm before rewording.
class Metadata(BaseModel):
    """Output containing the authors names, authors companies, and general AI tags."""

    # Required list; the description asks for an empty list when no authors are found.
    author_names: List[str] = Field(..., description="List of author names of the paper. Give empty list if not available")

    # Required list of the authors' companies; empty list when unavailable.
    author_companies: List[str] = Field(..., description="List of author companies of the paper. Give empty list if not available")

    # Required list of general AI tags; empty list when unavailable.
    ai_tags: List[str] = Field(..., description="List of general AI tags related to the paper. Give empty list if not available")

# OpenAI-compatible client pointed at a local server (port 11434, /v1 API),
# patched by instructor in JSON mode so calls can take a `response_model`.
# NOTE(review): this rebinds the global `client`, which an earlier cell uses
# for the arXiv client -- re-running that cell afterwards replaces this object.
client = instructor.patch(
    OpenAI(
        base_url='http://localhost:11434/v1',
        api_key='ollama',  # required but unused
    ),
    mode=Mode.JSON,
)

async def get_papers_metadata(text):
    """Extract author names, author companies and AI tags from a paper's text.

    Args:
        text: The paper text inserted into the prompt.

    Returns:
        The ``Metadata`` instance produced by the structured-output call.
    """
    prompt = f"""Generate authors names, authors companies, and general top 3 AI tags for the given research paper.

Research Paper:
{text}"""

    # `client` is a synchronous client: run the request in a worker thread so
    # this coroutine does not block the event loop (and so several papers can
    # be processed concurrently).
    return await asyncio.to_thread(
        client.chat.completions.create,
        model="llama3.1:8b",
        response_model=Metadata,
        messages=[{"role": "user", "content": prompt}],
    )

# Reuse the patched client as the `llm` handle: classify_query reads this
# global, and it is also passed to ReportGenerationAgent further down.
llm = client  # Simple assignment

In [16]:
import re

def extract_title(outline):
    """Return the title taken from the first non-blank line of the outline.

    Leading markdown heading markers (``#``) and surrounding whitespace are
    removed. Only the *leading* markers are stripped, so a title that ends in
    ``#`` (e.g. "Intro to C#") is preserved. An empty outline yields "".
    """
    first_line = outline.strip().split('\n', 1)[0]
    return first_line.lstrip('# ').strip()

def generate_query_with_llm_local(title, section, subsection, llm):
    """Generate a concise research query for one subsection of the report.

    Delegates to ``run_llm_sync`` so every LLM type that helper supports works
    here, not only directly callable pipelines.

    Args:
        title: Report title.
        section: Section heading the subsection belongs to.
        subsection: Subsection heading to generate the query for.
        llm: Any LLM object accepted by ``run_llm_sync``.

    Returns:
        The generated query text, or ``None`` if the LLM call failed (the
        failure is reported on stdout).
    """
    prompt = (
        f"Generate a concise research query for the report titled '{title}'. "
        f"The query should be for the subsection '{subsection}' under the section '{section}'. "
        f"The query should help gather relevant information for this part of the report."
    )

    try:
        return run_llm_sync(llm, prompt, max_new_tokens=100)
    except Exception as e:
        # Best effort: report the failure and let the caller skip this subsection
        print(f"Failed to generate query for {section} → {subsection}: {e}")
        return None


def classify_query(query):
    """Classify the query as either 'LLM' or 'INDEX' based on the query content.

    The module-level ``llm`` is asked for a label via ``run_llm_sync``. Model
    replies often wrap the label in quotes, punctuation or a short sentence,
    so the label is searched for as a whole word instead of requiring the
    reply to equal it exactly.

    Returns:
        "LLM" when the reply names only that label; "INDEX" otherwise
        (including replies that are ambiguous or contain neither label).
    """

    prompt = f"""Classify the following query as either "LLM" if it can be answered directly by a large language model with general knowledge, or "INDEX" if it likely requires querying an external index or database for specific or up-to-date information.

    Query: "{query}"

    Consider the following:
    1. If the query asks for general knowledge, concepts, or explanations, classify as "LLM".
    2. If the query asks for specific facts, recent events, or detailed information that might not be in the LLM's training data, classify as "INDEX".
    3. If unsure, err on the side of "INDEX".

    Classification:"""

    reply = run_llm_sync(llm, prompt).upper()

    # Collect the distinct labels mentioned in the reply
    labels = set(re.findall(r"\b(LLM|INDEX)\b", reply))

    if len(labels) == 1:
        return labels.pop()

    # Default to INDEX if the response is unclear
    return "INDEX"

import re

def parse_outline_and_generate_queries(outline, llm):
    """
    Parse the outline (string or dict) and use the given LLM to generate and
    classify one query per subsection.

    String outlines are parsed line by line after removing leading ``#``
    markers:
      - the first non-numbered line before any section becomes the title
      - "N. Heading" lines start a section
      - "N.M Heading" / "N.M. Heading" lines are subsections of the current
        section (subsections appearing before any section are ignored)

    Dict outlines must look like ``{"title": str, "sections": {section: [subsection, ...]}}``.

    Returns:
        ``{section: {subsection: {"query": str, "classification": str}}}``.
        Sections without subsections map to an empty dict.
    """

    # Step 1 — Convert outline string → dict automatically
    if isinstance(outline, str):
        title = None
        sections = {}
        current_section = None

        for raw_line in outline.splitlines():
            line = raw_line.strip().lstrip('#').strip()
            if not line:
                continue  # skip blank lines

            # Example: "1.1 Background" or "1.1. Background"
            if re.match(r"^\d+(\.\d+)+\.?(\s|$)", line):
                if current_section is not None:
                    sections[current_section].append(line)

            # Example: "1. Introduction"
            elif re.match(r"^\d+\.(?!\d)", line):
                current_section = line
                sections[current_section] = []

            # First free-text line ahead of the sections is the report title
            elif title is None and current_section is None:
                title = line

        outline = {"title": title or "Untitled Report", "sections": sections}

    # Step 2 — Safely access fields
    title = outline.get("title", "Untitled Report")
    sections = outline.get("sections", {})

    queries = {}

    # Step 3 — Generate a query per subsection using the LLM
    for section, subsections in sections.items():
        queries[section] = {}
        for subsection in subsections:
            prompt = (
                f"Generate one clear, concise query for the subsection '{subsection}' "
                f"under section '{section}' in the report titled '{title}'. "
                f"The query should guide research to gather relevant information."
            )

            # 1. Generate the query using the sync helper
            query_text = run_llm_sync(llm, prompt)

            # 2. Classify the generated query
            classification = classify_query(query_text)

            # 3. Store both the query and its classification
            queries[section][subsection] = {
                'query': query_text,
                'classification': classification
            }

    return queries


## ReportGenerationAgent

In [17]:
from typing import Any, List
from llama_index.core.llms.function_calling import FunctionCallingLLM
from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
from llama_index.core.workflow import Event

class ReportGenerationEvent(Event):
    """Event passed from query generation to report generation; it is created with a ``queries`` payload."""


class ReportGenerationAgent(Workflow):
    """Report generation agent.

    Two-step workflow: the first step turns the outline into one classified
    query per subsection; the second answers each query (with the LLM or the
    query engine, depending on its classification) and assembles the report.
    """

    def __init__(
        self,
        query_engine: Any,
        llm: FunctionCallingLLM | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            query_engine: Object with a ``query(str)`` method, used for
                queries classified as "INDEX".
            llm: LLM object accepted by ``run_llm_sync`` / ``run_llm_async``.
                Defaults to ``Settings.llm``.
            **kwargs: Forwarded to ``Workflow.__init__``.
        """
        super().__init__(**kwargs)
        self.query_engine = query_engine
        # The only `OpenAI` imported in this notebook is the openai client,
        # whose constructor takes no `model` argument, so it cannot serve as a
        # default LLM; fall back to the globally configured one instead.
        self.llm = llm if llm is not None else Settings.llm

    async def format_report(self, section_contents, outline):
        """Assemble the final markdown report.

        Regular sections get an LLM-written summary followed by their
        subsections. If the outline has an introduction and/or a conclusion
        section, each is generated from the report built so far; outlines
        without them simply omit those parts. Section keys that do not start
        with "N." are skipped.

        Args:
            section_contents: ``{section: {subsection: answer_text}}``.
            outline: The outline string; its first line provides the title.

        Returns:
            The report as a markdown string.
        """
        report = ""
        introduction_heading = None
        conclusion_heading = None

        for section, subsections in section_contents.items():
            section_match = re.match(r'^(\d+\.)\s*(.*)$', section)
            if not section_match:
                continue

            section_num, section_title = section_match.groups()
            heading = f"# {section_num} {section_title}"
            lowered = section.lower()

            if "introduction" in lowered:
                introduction_heading = heading
            elif "conclusion" in lowered:
                conclusion_heading = heading
            else:
                combined_content = "\n".join(subsections.values())
                summary_query = f"Provide a short summary for section '{section}':\n\n{combined_content}"
                # Awaited so the event loop is not blocked while the LLM runs
                section_summary = await run_llm_async(self.llm, summary_query, max_new_tokens=256)

                report += f"{heading}\n\n{section_summary}\n\n"

                report = self.get_subsections_content(subsections, report)

        # Add introduction (written from the body assembled above)
        if introduction_heading is not None:
            introduction_query = f"Create an introduction for the report:\n\n{report}"
            introduction = await run_llm_async(self.llm, introduction_query)
            report = f"{introduction_heading}\n\n{introduction}\n\n" + report

        # Add conclusion
        if conclusion_heading is not None:
            conclusion_query = f"Create a conclusion for the report:\n\n{report}"
            conclusion = await run_llm_async(self.llm, conclusion_query)
            report += f"{conclusion_heading}\n\n{conclusion}"

        # Add title
        title = extract_title(outline)
        report = f"# {title}\n\n{report}"
        return report

    @staticmethod
    def _subsection_sort_key(subsection):
        """Sort key ordering "N.M" subsections numerically (so 3.10 follows 3.9); unnumbered keys go last."""
        match = re.search(r'(\d+)\.(\d+)', subsection)
        if match:
            return (0, (int(match.group(1)), int(match.group(2))), subsection)
        return (1, (), subsection)

    def get_subsections_content(self, subsections, report):
        """Append each subsection's heading and content to ``report`` and return the result.

        Subsections are emitted in numeric order of their "N.M" prefix.
        """
        for subsection in sorted(subsections.keys(), key=self._subsection_sort_key):
            content = subsections[subsection]
            subsection_match = re.search(r'(\d+\.\d+)\.\s*(.+)', subsection)
            if subsection_match:
                subsection_num, subsection_title = subsection_match.groups()
                report += f"## {subsection_num} {subsection_title}\n\n{content}\n\n"
            else:
                # Heading does not follow the "N.M. Title" form; use it verbatim
                report += f"## {subsection}\n\n{content}\n\n"
        return report

    def generate_section_content(self, queries, reverse=False):
        """Answer every subsection query.

        Args:
            queries: ``{section: {subsection: {"query": str, "classification": str}}}``.
            reverse: Process each section's subsections in descending key order.

        Returns:
            ``{section: {subsection: answer_text}}``.
        """
        section_contents = {}
        for section, subsections in queries.items():
            section_contents[section] = {}
            for subsection in sorted(subsections.keys(), reverse=reverse):
                data = subsections[subsection]
                query = data['query']
                classification = data['classification']
                if classification == "LLM":
                    answer = run_llm_sync(self.llm, query + " Give a short answer.")
                else:
                    # Use the engine given to this agent, not a notebook global
                    answer = str(self.query_engine.query(query))
                section_contents[section][subsection] = answer
        return section_contents

    @step(pass_context=True)
    async def queries_generation_event(self, ctx: Context, ev: StartEvent) -> ReportGenerationEvent:
        """Generate queries for the report."""

        # Store outline directly as an attribute on ctx so the next step can read it
        ctx.outline = ev.outline

        outline = ctx.outline
        queries = parse_outline_and_generate_queries(outline, self.llm)

        return ReportGenerationEvent(queries=queries)

    @step(pass_context=True)
    async def generate_report(self, ctx: Context, ev: ReportGenerationEvent) -> StopEvent:
        """Generate report."""

        outline = ctx.outline
        queries = ev.queries

        section_contents = self.generate_section_content(queries, reverse=True)
        report = await self.format_report(section_contents, outline)

        return StopEvent(result={"response": report})


## Outline of the report

In [18]:
# Report outline: first line is the title, "N." lines are sections and
# "N.M." lines are subsections.
# NOTE(review): section 3 lists fixed paper titles, while the download cell
# fetches the most recently submitted papers; presumably these titles may not
# be among the indexed PDFs -- verify before relying on section 3's content.
outline = """
# Research Paper Report on RAG - Retrieval Augmented Generation and Agentic World.

## 1. Introduction

## 2. Retrieval Augmented Generation (RAG) and Agents
2.1. Fundamentals of RAG and Agents.
2.2. Current State and Applications

## 3. Latest Papers:
3.1. HEALTH-PARIKSHA: Assessing RAG Models for Health Chatbots in Real-World Multilingual Settings
3.2. MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems
3.3. VLM-Grounder: A VLM Agent for Zero-Shot 3D Visual Grounding

## 4. Conclusion:
"""

## Generate report

In [19]:
agent = ReportGenerationAgent(
    query_engine=query_engine,
    llm=llm,
    verbose=True,
    timeout=1200.0,
)

report = await agent.run(outline=outline)

print(report['response'])

with open("report.md", "w") as f:
    f.write(report['response'])

Running step queries_generation_event


2025-10-14 18:17:21,927 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:19:45,572 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:20:27,829 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:23:22,197 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:24:17,385 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:25:47,007 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:27:38,265 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:29:44,478 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 18:30:46,709 - INFO - HTTP Request: POST http://localhost:11434/v1/chat/completions "

Step queries_generation_event produced event ReportGenerationEvent
Running step generate_report


2025-10-14 18:33:05,647 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 500 Internal Server Error"


ResponseError: model requires more system memory (20.3 GiB) than is available (11.2 GiB) (status code: 500)