# Patched local pipeline (Ollama phi3:mini)

Consolidated notebook with a single pinned Ollama instance (`index_llm`), LLM helpers, local parsing, embeddings, index creation, query engine bound to `index_llm`, and a simplified report agent.

In [1]:
import os

# Force Ollama default to phi3:mini for any low-level calls.
os.environ["OLLAMA_MODEL"] = "phi3:mini"

# Do this before importing the Ollama wrapper or anything that talks to Ollama:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

# Create a single LlamaIndex-compatible Ollama instance and pin it:
index_llm = Ollama(model="phi3:mini", request_timeout=600.0, keep_alive="0s")

# Optional: set global Settings.llm to this exact instance to catch code paths that read the global.
Settings.llm = index_llm

# Diagnostics:
print("index_llm created. model attr:", getattr(index_llm, "model", None))



index_llm created. model attr: phi3:mini


In [2]:
# Imports and SSL cert setup
import os
import ssl
import certifi
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

import asyncio
import re
from typing import Any

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document
from llama_index.core import VectorStoreIndex

import instructor
from instructor import Mode
from openai import OpenAI

import arxiv

print('Notebook imports ready')

Notebook imports ready


In [3]:
# LLM helpers
import asyncio

def run_llm_sync(llm: Any, prompt: str, max_new_tokens: int = 256) -> str:
    if hasattr(llm, "complete") and callable(getattr(llm, "complete")):
        resp = llm.complete(prompt)
        if hasattr(resp, "text"):
            return str(resp.text).strip()
        return str(resp).strip()
    if callable(llm):
        out = llm(prompt, max_new_tokens=max_new_tokens)
        if isinstance(out, list) and len(out) > 0 and isinstance(out[0], dict):
            return out[0].get("generated_text", "").strip()
        if isinstance(out, dict) and "generated_text" in out:
            return out["generated_text"].strip()
        return str(out).strip()
    try:
        from openai import OpenAI as _OpenAI
        if isinstance(llm, _OpenAI):
            resp = llm.chat.completions.create(model="phi3:mini", messages=[{"role": "user", "content": prompt}], max_tokens=max_new_tokens)
            return resp.choices[0].message.content.strip()
    except Exception:
        pass
    try:
        out = llm(prompt)
        return str(out).strip()
    except Exception as e:
        raise RuntimeError(f"LLM sync call failed: {e}")

async def run_llm_async(llm: Any, prompt: str, max_new_tokens: int = 512) -> str:
    if hasattr(llm, "acomplete") and callable(getattr(llm, "acomplete")):
        resp = await llm.acomplete(prompt)
        if hasattr(resp, "text"):
            return str(resp.text).strip()
        return str(resp).strip()
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, run_llm_sync, llm, prompt, max_new_tokens)

print('LLM helpers created')

LLM helpers created


In [4]:
# Create and pin one Ollama instance for the index
index_llm = Ollama(model="phi3:mini", request_timeout=600.0, keep_alive="0s", additional_kwargs={"num_ctx": 4096})
Settings.llm = index_llm

# Instructor/OpenAI client for metadata extraction (separate)
metadata_client = instructor.patch(
    OpenAI(base_url='http://localhost:11434/v1', api_key='ollama'),
    mode=Mode.JSON,
)
print('index_llm and metadata_client created')
print('index_llm.model =', getattr(index_llm, 'model', None))

index_llm and metadata_client created
index_llm.model = phi3:mini


In [5]:
# Parse local PDFs (if any)
from pathlib import Path
reader = PDFReader()

def list_pdf_files(directory):
    return [str(p) for p in Path(directory).glob('*.pdf')]

def parse_files(pdf_files):
    all_documents = []
    for pdf_file in pdf_files:
        try:
            docs = reader.load_data(pdf_file)
            if isinstance(docs, tuple):
                docs = list(docs)
            elif not isinstance(docs, list):
                docs = [docs]
            all_documents.extend(docs)
            print('Parsed:', pdf_file)
        except Exception as e:
            print('Failed to parse', pdf_file, e)
    return all_documents

pdf_files = list_pdf_files('.')
documents = parse_files(pdf_files)
print('Parsed documents:', len(documents))

Parsed: 2510.11483v1.Uncertainty_Quantification_for_Retrieval_Augmented_Reasoning.pdf
Parsed: 2510.11541v1.Query_Specific_GNN__A_Comprehensive_Graph_Representation_Learning_Method_for_Retrieval_Augmented_Generation.pdf
Parsed: 2510.11654v1.FinVet__A_Collaborative_Framework_of_RAG_and_External_Fact_Checking_Agents_for_Financial_Misinformation_Detection.pdf
Parsed: 2510.11694v1.Operand_Quant__A_Single_Agent_Architecture_for_Autonomous_Machine_Learning_Engineering.pdf
Parsed: 2510.11695v1.When_Agents_Trade__Live_Multi_Market_Trading_Benchmark_for_LLM_Agents.pdf
Parsed: 2510.11701v1.Demystifying_Reinforcement_Learning_in_Agentic_Reasoning.pdf
Parsed: 2510.12460v1.Probing_Latent_Knowledge_Conflict_for_Faithful_Retrieval_Augmented_Generation.pdf
Parsed: 2510.12668v1.The_Role_of_Parametric_Injection_A_Systematic_Study_of_Parametric_Retrieval_Augmented_Generation.pdf
Parsed: 2510.12750v1.VQArt_Bench__A_semantically_rich_VQA_Benchmark_for_Art_and_Cultural_Heritage.pdf
Parsed: 2510.12787v1.Ax_Pr

In [6]:
# Embeddings, chunking, index and query engine
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
transform = SentenceSplitter(chunk_size=512, chunk_overlap=20)
all_nodes = []
for doc in documents:
    all_nodes.extend(transform.get_nodes_from_documents([doc]))

index = VectorStoreIndex.from_documents(all_nodes, embed_model=embed_model)

query_engine = index.as_query_engine(
    similarity_top_k=3,
    response_mode='compact',
    llm=index_llm,
    verbose=True,
)
print('Index and query engine ready')

2025-10-15 04:28:13,319 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-10-15 04:28:17,442 - INFO - 1 prompt is loaded, with the key: query
2025-10-15 04:30:27,516 - INFO - HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"


Index and query engine ready


In [7]:
#Diagnostic Cell

import traceback
from ollama import Client as OllamaClient

# Show which models are running on the server
print("ollama ps (server-side):")
import subprocess, json, sys
try:
    out = subprocess.check_output(["ollama", "ps"], stderr=subprocess.STDOUT)
    print(out.decode())
except Exception as e:
    print("Could not run ollama ps from notebook:", e)

# Low-level check of model info:
oc = OllamaClient()
try:
    info = oc.show("phi3:mini")
    print("oc.show('phi3:mini') OK. modelinfo keys:", list(info.modelinfo.keys())[:10])
except Exception as e:
    print("oc.show failed:", e)

try:
    resp = index_llm.complete("Say hello from phi3:mini")
    if hasattr(resp, "text"):
        print("index_llm.complete.text:", resp.text[:300])
    else:
        print("index_llm.complete raw repr:", repr(resp)[:400])
except Exception as e:
    print("index_llm.complete raised:")
    traceback.print_exc()

try:
    qres = query_engine.query("What is Retrieval-Augmented Generation?")
    print("query_engine success. Short repr:", str(qres)[:400])
except Exception as e:
    print("query_engine raised:")
    traceback.print_exc()


ollama ps (server-side):
NAME    ID    SIZE    PROCESSOR    CONTEXT    UNTIL 



2025-10-15 04:30:28,324 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/show "HTTP/1.1 200 OK"


oc.show('phi3:mini') OK. modelinfo keys: ['general.architecture', 'general.basename', 'general.file_type', 'general.finetune', 'general.languages', 'general.license', 'general.license.link', 'general.parameter_count', 'general.quantization_version', 'general.size_label']


2025-10-15 04:31:01,188 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


index_llm.complete.text: Hello, I'm Phi3 and this is my mini introduction. As a virtual assistant designed to interact with users like you on platforms such as Club Penguin, remember that while I am here for playful conversations and immersive experiences in the game world of penguins or any similar environment, it’s import


2025-10-15 04:33:37,376 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


query_engine success. Short repr: RAG refers to a technique where external documents are retrieved during inference for supplementation of language models' internal knowledge in order to enhance performance on tasks that require factual information, while also dealing with limitations such as increased context length and potential comprehension issues.


In [8]:
# Helper functions used by the agent
import re

def extract_title(outline):
    first_line = outline.strip().split('\n')[0]
    return first_line.strip('# ').strip()

# reuse parse_outline_and_generate_queries from earlier patterns
def parse_outline_and_generate_queries(outline, llm):
    if isinstance(outline, str):
        lines = outline.splitlines()
        title = 'Untitled Report'
        sections = {}
        current_section = None
        for line in lines:
            line = line.strip().lstrip('#').strip()
            if not line:
                continue
            if re.match(r"^\d+\.", line) and not re.match(r"^\d+\.\d+\.", line):
                current_section = line
                sections[current_section] = []
            elif re.match(r"^\d+\.\d+\.", line) and current_section:
                sections[current_section].append(line)
        outline = {"title": title, "sections": sections}
    title = outline.get('title', 'Untitled Report')
    sections = outline.get('sections', {})
    queries = {}
    for section, subsections in sections.items():
        queries[section] = {}
        for subsection in subsections:
            prompt = f"Generate one clear, concise query for the subsection '{subsection}' under section '{section}' in the report titled '{title}'."
            query_text = run_llm_sync(llm, prompt)
            # simple classify: shorter -> LLM else INDEX (you may replace with a real classifier)
            classification = 'LLM' if len(query_text.split()) < 40 else 'INDEX'
            queries[section][subsection] = {'query': query_text, 'classification': classification}
    return queries

print('Helper functions ready')

Helper functions ready


In [9]:
from typing import Any, Dict
from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
from llama_index.core.workflow import Event

# Define the event used to pass queries from one step to the next
class ReportGenerationEvent(Event):
    """Event carrying generated queries for report generation."""
    pass

class ReportGenerationAgent(Workflow):
    """Report generation agent."""

    def __init__(self, query_engine: Any, llm: Any, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.query_engine = query_engine
        self.llm = llm

    def generate_section_content(self, queries: Dict[str, Dict[str, Dict[str, str]]], reverse: bool = False) -> Dict[str, Dict[str, str]]:
        """
        Generate content for each section/subsection. Returns a mapping:
        { section_title: { subsection_key: answer_text, ... }, ... }
        """
        section_contents: Dict[str, Dict[str, str]] = {}
        for section, subsections in queries.items():
            section_contents[section] = {}
            subsection_keys = reversed(sorted(subsections.keys())) if reverse else sorted(subsections.keys())
            for subsection in subsection_keys:
                data = subsections[subsection]
                query = data['query']
                classification = data['classification']
                if classification == "LLM":
                    answer = run_llm_sync(self.llm, query + " Give a short answer.")
                else:
                    # Use the query engine for INDEX queries
                    answer = str(self.query_engine.query(query))
                section_contents[section][subsection] = answer
        return section_contents

    async def format_report(self, section_contents: Dict[str, Dict[str, str]], outline: Any) -> str:
        """Format the report from the generated section contents and outline; returns markdown string."""
        report = ""

        for section, subsections in section_contents.items():
            section_match = re.match(r'^(\d+\.)\s*(.*)$', section)
            if section_match:
                section_num, section_title = section_match.groups()

                if "introduction" in section.lower():
                    introduction_num, introduction_title = section_num, section_title
                elif "conclusion" in section.lower():
                    conclusion_num, conclusion_title = section_num, section_title
                else:
                    combined_content = "\n".join(subsections.values())
                    summary_query = f"Provide a short summary for section '{section}':\n\n{combined_content}"
                    section_summary = run_llm_sync(self.llm, summary_query)

                    report += f"# {section_num} {section_title}\n\n{section_summary}\n\n"
                    report = self.get_subsections_content(subsections, report)

        # Add introduction (async)
        introduction_query = f"Create an introduction for the report:\n\n{report}"
        introduction = await run_llm_async(self.llm, introduction_query)
        # Prepend introduction
        report = f"# {introduction_num} {introduction_title}\n\n{introduction}\n\n" + report

        # Add conclusion (async)
        conclusion_query = f"Create a conclusion for the report:\n\n{report}"
        conclusion = await run_llm_async(self.llm, conclusion_query)
        report += f"# {conclusion_num} {conclusion_title}\n\n{conclusion}"

        # Add title
        title = extract_title(outline) if isinstance(outline, str) else outline.get("title", "Report")
        report = f"# {title}\n\n{report}"
        return report

    def get_subsections_content(self, subsections: Dict[str, str], report: str) -> str:
        """Generate content for each subsection in the outline and append to report string."""
        for subsection in sorted(subsections.keys(), key=lambda x: (re.search(r'(\d+\.\d+)', x).group(1) if re.search(r'(\d+\.\d+)', x) else x)):
            content = subsections[subsection]
            subsection_match = re.search(r'(\d+\.\d+)\.\s*(.+)', subsection)
            if subsection_match:
                subsection_num, subsection_title = subsection_match.groups()
                report += f"## {subsection_num} {subsection_title}\n\n{content}\n\n"
            else:
                report += f"## {subsection}\n\n{content}\n\n"
        return report

    # ---- workflow steps: MUST have explicit return type annotations ----

    @step(pass_context=True)
    async def queries_generation_event(self, ctx: Context, ev: StartEvent) -> ReportGenerationEvent:
        """Generate queries for the report and emit ReportGenerationEvent."""
        # store outline on context for downstream steps
        ctx.outline = ev.outline
        outline = ctx.outline
        queries = parse_outline_and_generate_queries(outline, self.llm)
        return ReportGenerationEvent(queries=queries)

    @step(pass_context=True)
    async def generate_report(self, ctx: Context, ev: ReportGenerationEvent) -> StopEvent:
        """Generate the report and return it in a StopEvent result."""
        outline = ctx.outline
        queries = ev.queries

        section_contents = self.generate_section_content(queries, reverse=True)
        report = await self.format_report(section_contents, outline)

        return StopEvent(result={"response": report})

print('Agent ready')

Agent ready


In [10]:
# Outline and run (test). Note: running the agent can be slow depending on model speed.
outline = '''
# Research Paper Report on RAG - Retrieval Augmented Generation and Agentic World.

## 1. Introduction

## 2. Retrieval Augmented Generation (RAG) and Agents
2.1. Fundamentals of RAG and Agents.
2.2. Current State and Applications

## 3. Latest Papers:
3.1. HEALTH-PARIKSHA: Assessing RAG Models for Health Chatbots in Real-World Multilingual Settings
3.2. MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems
3.3. VLM-Grounder: A VLM Agent for Zero-Shot 3D Visual Grounding

## 4. Conclusion:
'''

# Quick engine smoke test
print('Query engine smoke test:')
try:
    res = query_engine.query('What is Retrieval-Augmented Generation?')
    print('OK - query engine response preview:', str(res)[:400])
except Exception as e:
    print('Query engine failed:', e)

# Run agent
agent = ReportGenerationAgent(query_engine=query_engine, llm=index_llm)
import asyncio
try:
    rep = asyncio.run(agent.run(outline=outline))
    print('Agent completed. Writing report.md')
    with open('report.md', 'w', encoding='utf-8') as f:
        f.write(rep['response'])
    print('Wrote report.md')
except Exception as e:
    print('Agent run failed:', e)


Query engine smoke test:


2025-10-15 04:36:45,749 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


OK - query engine response preview: Retrieval-augmented generation retrieves external documents to supplement large language model's internal knowledge for enhancing performance on tasks requiring factual information and mitigating hallucinations. It aims at addressing limitations of traditional methods by incorporating additional relevant data sources during the inference time, facilitating better task execution in domains where co
Agent run failed: asyncio.run() cannot be called from a running event loop
