In [1]:
# Cell 1 — run first
import os

# Disable Autogen telemetry (if you later import autogen)
os.environ["AUTOGEN_DISABLE_TELEMETRY"] = "true"

# Silence HF symlink warning on Windows (optional)
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Install required packages (uncomment if not installed)
# Use !pip in notebook to install into the notebook env
!pip install -q sentence-transformers PyPDF2

print("Ready — make sure Ollama is running (run `ollama list` in a terminal).")


Ready — make sure Ollama is running (run `ollama list` in a terminal).


In [2]:
# Cell 2
from sentence_transformers import SentenceTransformer
import numpy as np
from PyPDF2 import PdfReader
import subprocess
import math
import textwrap


In [3]:
# Cell 3
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are kept in document order and separated by a blank line. A page for
    which ``extract_text()`` returns nothing contributes an empty string.
    """
    page_texts = [
        (pg.extract_text() or "") for pg in PdfReader(pdf_path).pages
    ]
    return "\n\n".join(page_texts)

def chunk_text(text, max_chars=1200, overlap=200):
    """Split ``text`` into character-based chunks that overlap.

    A window of ``max_chars`` characters slides over the text, advancing by
    ``max_chars - overlap`` characters each time. Chunking stops as soon as a
    window reaches the end of the text, so the final chunk is never a mere
    suffix of the one before it.

    Args:
        text: The string to split.
        max_chars: Window size in characters (measured before stripping);
            must be positive.
        overlap: Number of characters shared by consecutive windows; must be
            smaller than ``max_chars`` so that every window moves forward.

    Returns:
        List of dicts ``[{'id': i, 'text': chunk_text}, ...]`` with ids
        numbered from 0 and surrounding whitespace stripped from each chunk.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars`` (the window could never advance).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    step = max_chars - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_chars")

    chunks = []
    n = len(text)
    start = 0
    while start < n:
        end = start + max_chars
        chunks.append({"id": len(chunks), "text": text[start:end].strip()})
        if end >= n:
            # This window already covers the tail of the text; another window
            # would only repeat characters that are already in this chunk.
            break
        start += step
    return chunks

# Example: load your PDF and create chunks.
# The names bound here (`pdf_path`, `doc_text`, `chunks`) are module-level;
# `chunks` is read again by the embedding cell further down.
pdf_path = "BELT STOCK.pdf"   # <-- change to your PDF filename/path
print("Extracting text (this may take a few seconds)...")
doc_text = extract_text_from_pdf(pdf_path)
print(f"Total characters extracted: {len(doc_text)}")

# Character windows of 1200 with 200 characters shared between neighbours.
chunks = chunk_text(doc_text, max_chars=1200, overlap=200)
print(f"Created {len(chunks)} chunks.")


Extracting text (this may take a few seconds)...
Total characters extracted: 993
Created 1 chunks.


In [4]:
# Cell 4
# Load the sentence-embedding model named below; `embedder` is also used
# later by `ask` to embed the user's question.
print("Loading embedding model (this will download if not present)...")
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name)

# Create embeddings for chunks: one input string per chunk, in chunk order,
# requested as a NumPy array (convert_to_numpy=True).
texts = [c["text"] for c in chunks]
embs = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Normalize embeddings for cosine similarity speed
def normalize(v):
    """Scale every row of the 2-D array ``v`` to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they come
    back unchanged (all zeros) rather than as NaN.
    """
    row_lengths = np.linalg.norm(v, axis=1, keepdims=True)
    zero_rows = row_lengths == 0
    row_lengths[zero_rows] = 1.0  # avoid division by zero
    return np.divide(v, row_lengths)

# Unit-length rows: a dot product with a unit-length query is then the cosine similarity.
embs_norm = normalize(embs)

class SimpleVectorStore:
    """Minimal in-memory vector store with brute-force similarity search.

    Row ``i`` of the embedding matrix belongs to ``chunks[i]``. Scores are dot
    products between the stored rows and the unit-normalized query, which is
    the cosine similarity when the stored rows are themselves unit length.
    """

    def __init__(self, embeddings_norm, chunks):
        """Store the embedding matrix and the chunks it describes.

        Args:
            embeddings_norm: numpy array of shape (n, d); rows are expected to
                be normalized already (they are not re-normalized here).
            chunks: Sequence of n chunk objects, aligned with the rows.
        """
        self.emb = embeddings_norm  # numpy array (n, d)
        self.chunks = chunks

    def search(self, query_emb, top_k=3):
        """Return up to ``top_k`` ``(chunk, score)`` pairs, best score first.

        Args:
            query_emb: Query vector with d elements. Any layout is accepted
                ((d,), (1, d), (d, 1) or a plain sequence); it is flattened
                before use.
            top_k: Maximum number of results. Values <= 0 give an empty list;
                values larger than the store return every chunk.

        Returns:
            List of ``(chunk, float_score)`` tuples sorted by descending
            score. Equal scores keep their original chunk order.
        """
        # Flatten first so a (1, d) query, e.g. the un-indexed result of
        # encoding a one-element list, multiplies cleanly with (n, d).
        q = np.asarray(query_emb).ravel()
        # Epsilon keeps an all-zero query from dividing by zero.
        q = q / (np.linalg.norm(q) + 1e-12)
        scores = (self.emb @ q).ravel()  # cosine similarity
        limit = max(int(top_k), 0)  # a negative slice bound would drop hits from the end
        order = np.argsort(-scores, kind="stable")[:limit]
        return [(self.chunks[i], float(scores[i])) for i in order]

# Module-level store used by `ask`; row i of `embs_norm` is paired with `chunks[i]`.
store = SimpleVectorStore(embs_norm, chunks)
print("Vector store ready.")


Loading embedding model (this will download if not present)...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Vector store ready.


In [9]:
# Cell 5
OLLAMA_MODEL = "llama3.2:latest"   # change if needed

def call_ollama_cli(prompt, model=OLLAMA_MODEL, timeout=120):
    """Run ``ollama run <model> <prompt>`` and return the reply as text.

    Failures are reported in-band rather than raised: every failure path
    returns a string that starts with ``"ERROR"``.

    Args:
        prompt: Full prompt text, passed as a single command-line argument.
        model: Ollama model name/tag to run.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The process's stripped stdout on success; otherwise an ``"ERROR..."``
        string describing a timeout, a missing/unlaunchable ``ollama``
        executable, or a non-zero exit code (with stripped stderr).
    """
    # List-form argv (no shell), so the prompt needs no quoting or escaping.
    cmd = ["ollama", "run", model, prompt]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly as UTF-8 instead of the locale's default
            # codec, and never raise on an undecodable byte.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "ERROR: Ollama call timed out."
    except FileNotFoundError:
        return "ERROR: `ollama` executable not found. Is Ollama installed and on PATH?"
    except OSError as exc:
        return f"ERROR: could not start Ollama: {exc}"

    if proc.returncode != 0:
        return f"ERROR (return code {proc.returncode}):\n{proc.stderr.strip()}"
    return proc.stdout.strip()

def _build_context_block(hits, max_context_chars, max_snippet_chars):
    """Format search hits as ``[chunk <id>] <text>`` snippets within a character budget."""
    snippets = []
    remaining = max_context_chars
    for chunk, _score in hits:
        if remaining <= 0:
            break
        # Cap each snippet on its own, then by what is left of the total
        # budget (only snippet text counts, not the "[chunk N]" label).
        text = chunk["text"].strip()[:max_snippet_chars][:remaining]
        if not text:
            continue
        snippets.append(f"[chunk {chunk['id']}] {text}")
        remaining -= len(text)
    return "\n\n".join(snippets)


def ask(question, top_k=3, max_context_chars=3000, model=OLLAMA_MODEL, max_snippet_chars=1000):
    """Answer ``question`` with the Ollama model, grounded in retrieved chunks.

    Embeds the question with the module-level ``embedder``, fetches the
    ``top_k`` best chunks from the module-level ``store``, places them in a
    prompt (each labelled with its chunk id for traceability) and sends the
    prompt to ``call_ollama_cli``.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve.
        max_context_chars: Upper bound on the total snippet text placed in
            the prompt; the last snippet is truncated to fit and any
            remaining hits are dropped.
        model: Ollama model name passed through to ``call_ollama_cli``.
        max_snippet_chars: Upper bound on the text taken from any single
            chunk; ``None`` removes the per-snippet cap.

    Returns:
        Whatever ``call_ollama_cli`` returns for the composed prompt.
    """
    # 1) embed query
    q_emb = embedder.encode([question], convert_to_numpy=True)[0]
    # 2) search
    hits = store.search(q_emb, top_k=top_k)
    # 3) build context: id-labelled snippets, bounded in total length
    context_block = _build_context_block(hits, max_context_chars, max_snippet_chars)

    # 4) Compose prompt for the model
    system_msg = (
        "You are an assistant that answers questions using the provided CONTEXT. "
        "If the context doesn't contain the answer, say you don't know instead of making things up. "
        "Be concise and cite the relevant context snippet numbers if useful."
    )

    # Joined line by line rather than dedenting an indented template:
    # multi-line snippets would otherwise defeat the dedent and leave the
    # template's indentation in the prompt.
    prompt = "\n".join(
        [
            system_msg,
            "",
            "CONTEXT (retrieved snippets):",
            context_block,
            "",
            "USER QUESTION:",
            question,
            "",
            "ANSWER:",
        ]
    ).strip()

    # 5) Call Ollama via CLI
    print("Calling Ollama... (this may take a few seconds)")
    out = call_ollama_cli(prompt, model=model)
    return out


In [6]:
# Cell 6 — ask a question
# Retrieve up to 4 chunks for this question and print the model's reply.
question = "What is the total belt stock available"
answer = ask(question, top_k=4)
print("-----\nModel answer:\n")
print(answer)


Calling Ollama... (this may take a few seconds)
-----
Model answer:

TOTAL BELT STOCK IN HAND = 9655 MTRS
