In [34]:
from openai import OpenAI
from dotenv import load_dotenv
from chromadb import PersistentClient
import litellm
from pydantic import BaseModel, Field
from pathlib import Path
from tenacity import retry, wait_exponential
from tqdm import tqdm
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go


In [12]:
MODEL = "gpt-oss:20b"
OLLAMA_BASE_URL = "http://localhost:11434/v1"
DB_NAME = "preprocessed_db"
collection_name = "docs"
embedding_model = "text-embedding-3-large"
KNOWLEDGE_BASE_PATH = Path("notes-new")
AVERAGE_CHUNK_SIZE = 500
# KNOWLEDGE_BASE_PATH

In [8]:
openai = OpenAI(base_url=OLLAMA_BASE_URL,api_key='ollama')

## Define Classes

In [9]:
# Inspired by LangChain's Document - let's have something similar

class Result(BaseModel):
    page_content: str
    metadata: dict

In [10]:
# A class to perfectly represent a chunk

class Chunk(BaseModel):
    headline: str = Field(description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query")
    summary: str = Field(description="A few sentences summarizing the content of this chunk to answer common questions")
    original_text: str = Field(description="The original text of this chunk from the provided document, exactly as is, not changed in any way")

    def as_result(self, document):
        metadata = {"source": document["source"], "type": document["type"]}
        return Result(page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,metadata=metadata)


class Chunks(BaseModel):
    chunks: list[Chunk]

## Fetch Documents from knowledge base

In [None]:
def fetch_documents():
    """A homemade version of the LangChain DirectoryLoader"""

    documents = []

    for folder in KNOWLEDGE_BASE_PATH.iterdir():
        doc_type = folder.name
        for file in folder.rglob("*.md"):
            with open(file, "r", encoding="utf-8") as f:
                documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})

    print(f"Loaded {len(documents)} documents")
    return documents

In [14]:
documents = fetch_documents()

Loaded 146 documents


## Use LLM for Chunking

In [15]:
def make_prompt(document):
    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1

    return f"""
You are preparing documents for a fully local Retrieval-Augmented Generation (RAG) system.

The source is an Obsidian knowledge vault (Markdown files).

This content will be embedded and stored in a vector database for semantic + keyword retrieval.

Document metadata:
- Vault category: {document["type"]}
- File path: {document["source"]}

Your task is to split this Markdown document into overlapping, retrieval-optimized chunks.

IMPORTANT OBJECTIVES:

1. Preserve Markdown structure:
   - Headers (#, ##, ###)
   - Bullet lists
   - Code blocks
   - Tables
   - Paragraph boundaries

2. Respect semantic coherence:
   - Do NOT split in the middle of logical ideas.
   - Prefer chunk boundaries at section headers.
   - Sub-chunk long sections if needed.

3. Maintain Obsidian context:
   - Preserve wiki links ([[Note Name]])
   - Preserve tags (#tag)
   - Keep heading hierarchy
   - Do NOT remove backlinks or references.

4. Chunking rules:
   - Target approximately {how_many} chunks (flexible).
   - Each chunk should be ~400–800 tokens.
   - Include ~20–25% overlap (or ~50–100 words) between adjacent chunks.
   - Ensure NO information is lost.

5. For each chunk, output:

- chunk_id (incrementing integer)
- headline (derived from closest Markdown header)
- summary (2–3 sentence semantic summary)
- headings (full heading path, e.g. H1 > H2 > H3)
- tags (if present)
- wiki_links (all [[links]] inside this chunk)
- text (original chunk content, unmodified Markdown)

Together, your chunks MUST reconstruct the entire document with overlap.

This content will later be used for:

- Dense embedding retrieval
- BM25 keyword search
- Graph-based retrieval via backlinks
- Context assembly for a local LLM

Do NOT hallucinate.
Do NOT rewrite content.
Do NOT drop formatting.

Here is the document:

{document["text"]}

Respond ONLY with a structured list of chunks following the schema above.
"""

In [17]:
def make_messages(document):
    return [
        {"role": "user", "content": make_prompt(document)},
    ]

In [None]:
from litellm import completion

def process_document(document):
    messages = make_messages(document)
    response = completion(
        model="ollama/MODEL_NAME",  # prefix with "ollama/"
        messages=messages,
        response_format=Chunks
    )
    reply = response.choices[0].message.content
    doc_as_chunks = Chunks.model_validate_json(reply).chunks
    return [chunk.as_result(document) for chunk in doc_as_chunks]

In [39]:
import ollama
import json

def process_document(document):
    messages = make_messages(document)
    
    response = ollama.chat(
        model="qwen2.5:14b",
        messages=messages,
        format=Chunks.model_json_schema(),
        options={
            "num_predict": 8192,  # Max tokens to generate
            "temperature": 0.1,   # Lower temperature for more consistent JSON
        }
    )
    
    reply = response['message']['content']
    
    if not reply:
        raise ValueError("Model returned empty response")
    
    print("Reply length:", len(reply))  # Debug
    
    doc_as_chunks = Chunks.model_validate_json(reply).chunks
    return [chunk.as_result(document) for chunk in doc_as_chunks]

In [40]:
def create_chunks(documents):
    chunks = []
    for doc in tqdm(documents):
        chunks.extend(process_document(doc))
    return chunks

In [41]:
chunks = create_chunks(documents)

  1%|          | 1/146 [01:59<4:47:48, 119.09s/it]

Reply length: 6735


  1%|▏         | 2/146 [03:10<3:38:03, 90.86s/it] 

Reply length: 4866


  2%|▏         | 3/146 [04:02<2:54:45, 73.33s/it]

Reply length: 3919


  3%|▎         | 4/146 [06:35<4:08:04, 104.82s/it]

Reply length: 6588


  3%|▎         | 5/146 [07:15<3:10:52, 81.22s/it] 

Reply length: 2770


  4%|▍         | 6/146 [08:36<3:09:58, 81.42s/it]

Reply length: 5585


  5%|▍         | 7/146 [09:30<2:47:14, 72.19s/it]

Reply length: 4071


  5%|▌         | 8/146 [11:11<3:07:22, 81.47s/it]

Reply length: 7414


  6%|▌         | 9/146 [11:16<2:11:38, 57.65s/it]

Reply length: 290


  6%|▌         | 9/146 [19:02<4:49:49, 126.93s/it]

Reply length: 30321





ValidationError: 1 validation error for Chunks
  Invalid JSON: EOF while parsing a string at line 54 column 1829 [type=json_invalid, input_value='{\n  "chunks": [\n    {\...retriever = vectorstore', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid

In [43]:
import re
from typing import List, Dict, Tuple
from dataclasses import dataclass

In [44]:


@dataclass
class ChunkBoundary:
    """Represents a potential chunk boundary with context"""
    position: int
    header_level: int
    header_text: str
    priority: int  # Higher = better boundary

def extract_markdown_structure(text: str) -> List[ChunkBoundary]:
    """Find all potential chunk boundaries (headers, paragraph breaks)"""
    boundaries = []
    
    # Find all headers
    for match in re.finditer(r'^(#{1,6})\s+(.+)$', text, re.MULTILINE):
        level = len(match.group(1))
        boundaries.append(ChunkBoundary(
            position=match.start(),
            header_level=level,
            header_text=match.group(2).strip(),
            priority=10 - level  # H1 = priority 9, H2 = 8, etc.
        ))
    
    # Find paragraph breaks (double newlines)
    for match in re.finditer(r'\n\n+', text):
        boundaries.append(ChunkBoundary(
            position=match.end(),
            header_level=99,  # Not a header
            header_text="",
            priority=1
        ))
    
    return sorted(boundaries, key=lambda b: b.position)

def find_best_boundary(text: str, target_pos: int, boundaries: List[ChunkBoundary], 
                       min_pos: int, max_pos: int) -> int:
    """Find the best chunk boundary near target_pos"""
    candidates = [b for b in boundaries if min_pos <= b.position <= max_pos]
    
    if not candidates:
        # Fallback: find nearest sentence boundary
        for offset in range(0, max_pos - target_pos + 1, 50):
            for punct in ['. ', '.\n', '! ', '\n']:
                pos = text.find(punct, target_pos + offset)
                if min_pos <= pos <= max_pos:
                    return pos + len(punct)
        return min(max_pos, len(text))
    
    # Prefer boundaries closer to target with higher priority
    best = max(candidates, key=lambda b: (
        b.priority * 100 - abs(b.position - target_pos)
    ))
    return best.position

def split_text_into_chunks(text: str, target_chunk_size: int = 600, 
                          overlap_size: int = 100) -> List[Dict]:
    """Split text into overlapping chunks using smart boundaries"""
    if not text.strip():
        return []
    
    boundaries = extract_markdown_structure(text)
    chunks = []
    start = 0
    chunk_id = 0
    
    while start < len(text):
        # Calculate target end position
        target_end = start + target_chunk_size
        
        if target_end >= len(text):
            # Last chunk - take everything
            chunks.append({
                'chunk_id': chunk_id,
                'text': text[start:],
                'start_pos': start,
                'end_pos': len(text)
            })
            break
        
        # Find best boundary within acceptable range
        min_end = start + int(target_chunk_size * 0.7)  # At least 70% of target
        max_end = start + int(target_chunk_size * 1.3)  # At most 130% of target
        
        actual_end = find_best_boundary(text, target_end, boundaries, 
                                       min_end, min(max_end, len(text)))
        
        chunks.append({
            'chunk_id': chunk_id,
            'text': text[start:actual_end],
            'start_pos': start,
            'end_pos': actual_end
        })
        
        # Next chunk starts with overlap
        start = max(start + 1, actual_end - overlap_size)
        chunk_id += 1
    
    return chunks

def extract_chunk_context(chunk_text: str, full_text: str, start_pos: int) -> Dict:
    """Extract structural context for a chunk"""
    # Find the heading hierarchy for this chunk
    text_before = full_text[:start_pos]
    headings = []
    
    for match in re.finditer(r'^(#{1,6})\s+(.+)$', text_before, re.MULTILINE):
        level = len(match.group(1))
        text = match.group(2).strip()
        
        # Keep only headings that are ancestors of current position
        headings = [h for h in headings if h['level'] < level]
        headings.append({'level': level, 'text': text})
    
    # Find closest header in current chunk
    chunk_headers = list(re.finditer(r'^(#{1,6})\s+(.+)$', chunk_text, re.MULTILINE))
    headline = chunk_headers[0].group(2).strip() if chunk_headers else (
        headings[-1]['text'] if headings else "Untitled"
    )
    
    # Extract tags
    tags = list(set(re.findall(r'#[\w\-]+', chunk_text)))
    
    # Extract wiki links
    wiki_links = list(set(re.findall(r'\[\[([^\]]+)\]\]', chunk_text)))
    
    return {
        'headline': headline,
        'headings': ' > '.join(h['text'] for h in headings),
        'tags': tags,
        'wiki_links': wiki_links
    }

def make_metadata_prompt(chunk: Dict, document: Dict) -> str:
    """Create a focused prompt for metadata generation only"""
    return f"""Generate a concise 2-3 sentence semantic summary for this chunk.

Document context:
- Vault category: {document["type"]}
- File path: {document["source"]}
- Heading context: {chunk['headings']}
- Headline: {chunk['headline']}

Chunk content:
{chunk['text'][:1000]}{'...' if len(chunk['text']) > 1000 else ''}

Respond with ONLY a JSON object:
{{"summary": "your 2-3 sentence summary here"}}"""

def generate_chunk_metadata(chunk: Dict, document: Dict) -> str:
    """Use LLM to generate only the summary"""
    import ollama
    import json
    
    response = ollama.chat(
        model="qwen2.5:14b",
        messages=[{"role": "user", "content": make_metadata_prompt(chunk, document)}],
        format={"type": "object", "properties": {"summary": {"type": "string"}}},
        options={"temperature": 0.3, "num_predict": 256}
    )
    
    result = json.loads(response['message']['content'])
    return result.get('summary', '')

def process_document(document: Dict) -> List[Dict]:
    """Process a document into enriched chunks"""
    text = document["text"]
    
    # Step 1: Split text using Python (fast, reliable)
    raw_chunks = split_text_into_chunks(text, target_chunk_size=600, overlap_size=100)
    
    # Step 2: Extract structural metadata (fast, deterministic)
    for chunk in raw_chunks:
        context = extract_chunk_context(chunk['text'], text, chunk['start_pos'])
        chunk.update(context)
    
    # Step 3: Generate semantic summaries using LLM (only when needed)
    enriched_chunks = []
    for chunk in raw_chunks:
        summary = generate_chunk_metadata(chunk, document)
        
        enriched_chunks.append({
            'chunk_id': chunk['chunk_id'],
            'headline': chunk['headline'],
            'summary': summary,
            'headings': chunk['headings'],
            'tags': chunk['tags'],
            'wiki_links': chunk['wiki_links'],
            'text': chunk['text'],
            'source': document['source'],
            'type': document['type']
        })
    
    return enriched_chunks

def create_chunks(documents: List[Dict]) -> List[Dict]:
    """Process all documents"""
    from tqdm import tqdm
    chunks = []
    for doc in tqdm(documents):
        chunks.extend(process_document(doc))
    return chunks

# Usage
chunks = create_chunks(documents)

  0%|          | 0/146 [00:00<?, ?it/s]

100%|██████████| 146/146 [32:37<00:00, 13.40s/it] 


In [46]:
print(len(chunks))

574


## Create embeddings

In [54]:
from sentence_transformers import SentenceTransformer
import json

def create_embeddings(chunks):
    chroma = PersistentClient(path=DB_NAME)
    if collection_name in [c.name for c in chroma.list_collections()]:
        chroma.delete_collection(collection_name)
    
    # Extract text from dict chunks
    texts = [chunk['text'] for chunk in chunks]
    
    # Extract and clean metadata - convert lists to JSON strings
    metas = []
    for chunk in chunks:
        meta = {}
        for k, v in chunk.items():
            if k == 'text':
                continue
            # Convert lists to JSON strings
            if isinstance(v, list):
                meta[k] = json.dumps(v) if v else ""
            else:
                meta[k] = v
        metas.append(meta)
    
    # Load the all-mpnet-base-v2 model (downloads on first use)
    model = SentenceTransformer('all-mpnet-base-v2')
    
    # Generate embeddings
    vectors = model.encode(texts, show_progress_bar=True)
    
    collection = chroma.get_or_create_collection(collection_name)
    
    ids = [str(i) for i in range(len(chunks))]
    
    collection.add(ids=ids, embeddings=vectors.tolist(), documents=texts, metadatas=metas)
    print(f"Vectorstore created with {collection.count()} documents")

In [53]:
create_embeddings(chunks)

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1155.93it/s, Materializing param=pooler.dense.weight]                        
MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Batches: 100%|██████████| 18/18 [00:04<00:00,  4.06it/s]


Vectorstore created with 574 documents


## Visualizing the embeddings

In [55]:
chroma = PersistentClient(path=DB_NAME)
collection = chroma.get_or_create_collection(collection_name)
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['type'] for metadata in metadatas]

In [56]:
# Define colors for your note types
note_types = ['CMM', 'RNN', 'RAG']
color_map = {
    'CMM': 'blue',
    'RNN': 'green', 
    'RAG': 'red'
}

# Get colors based on doc_types
colors = [color_map.get(t, 'orange') for t in doc_types]

In [59]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

# Save to HTML file and open in browser
fig.write_html('vector_visualization.html')
# Or use this to open automatically
fig.show(renderer='browser')

Opening in existing browser session.


In [60]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=10, b=10, l=10, t=40)
)
# Save to HTML file and open in browser
fig.write_html('vector_visualization.html')
# Or use this to open automatically
fig.show(renderer='browser')

Opening in existing browser session.


## Reranking 

In [62]:
class RankOrder(BaseModel):
    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )

In [None]:
def rerank(question, chunks):
    MODEL = "ollama/llama3.2"  # or any model you have in Ollama
# Make sure Ollama is running: ollama serve
    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance, but you may be able to improve on that.
You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.
Reply only with the list of ranked chunk ids, nothing else. Include all the chunk ids you are provided with, reranked.
"""
    user_prompt = f"The user has asked the following question:\n\n{question}\n\nOrder all the chunks of text by relevance to the question, from most relevant to least relevant. Include all the chunk ids you are provided with, reranked.\n\n"
    user_prompt += "Here are the chunks:\n\n"
    
    for index, chunk in enumerate(chunks):
        # Handle both dict and Result object formats
        if isinstance(chunk, dict):
            content = chunk.get('page_content') or chunk.get('text', '')
        else:
            content = chunk.page_content
        
        user_prompt += f"# CHUNK ID: {index + 1}:\n\n{content}\n\n"
    
    user_prompt += "Reply only with the list of ranked chunk ids, nothing else."
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    
    response = completion(model=MODEL, messages=messages, response_format=RankOrder)
    reply = response.choices[0].message.content
    order = RankOrder.model_validate_json(reply).order
    print(order)
    
    return [chunks[i - 1] for i in order]

In [71]:
model = SentenceTransformer('all-mpnet-base-v2')

RETRIEVAL_K = 10

def fetch_context_unranked(question):
    # Generate embedding using local model
    query = model.encode(question).tolist()
    
    results = collection.query(query_embeddings=[query], n_results=RETRIEVAL_K)
    
    chunks = []
    for result in zip(results["documents"][0], results["metadatas"][0]):
        chunks.append(Result(page_content=result[0], metadata=result[1]))
    
    return chunks

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1194.69it/s, Materializing param=pooler.dense.weight]                        
MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [72]:
question = "What is Sequence Model"
chunks = fetch_context_unranked(question)

In [73]:
for chunk in chunks:
    print(chunk.page_content[:15]+"...")

---
tags:
  - u...
 $T_y$: output ...
---
tags:
  - m...
---
tags:
  - m...
N
* Vanilla RNN...
---
created: 20...
---
tags:
  - e...
---
tags:
  - m...
---
tags:
  - m...
; if that seque...


In [74]:
reranked = rerank(question, chunks)




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False

[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



BadRequestError: litellm.BadRequestError: LLM Provider NOT provided. Pass in the LLM provider you are trying to call. You passed model=gpt-oss:20b
 Pass model as E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/starcoder',..)` Learn more: https://docs.litellm.ai/docs/providers