In [1]:
import os
import re
import chromadb
import openai
import hashlib
import json
import time
from dotenv import load_dotenv
from openai import OpenAI
from chromadb.utils import embedding_functions

In [2]:
load_dotenv(override=True)

True

## Setup vector database

In [3]:
client = chromadb.Client()
# For persistence to disk
client = chromadb.PersistentClient(path="./chroma_db")

In [4]:
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.getenv("OPENAI_API_KEY"),
                model_name="text-embedding-3-small")

In [5]:
openai_ef('tere')

[array([ 0.01191489,  0.00940918,  0.00219249, ...,  0.02120901,
         0.02228288, -0.04032141], shape=(1536,), dtype=float32)]

In [6]:
# Create a collection with the custom embedding function
# collection = client.create_collection(
#     name="obsidian_notes",
#     embedding_function=openai_ef
# )

In [7]:
#load if this has been setup
collection = client.get_collection(
    name="obsidian_notes",
    embedding_function=openai_ef  # Your embedding function
)

## Upload documents

In [8]:
def get_file_info(file_path):
    """Get file metadata needed for change detection"""
    stats = os.stat(file_path)
    return {
        "modified_time": stats.st_mtime,
        "size": stats.st_size,
        "hash": calculate_file_hash(file_path)
    }

def calculate_file_hash(file_path):
    """Calculate MD5 hash of file contents"""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def save_file_metadata(file_path):
    """Save metadata for a single file"""
    file_info = get_file_info(file_path)
    
    # Create a unique filename for the metadata
    metadata_filename = hashlib.md5(file_path.encode()).hexdigest() + ".json"
    metadata_path = os.path.join(METADATA_DIR, metadata_filename)
    
    # Save the metadata
    with open(metadata_path, "w") as f:
        json.dump({
            "file_path": file_path,
            "info": file_info,
            "last_processed": time.time()
        }, f)

In [9]:
def extract_tags_from_note(note_content):
    """Extract tags from an Obsidian note (inline hashtags)"""    
    # This regex finds hashtags but ignores URLs and code blocks
    inline_tags = re.findall(r'(?<!`|\w)#([a-zA-Z0-9_/-]+)', note_content)

    return list(inline_tags)

In [10]:
def process_single_note(file_path, collection):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    if len(content)>5:
        # Generate a unique ID based on file path
        note_id = hashlib.md5(file_path.encode()).hexdigest()

        # Extract title from filename
        title = os.path.splitext(os.path.basename(file_path))[0]
        folder= file_path.replace('\\', '/').split('/')[1]
        tags = extract_tags_from_note(content)

        # Add to collection
        collection.add(
            ids=[note_id],
            documents=[content],
            metadatas=[{"title": title,"folder":folder, "path": file_path,
                        "last_updated": time.time(),
                        "tags": ",".join(tags)}]
        )

        # Save individual file metadata
        save_file_metadata(file_path)
    else:
        print(f"file {file_path} has length of smaller than 5, skipping")

In [11]:
# Directory to store metadata about processed files
METADATA_DIR = "./note_metadata"
os.makedirs(METADATA_DIR, exist_ok=True)

def process_notes_initially(vault_path, collection):
    # Track all processed files
    processed_files = {}
    
    for root, _, files in os.walk(vault_path):
        for i, file in enumerate(files):
            if i%10==0:
                print(f"working on file {i}")
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                
                try:
                    # Process and add the note
                    process_single_note(file_path, collection)
                    
                    # Record this file as processed
                    file_info = get_file_info(file_path)
                    processed_files[file_path] = file_info
                    
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
    
    # Save metadata about all processed files
    with open(os.path.join(METADATA_DIR, "processed_files.json"), "w") as f:
        json.dump(processed_files, f)

In [None]:
process_notes_initially('Obsidian Vault/', collection)

working on file 0
working on file 10
working on file 20
working on file 30
working on file 40
working on file 0
working on file 0
working on file 0
working on file 0
working on file 10
working on file 20
working on file 0
working on file 0
working on file 0
working on file 0
working on file 0
working on file 10
working on file 20
working on file 0
working on file 0
working on file 10
working on file 0
working on file 0
working on file 0
working on file 10
working on file 20
working on file 0
working on file 0
working on file 0
working on file 0
working on file 10
working on file 20
working on file 0
working on file 0
working on file 0
working on file 0
working on file 10
working on file 20
working on file 0
working on file 0


### detect changes to change only this part in db

In [None]:
def update_note_in_collection(file_path, collection):
    """Update an existing note in the collection"""
    # Generate the consistent ID for this file
    note_id = hashlib.md5(file_path.encode()).hexdigest()
    
    # Read the updated content
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Extract title from filename
    title = os.path.splitext(os.path.basename(file_path))[0]
    
    # Update the note in collection
    # First, check if it exists
    results = collection.get(ids=[note_id])
    
    if len(results['ids']) > 0:
        # Update existing entry
        collection.update(
            ids=[note_id],
            documents=[content],
            metadatas=[{"title": title, "path": file_path, "last_updated": time.time()}]
        )
    else:
        # Add as new if not found (shouldn't happen normally)
        collection.add(
            ids=[note_id],
            documents=[content],
            metadatas=[{"title": title, "path": file_path, "last_updated": time.time()}]
        )
    
    # Update file metadata
    save_file_metadata(file_path)


def remove_note_from_collection(file_path, collection):
    """Remove a note from the collection"""
    # Generate the consistent ID for this file
    note_id = hashlib.md5(file_path.encode()).hexdigest()
    
    # Remove from collection
    collection.delete(ids=[note_id])
    
    # Remove metadata file
    metadata_filename = hashlib.md5(file_path.encode()).hexdigest() + ".json"
    metadata_path = os.path.join(metadata_dir, metadata_filename)
    
    if os.path.exists(metadata_path):
        os.remove(metadata_path)

In [None]:
def update_changed_notes(vault_path, collection):
    # Load previously processed files
    try:
        with open(os.path.join(metadata_dir, "processed_files.json"), "r") as f:
            processed_files = json.load(f)
    except FileNotFoundError:
        processed_files = {}
    
    # Track current files
    current_files = set()
    
    # Check all files in the vault
    for root, _, files in os.walk(vault_path):
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                current_files.add(file_path)
                
                # Get current file info
                current_info = get_file_info(file_path)
                
                if file_path in processed_files:
                    # File exists in our records, check if modified
                    old_info = processed_files[file_path]
                    
                    if (current_info["modified_time"] != old_info["modified_time"] or
                        current_info["size"] != old_info["size"] or
                        current_info["hash"] != old_info["hash"]):
                        
                        print(f"File changed: {file_path}")
                        
                        # Update in collection
                        update_note_in_collection(file_path, collection)
                        
                        # Update metadata
                        processed_files[file_path] = current_info
                else:
                    # New file
                    print(f"New file: {file_path}")
                    process_single_note(file_path, collection)
                    processed_files[file_path] = current_info
    
    # Check for deleted files
    deleted_files = set(processed_files.keys()) - current_files
    for file_path in deleted_files:
        print(f"File deleted: {file_path}")
        remove_note_from_collection(file_path, collection)
        del processed_files[file_path]
    
    # Save updated metadata
    with open(os.path.join(metadata_dir, "processed_files.json"), "w") as f:
        json.dump(processed_files, f)