# Outline:

## Pre-reqs
1. FileNode model
2. Embedding model
3. LLM for featurisation

## Execution 
1. Grab all markdown files from the Obsidian vault
2. Create FileNodes based on them
3. Featurise to LLMNode
4. Collect them into a full Node
5. Place them in a small Neo4j database

## Config

In [5]:
# Gemini model id; used as the default `model_name` of the `Gemini` client below.
BASE_LLM_MODEL = "gemini-2.0-flash"
# Embedding model id; passed to `EmbeddingModel` when it is instantiated below.
EMBEDDING_MODEL = "text-embedding-004"

# Pre-reqs

### 1. Node model

In [77]:
from datetime import datetime

from pydantic import BaseModel, Field
from datetime import datetime

class FileNode(BaseModel):
    """Properties extracted directly from the file system and file content"""
    # Identifier produced by `hash_file`. The description is kept in sync with that
    # function, which hashes size + mtime + the leading/trailing bytes of the file.
    primary_id: str = Field(
        description=(
            "MD5 hash uniquely identifying the node, computed from the file size, "
            "modification time and the first and last 1024 bytes of the file "
            "(the whole file when it is 2 KB or smaller). Edits confined to the middle "
            "of a large file are only detected through the size/modification-time change."
        )
    )
    content: str = Field(description="The underlying content of the node")
    file_size: float = Field(description="Size of the node file in KB")
    # Timestamps may be supplied as POSIX floats (as the ingestion loop does with
    # `stat()` values); pydantic is relied on to coerce them to `datetime`.
    file_creation_time: datetime = Field(description="Date the node was created")
    file_modification_time: datetime = Field(description="Date the node was last modified")
    filetype: str = Field(description="File type of the content")
    location: str = Field(description="Location of the node (e.g., Notion, Local Files, OneDrive)")
    path: str = Field(description="Path to the node file")
    # Maps each chunk's text to its embedding vector, so identical chunk texts
    # within one file collapse into a single entry.
    chunks_to_embeddings: dict[str, list[float]] = Field(description="Chunks of the node content")
 
# Properties an LLM extracts from the node content. This model is handed to the
# `Gemini` client as its `response_schema`, so the field descriptions below act as
# instructions to the model and are kept verbatim.
class LLMNode(BaseModel):
    # --- classification -------------------------------------------------
    label: str = Field(
        description="The type of content contained in the node",
    )
    author: list[str] = Field(
        description="The author(s) of the content if it appears in the text",
    )
    problem_space: list[str] = Field(
        description="The problem space(s) the content is related to",
    )

    # --- argument / summary ---------------------------------------------
    research_question: str = Field(
        description="Research question addressed by the node content",
    )
    main_argument: str = Field(
        description="Main argument of the node content",
    )
    summary: str = Field(
        description="Summary of the node content",
    )
    domain: str = Field(
        description="Domain of the node content",
    )

    # --- free-form descriptors ------------------------------------------
    tags: list[str] = Field(
        description="Tags associated with the node. ",
    )
    themes: list[str] = Field(
        description="Themes related to the node's content. ",
    )
    keywords: list[str] = Field(
        description="Keywords related to the node's content. ",
    )
    quotes: list[str] = Field(
        description="Notable quotes from the node. List only a couple based on the length of the text. For example, a short text should only have 2-3 while longer papers should be upwards of 10",
    )
    content_creation_date: datetime = Field(
        description="When the node file was from if there are hints in the text, otherwise use the current date",
    )

    # --- named entities -------------------------------------------------
    entities_persons: list[str] = Field(
        description="People mentioned in the node",
    )
    entities_places: list[str] = Field(
        description="Places mentioned in the node",
    )
    entities_organizations: list[str] = Field(
        description="Organizations mentioned in the node",
    )
    entities_references: list[str] = Field(
        description="References mentioned in the node",
    )

class Node(FileNode, LLMNode):
    """Combined node inheriting all properties from FileNode and LLMNode"""

    @classmethod
    def from_components(cls, file: FileNode, llm: LLMNode) -> "Node":
        """Build a full Node out of an already-populated FileNode and LLMNode.

        Both components are dumped to plain dictionaries and merged; the merged
        mapping is then validated as a ``Node``.
        """
        # Right-hand operand wins on a key clash, i.e. LLM-derived values would
        # override file-derived ones if the two models ever shared a field name.
        merged_fields = file.model_dump() | llm.model_dump()
        return cls(**merged_fields)

### 2. Embedding model

Relevant parameters to be set:
* Embedding model specialised to distinct task_types:
    * Files will be embedded as `task_type="RETRIEVAL_DOCUMENT"`
    * And queries using `task_type="RETRIEVAL_QUERY"`
    


In [7]:
from google import genai
from google.genai.types import EmbedContentConfig

import os
import numpy as np

class EmbeddingModel:
    """Small wrapper around the Gemini ``embed_content`` endpoint.

    Queries and documents are embedded with different ``task_type`` values
    (``RETRIEVAL_QUERY`` vs ``RETRIEVAL_DOCUMENT``).

    Note: the methods return the SDK's raw ``EmbedContentResponse`` object, not a
    NumPy array; the vectors live under ``response.embeddings[i].values``.
    """

    def __init__(
        self,
        model_name: str,
        output_dimensionality: int = 768,
    ):
        """
        Args:
            model_name: Embedding model id forwarded to the API.
            output_dimensionality: Requested size of each embedding vector.

        Raises:
            KeyError: If the ``GEMINI_API_KEY`` environment variable is not set.
        """
        self.client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
        self.model_name: str = model_name
        self.output_dimensionality: int = output_dimensionality

    def _config_for(self, task_type: str) -> EmbedContentConfig:
        """Build the request config for the given embedding task type."""
        return EmbedContentConfig(
            task_type=task_type,
            output_dimensionality=self.output_dimensionality,
        )

    def _embed_call(
        self, content: str | list[str], config: EmbedContentConfig
    ) -> genai.types.EmbedContentResponse:
        """Send ``content`` to the embedding endpoint and return the raw response."""
        return self.client.models.embed_content(
            model=self.model_name,
            contents=content,
            config=config,
        )

    def embed_query(self, query: str) -> genai.types.EmbedContentResponse:
        """Embed a search query (``task_type="RETRIEVAL_QUERY"``)."""
        return self._embed_call(
            content=query,
            config=self._config_for("RETRIEVAL_QUERY"),
        )

    def embed_file(self, file: str) -> genai.types.EmbedContentResponse:
        """Embed document text (``task_type="RETRIEVAL_DOCUMENT"``)."""
        return self._embed_call(
            content=file,
            config=self._config_for("RETRIEVAL_DOCUMENT"),
        )

# Instantiate the wrapper with the configured embedding model, then run a
# one-off smoke test; as the cell's last expression, the raw response is displayed.
embedding_model = EmbeddingModel(model_name=EMBEDDING_MODEL)
embedding_model.embed_file("test")

EmbedContentResponse(embeddings=[ContentEmbedding(values=[0.023835836, 0.032532666, -0.052185923, -0.019886132, 0.026069641, -0.0094205765, 0.0040515373, 0.04901829, -0.05772261, 0.02971936, 0.0033174066, 0.007165582, 0.03826761, 0.019403733, -0.0124670975, -0.07177179, 0.040335946, 0.0284832, -0.105195366, 0.0038846806, 0.035758987, -0.032837637, 0.031979468, -0.010537732, -0.013240267, -0.01915155, -0.009758277, 0.0014636988, -3.48189e-05, -0.006086665, 0.074584596, 0.047881678, 0.023218602, -0.047333293, 0.039103195, 0.0546853, 0.00990299, 0.043736435, 0.027497908, -0.033419862, -0.064396404, 0.008361147, -0.02442376, 0.04099201, -0.05026595, -0.039420515, -0.014952215, 0.076169856, -0.04816034, 0.024997566, 0.020614417, 0.08048106, -0.055345505, 0.027728302, -0.021067938, -0.030663263, -0.019208597, -0.003718978, 0.014299155, -0.025252145, 0.0031429396, -0.0041793208, -0.0034405375, -0.017077021, 0.03370819, -0.03766928, -0.019878464, -0.03647229, -0.053720966, 0.06277102, 0.001477

### 3. LLM model with structured output

In [59]:
import os
import time
import logging
from pathlib import Path
from typing import Any, List, Union
import PIL.Image

from google import genai


# Module-level logger used by the Gemini client below; its level is set to INFO,
# so WARNING/ERROR records from the retry logic pass this logger's threshold.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class Gemini:
    """
    A client for the Gemini API that requests JSON output constrained to a
    response schema and retries failed calls.
    """
    def __init__(self, response_schema: Any, model_name: str = BASE_LLM_MODEL):
        """
        Args:
            response_schema: Schema the model's JSON answer must follow; it is
                forwarded unchanged as ``response_schema`` on every request.
            model_name: Gemini model id to call.

        Raises:
            ValueError: If the ``GEMINI_API_KEY`` environment variable is not set.
        """
        self.api_key = os.getenv("GEMINI_API_KEY")
        if not self.api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set.")
        self.client = genai.Client(api_key=self.api_key)
        self.model_name = model_name
        self.response_schema = response_schema

    def _structured_config(self) -> dict:
        """Return the generation config shared by every request of this client."""
        return {
            "response_mime_type": "application/json",
            "response_schema": self.response_schema
        }

    def _call_api_with_retry(self, contents: List[Any], config: dict, retries: int = 3, delay: int = 5):
        """
        Call ``generate_content``, retrying on any exception.

        Args:
            contents: Prompt parts (text and/or images) sent to the model.
            config: Generation config for the request.
            retries: Total number of attempts; must be at least 1.
            delay: Seconds to sleep between attempts.

        Returns:
            The raw API response of the first successful attempt.

        Raises:
            ValueError: If ``retries`` is smaller than 1 (previously this silently
                returned ``None``).
            Exception: Whatever the last attempt raised, re-raised unchanged.
        """
        if retries < 1:
            raise ValueError("retries must be at least 1")

        for attempt in range(1, retries + 1):
            try:
                return self.client.models.generate_content(
                    model=self.model_name,
                    contents=contents,
                    config=config
                )
            except Exception as e:
                # Every failure is retried, not only timeouts, so the log states
                # the actual error instead of claiming a timeout was detected.
                if attempt < retries:
                    logger.warning(
                        "API call failed (attempt %d/%d): %s. Retrying in %d seconds...",
                        attempt, retries, e, delay,
                    )
                    time.sleep(delay)
                    continue
                # The public entry points log the traceback; logging it here as
                # well would report the same failure twice.
                logger.error("API call failed after %d attempt(s): %s", retries, e)
                raise

    def prompt_with_image(self, prompt: str, image_paths: Union[Path, List[Path]]) -> Any:
        """
        Sends a prompt along with one or more images to the Gemini API and returns the parsed response.

        Images that cannot be opened are logged and skipped (best effort); the
        request is still sent with whichever images could be loaded. A single
        path may be given as ``Path`` or ``str``.
        """
        if isinstance(image_paths, (str, Path)):
            image_paths = [image_paths]

        images = []
        try:
            for path in image_paths:
                try:
                    images.append(PIL.Image.open(path))
                except Exception as e:
                    logger.error("Failed to open image %s: %s", path, e)

            contents = [prompt] + images
            response = self._call_api_with_retry(contents, self._structured_config())
            return response.parsed
        except Exception as e:
            logger.exception("Error in prompt_with_image: %s", e)
            raise
        finally:
            # Release the file handles held by the opened images once the
            # (synchronous) request has finished or failed.
            for image in images:
                image.close()

    def query(self, prompt: str) -> Any:
        """
        Sends a text prompt to the Gemini API and returns the parsed response.
        """
        try:
            response = self._call_api_with_retry([prompt], self._structured_config())
            return response.parsed
        except Exception as e:
            logger.exception("Error in query: %s", e)
            raise

# Execution

### 1. Grabbing all files from Obsidian-vault

In [19]:
from pathlib import Path
# Absolute location of the Obsidian vault (``~`` expanded, symlinks resolved).
path_to_obsidian = Path("~/Google Drive/My Drive/Obsidian").expanduser().resolve()

# Tally every markdown note below the vault, descending into sub-folders.
counter = sum(1 for _ in path_to_obsidian.rglob("*.md"))
print(f"Number of notes in Obsidian-vault: {counter}")

Number of notes in Obsidian-vault: 1309


### 2. Placing them inside FileNode model

In [21]:
# hash function
import hashlib

HASH_BASE_SIZE = 1024


def hash_file(file_path: Path) -> str:
    """
    Return an MD5 hex digest built from the file's size, modification time and
    a sample of its bytes.

    Files of at most ``2 * HASH_BASE_SIZE`` bytes are sampled in full; larger
    files contribute only their first and last ``HASH_BASE_SIZE`` bytes. The
    sample alone cannot see an edit in the middle of a large file, which is why
    size and modification time are part of the digest input.
    """
    stat_result = os.stat(file_path)
    size_in_bytes = stat_result.st_size
    modified_at = stat_result.st_mtime

    with open(file_path, 'rb') as handle:
        if size_in_bytes > 2 * HASH_BASE_SIZE:
            # Large file: head plus tail only.
            head = handle.read(HASH_BASE_SIZE)
            handle.seek(-HASH_BASE_SIZE, os.SEEK_END)
            sampled = head + handle.read()
        else:
            # Small file: everything.
            sampled = handle.read()

    # NOTE: interpolating bytes into an f-string embeds their repr (b'...');
    # this is kept as-is so previously computed ids stay valid.
    fingerprint = f"{size_in_bytes}_{modified_at}_{sampled}"

    return hashlib.md5(fingerprint.encode()).hexdigest()


Reference for SemanticChunking (insanely interesting!):
[Link](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb)

In [31]:
# chunker
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Mirror the Gemini key under GOOGLE_API_KEY, the variable name the LangChain
# Google integration is expected to read its credentials from.
os.environ["GOOGLE_API_KEY"] = os.environ['GEMINI_API_KEY']

# Derive the LangChain model id from the notebook-wide config so the chunker and
# `EmbeddingModel` cannot drift apart (currently "models/text-embedding-004").
_LANGCHAIN_EMBEDDING_MODEL = f"models/{EMBEDDING_MODEL}"

# Two embedders over the same model, specialised by task type: one for document
# text, one for search queries.
google_embeddings_retrieval_documents = GoogleGenerativeAIEmbeddings(
    model=_LANGCHAIN_EMBEDDING_MODEL,
    task_type="RETRIEVAL_DOCUMENT"
)
google_embeddings_retrieval_query = GoogleGenerativeAIEmbeddings(
    model=_LANGCHAIN_EMBEDDING_MODEL,
    task_type="RETRIEVAL_QUERY"
)
# Usage: vector = embeddings.embed_query("hello, world!")

# Semantic chunker driven by the document embedder; split points are chosen with
# the "percentile" breakpoint threshold.
text_splitter = SemanticChunker(
    embeddings=google_embeddings_retrieval_documents,
    breakpoint_threshold_type="percentile",
)
# Usage: docs = text_splitter.create_documents(["Hello, world!"])

In [57]:
# Create a dictionary to store file nodes and their corresponding chunks
from tqdm import tqdm

file_nodes = []

# Walk the vault once and keep the listing: tqdm needs the total up front, and
# re-running rglob just to count would traverse the whole tree a second time.
markdown_paths = list(path_to_obsidian.rglob("*.md"))
total_count = len(markdown_paths)

for note_path in tqdm(markdown_paths, total=total_count):
    # Explicit encoding so the result does not depend on the platform default.
    content = note_path.read_text(encoding="utf-8")

    # Split the note semantically, then embed all chunks in one batched call
    # instead of one request per chunk.
    chunks = text_splitter.create_documents([content])
    chunk_texts = [chunk.page_content for chunk in chunks]
    chunk_vectors = (
        google_embeddings_retrieval_documents.embed_documents(chunk_texts)
        if chunk_texts
        else []
    )
    # Keyed by chunk text, so identical chunks within a note share one entry.
    chunks_to_embeddings = dict(zip(chunk_texts, chunk_vectors))

    # One stat() call serves all metadata fields.
    file_stat = note_path.stat()
    # st_ctime is the metadata-change time on Unix-like systems; prefer the real
    # creation time where the platform exposes it.
    created_at = getattr(file_stat, "st_birthtime", file_stat.st_ctime)

    file_nodes.append(
        FileNode(
            primary_id=hash_file(note_path),
            content=content,
            file_size=file_stat.st_size / 1024,  # FileNode.file_size is documented in KB
            file_creation_time=created_at,
            file_modification_time=file_stat.st_mtime,
            filetype=note_path.suffix,
            location="Obsidian",
            path=note_path.as_posix(),
            chunks_to_embeddings=chunks_to_embeddings
        )
    )
    # What's probably pretty important when we create the nodes
    # is that we'll have Files/Documents as one label,
    # then we'll have chunks with a `chunk_id`
    # tying back to a `parent_id` and a `next_chunk_id` for the next chunk.

    # NOTE: deliberate early exit while prototyping - only the first note is processed.
    break

# Sample node used by the following cells (raises IndexError if the vault has no notes).
test = file_nodes[0]

  0%|          | 0/1309 [00:00<?, ?it/s]


In [71]:
from prompts import PROMPT

# Gemini client whose answers are constrained to the LLMNode schema.
llm_featuriser = Gemini(response_schema=LLMNode)

# Instruction prompt, a blank line, then the raw text of the sample note.
prompt_with_content = "\n\n".join((PROMPT, test.content))

# Featurise the sample note; the parsed response is kept for node assembly.
llm_test = llm_featuriser.query(prompt_with_content)

In [78]:
# Merge the file-derived and LLM-derived properties of the sample note into a
# single Node; the bare name on the last line makes the notebook display it.
full_node = Node.from_components(
    file=test, llm=llm_test
)
full_node

Node(label='To-do list', author=[], problem_space=['Project management', 'Personal task management', 'Research planning'], research_question='Not applicable', main_argument='A list of tasks to be completed across various areas, including academic work, personal matters, and professional responsibilities.', summary='This document is a to-do list containing a variety of tasks related to academic projects, job applications, personal tasks, and work-related assignments. The tasks range from finishing assignments and finding a new place to live to exploring data analysis techniques and following up on work leads.', domain='Project Management', tags=['task-management', 'personal-projects', 'academic-tasks', 'work-tasks'], themes=['task management', 'research planning', 'personal goals', 'academic workload', 'professional development'], keywords=['blog post', 'midterm', 'new place', 'embedding space', 'private mirror', 'PSet', 'CSE', 'writing application', 'datasets', 'Orbis', 'ICIJ leaks', '