# Embeddings by document section

Rather than using a single embedding for each *document*, we first split the document into sections (as illustrated in `split_markdown.ipynb`) and then find embeddings for each section. This should allow more relevant information to be found based on a query, as the embeddings are more specific to the section.

## Define functions to split a document into sections

These functions are based on the one defined in `split_markdown.ipynb`.

In [1]:
import pandoc
from pandoc.types import Header, Para, BulletList, OrderedList

def split_doc_by_header(doc, include_orphans=True):
    """From split_markdown.ipynb, split a pandoc document into sections by header.
    
    Args:
        doc (pandoc.Document): The document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.

    Example:
        >>> doc = pandoc.read("test.md")
        >>> split_doc_by_header(doc)
            {'Header 1': ['This is a test.'], 'Header 2': ['This is another test.']}
    """
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if elt == doc:
            continue
        match elt:
            case Header(_, _, title):
                title_str = pandoc.write(title).strip()
                sections[pandoc.write(title).strip()] = []
            case Para(x):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                sections[title_str].append(pandoc.write(x, options=["--wrap=none"]).strip())
            case BulletList(_) | OrderedList(_):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                # split the list on newlines
                list_str = pandoc.write(elt, options=["--wrap=none"])
                list_items = list_str.splitlines()
                sections[title_str].extend(list_items)
    return sections

def split_markdown_by_header(md, include_orphans=True):
    r"""Parse a markdown string and split it into sections keyed by header.

    Args:
        md (str): The markdown text to split.
        include_orphans (bool, optional): Forwarded to ``split_doc_by_header``;
            whether content without a header is kept. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.

    Example:
        >>> split_markdown_by_header("# Header 1\nThis is a test.\n\n# Header 2\nThis is another test.")
            {'Header 1': ['This is a test.'], 'Header 2': ['This is another test.']}
    """
    parsed = pandoc.read(md)
    sections = split_doc_by_header(parsed, include_orphans=include_orphans)
    return sections

def split_markdown_by_header_file(path, include_orphans=True):
    """Read a markdown file and split it into sections by header.

    Args:
        path (str): The path to the markdown document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.

    Raises:
        Exception: If pandoc fails to read the file; the message includes the
            path and the original error is chained as the cause.
        OSError: If the file cannot be opened (propagated unchanged from ``open``).

    Example:
        >>> split_markdown_by_header_file("test.md")
            {'Header 1': ['This is a test.'], 'Header 2': ['This is another test.']}
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII notes are read the same way on every system.
    with open(path, "r", encoding="utf-8") as f:
        try:
            doc = pandoc.read(file=f, format="markdown")
        except Exception as e:
            raise Exception(f"Error reading {path}: {e}") from e
    return split_doc_by_header(doc, include_orphans=include_orphans)

Define a utility function to return all markdown files in a directory:

In [2]:
import glob

def get_markdown_files(dir, exclude_dirs: list = None):
    """Recursively get the paths to all markdown files in a directory, excluding any directories in exclude_dirs.

    Directories in exclude_dirs are matched using glob (fnmatch) syntax and may
    appear anywhere in the path below ``dir``, including directly under it.

    Args:
        dir (str): The root directory to search.
        exclude_dirs (list, optional): Directory-name patterns to exclude.
            Defaults to None, meaning nothing is excluded.

    Returns:
        list: Paths (as returned by ``glob.glob``) of the ``.md`` files found.
    """
    import fnmatch
    import os

    if exclude_dirs is None:
        exclude_dirs = []
    paths = []
    for path in glob.glob(f"{dir}/**", recursive=True):
        # skip non-markdown entries and directories that merely end in ".md"
        if not path.endswith(".md") or not os.path.isfile(path):
            continue
        # Match on the path relative to `dir`, with a leading "/" so that an
        # excluded directory directly under `dir` is caught as well, and so
        # that special characters in `dir` never become part of the pattern.
        rel_path = "/" + os.path.relpath(path, dir).replace(os.sep, "/")
        if any(fnmatch.fnmatch(rel_path, f"*/{exclude_dir}/*") for exclude_dir in exclude_dirs):
            continue
        paths.append(path)
    return paths
   
# Smoke test: count the markdown files under 'tmp', first with no exclusions
# and then with directories named 'Templates' excluded.
print(f"Found {len(get_markdown_files('tmp'))} markdown files in tmp directory.")
print(f"Found {len(get_markdown_files('tmp', exclude_dirs=['Templates']))} markdown files in tmp directory, excluding Templates directory.")

Found 97 markdown files in tmp directory.
Found 94 markdown files in tmp directory, excluding Templates directory.


## Define functions to prepare the data for embedding

We'll be using a pandas dataframe to store and manipulate the dataset. Here I define a few utility functions to be used in the data pipeline, namely `count_tokens`, `filter_by_num_tokens`, and `filter_empty_content`.

In [3]:
import tiktoken
import pandas as pd

# Name of the embedding model; passed as `engine` when requesting embeddings.
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Name of the tiktoken encoding used to count tokens for the model above.
EMBEDDINGS_ENCODING = "cl100k_base"  # encoding for ada-002
# Sections with more tokens than this are dropped before embedding.
# NOTE(review): presumably a safety margin below the model's real input limit -- confirm.
EMBEDDINGS_MAX_TOKENS = 8000  # max tokens for ada-002

# Tokenizer instance shared by count_tokens.
encoding = tiktoken.get_encoding(EMBEDDINGS_ENCODING)

def count_tokens(df) -> pd.DataFrame:
    """Add an ``n_tokens`` column holding the token count of each row's ``content``.

    The column is written onto ``df`` itself (in place), and the same
    dataframe is returned so the function can be used with ``DataFrame.pipe``.
    """
    def token_count(text):
        # number of tokens the module-level encoding produces for this text
        return len(encoding.encode(text))

    df["n_tokens"] = df["content"].apply(token_count)
    return df

def filter_by_num_tokens(df, n_tokens) -> pd.DataFrame:
    """Keep only the rows whose ``n_tokens`` value is at most ``n_tokens``.

    Args:
        df (pd.DataFrame): Dataframe with an ``n_tokens`` column.
        n_tokens (int): Maximum number of tokens a row may have to be kept.

    Returns:
        pd.DataFrame: A new dataframe containing the rows that pass the filter.
    """
    # Return an explicit copy: later pipeline steps add columns to the result,
    # which would otherwise trigger pandas' SettingWithCopyWarning.
    return df[df["n_tokens"] <= n_tokens].copy()

def filter_empty_content(df) -> pd.DataFrame:
    """Drop the rows whose ``content`` is the empty string.

    Args:
        df (pd.DataFrame): Dataframe with a ``content`` column.

    Returns:
        pd.DataFrame: A new dataframe containing only the rows with non-empty content.
    """
    # Return an explicit copy: later pipeline steps add columns to the result,
    # which would otherwise trigger pandas' SettingWithCopyWarning.
    return df[df["content"] != ""].copy()

Define a function to load the data into a pandas dataframe:

In [9]:
import os

def load_documents_as_df(paths: list) -> pd.DataFrame:
    """Load a list of markdown files into a dataframe of sections.

    Each file is split by header, and every section becomes one row.

    Args:
        paths (list): Paths of the markdown files to load.

    Returns:
        pd.DataFrame: One row per section with the columns ``filename`` (base
            name of the file), ``title`` (section title) and ``content`` (the
            section's strings joined with newlines). The columns are present
            even when no sections were found.
    """
    columns = ["filename", "title", "content"]
    docs = []
    for path in paths:
        filename = os.path.basename(path)
        sections = split_markdown_by_header_file(path)
        docs.extend(
            {
                "filename": filename,
                "title": title,
                "content": "\n".join(content),
            }
            for title, content in sections.items()
        )
    # Passing the columns explicitly keeps the schema stable for empty input,
    # so downstream steps that access df.content do not fail.
    return pd.DataFrame(docs, columns=columns)

def load_documents_as_df_from_dir(dir: str, exclude_dirs=("Templates",)) -> pd.DataFrame:
    """Load a directory of markdown files into a dataframe of sections.

    Args:
        dir (str): The directory to search recursively for markdown files.
        exclude_dirs (iterable of str, optional): Directory-name patterns to
            skip, forwarded to ``get_markdown_files``. Defaults to excluding
            ``Templates``; pass an empty sequence to exclude nothing.

    Returns:
        pd.DataFrame: The dataframe of sections built by ``load_documents_as_df``.
    """
    paths = get_markdown_files(dir, exclude_dirs=list(exclude_dirs))
    return load_documents_as_df(paths)

# Try it out: load every section of the markdown files under 'tmp' and
# display the first rows.
df = load_documents_as_df_from_dir("tmp")
df.head()

Unnamed: 0,filename,title,content
0,The End of Piracy.md,The End of Piracy,
1,The Astral Horde.md,[No Header],treasure in the astral sea astral sea accessed...
2,The Astral Horde.md,The Astral Horde,This is a series of adventures in which the pa...
3,The Astral Horde.md,Arc steps,1. Level Range\n - I can see this advent...
4,1 Planes Gone Wild.md,Planes Gone Wild,\[\[Pasted image 20221008163518.png\]\]


Apply the functions to the data:

In [12]:
# Prepare the sections for embedding: drop empty sections, count the tokens of
# the remaining ones, then drop sections above the token cap.
df = (
    df.pipe(filter_empty_content)
    .pipe(count_tokens)
    .pipe(filter_by_num_tokens, n_tokens=EMBEDDINGS_MAX_TOKENS)
)
df.head()

Unnamed: 0,filename,title,content,n_tokens
1,The Astral Horde.md,[No Header],treasure in the astral sea astral sea accessed...,53
2,The Astral Horde.md,The Astral Horde,This is a series of adventures in which the pa...,232
3,The Astral Horde.md,Arc steps,1. Level Range\n - I can see this advent...,151
4,1 Planes Gone Wild.md,Planes Gone Wild,\[\[Pasted image 20221008163518.png\]\],16
5,1 Planes Gone Wild.md,Background,The Planar Seal atop the Hearthstar Peaks was ...,90


## Create and save embeddings for the content in each section

In [15]:
from openai.embeddings_utils import get_embedding

def get_embeddings(df) -> pd.DataFrame:
    """Add an ``embeddings`` column with the embedding of each row's ``content``.

    One ``get_embedding`` call is made per row, using ``EMBEDDINGS_MODEL`` as
    the engine. The column is written onto ``df`` itself (in place), and the
    same dataframe is returned.
    """
    def embed(text):
        return get_embedding(text=text, engine=EMBEDDINGS_MODEL)

    df["embeddings"] = df["content"].apply(embed)
    return df

# One embedding request is made per row, so this will take a while,
# depending on the number of documents.
df = get_embeddings(df)  # this will take a while, depending on the number of documents
# Save the dataframe to a csv for later use.
# NOTE(review): the embeddings column is presumably written in its string form and
# would need to be parsed back into numbers when the CSV is reloaded -- confirm.
df.to_csv("tmp/embeddings-sections.csv", index=False)  # index=False to avoid saving the index column, which is just a row number that doesn't add any value