# Embeddings by document section

Rather than using a single embedding for each *document*, we first split the document into sections (as illustrated in `split_markdown.ipynb`) and then find embeddings for each section. This should allow more relevant information to be found based on a query, as the embeddings are more specific to the section.

## Define functions to split a document into sections

These functions are based on the one defined in `split_markdown.ipynb`.

In [1]:
import re

def parse_markdown(markdown_string):
    """Splits a markdown string into a dictionary of headings and content."""
    headings_regex = r'(?P<level>#+)\s*(?P<heading>.+)'
    headings = {}
    current_heading = ''
    current_content = []
    for line in markdown_string.splitlines():
        if heading_match := re.match(headings_regex, line):
            if current_heading:
                headings[current_heading] = current_content
                current_content = []
            current_heading = heading_match['heading']
        elif line.strip():
            current_content.append(line.strip())
    if current_heading:
        headings[current_heading] = current_content
    return headings

def parse_markdown_files(paths: list[str]) -> dict[str, dict[str, list[str]]]:
    """Parses a list of markdown files into a dictionary of headings and content."""
    markdown = {}
    for path in paths:
        with open(path) as f:
            markdown[path] = parse_markdown(f.read())
    return markdown

Define utility function to return all markdown files in a directory

In [2]:
import glob

def get_markdown_files(dir, exclude_dirs_patterns: list = None):
    """Recursively get the paths to all markdown files in a directory, excluding any directories in exclude_dirs.
    Directories in exclude_dirs are matched using glob syntax and may appear anywhere in the path."""
    if exclude_dirs_patterns is None:
        exclude_dirs_patterns = []
    paths = []
    for path in glob.glob(f"{dir}/**", recursive=True):
        if not path.endswith(".md"):
            continue
        # Skip any paths that match any of the exclude_dirs patterns anywhere in the path
        if any(re.search(pattern, path) for pattern in exclude_dirs_patterns):
            continue
        paths.append(path)
    return paths
   

## Define functions to prepare the data for embedding

We'll be using a pandas dataframe to store and manipulate the dataset. Here I define a few utility functions to be used in the data pipeline, namely `count_tokens`, `filter_by_num_tokens`, and `filter_empty_content`.

In [3]:
import tiktoken
import pandas as pd

EMBEDDINGS_MODEL = "text-embedding-ada-002"
EMBEDDINGS_ENCODING = "cl100k_base"  # encoding for ada-002
EMBEDDINGS_MAX_TOKENS = 8000  # max tokens for ada-002

encoding = tiktoken.get_encoding(EMBEDDINGS_ENCODING)

def count_tokens(df) -> pd.DataFrame:
    """Count the number of tokens in a dataframe column."""
    df['n_tokens'] = df.combined.apply(lambda x: len(encoding.encode(x)))
    return df

def filter_by_num_tokens(df, n_tokens) -> pd.DataFrame:
    """Filter a dataframe by the number of tokens in a column."""
    return df[df.n_tokens <= n_tokens]

def filter_empty_content(df) -> pd.DataFrame:
    """Remove rows with empty content."""
    return df[df.content != '']

def combined(df: pd.DataFrame) -> pd.DataFrame:
    """Combine the filename, heading, and content into a single column, avoiding the SettingWithCopyWarning."""
    df.loc[:, 'combined'] = df.filename + ":" + df.heading + "\n" + df.content
    return df
    

Define a function to load the data into a pandas dataframe:

In [4]:
import os

def load_documents_as_df(paths: list) -> pd.DataFrame:
    """Load a list of markdown files into a dataframe of sections."""
    parsed = parse_markdown_files(paths)
    # create a dataframe: filename, heading, content
    docs = pd.DataFrame(
        [
            (os.path.basename(path), heading, "\n".join(content))
            for path, headings in parsed.items()
            for heading, content in headings.items()
        ],
        columns=["filename", "heading", "content"],
    )
    docs = (
        docs.pipe(filter_empty_content)
        .pipe(combined)
        .pipe(count_tokens)
        .pipe(filter_by_num_tokens, EMBEDDINGS_MAX_TOKENS)
    )
    return docs

def load_documents_as_df_from_dir(dir: str, **kwargs) -> pd.DataFrame:
    """Load a directory of markdown files into a dataframe of sections."""
    paths = get_markdown_files(dir, **kwargs)
    return load_documents_as_df(paths)


## Apply the functions to the data:

In [21]:
# FOLDER = "tmp/dnd-notes-main"
FOLDER = "/home/tyto/Documents/dnd/DND.SRD.Wiki-0.5.1/"
EXCLUDE_DIRS = ["Templates", "Alt", "Changelog.md", "README.md", "Legal.md"]
print(f"Found {len(get_markdown_files(FOLDER, exclude_dirs_patterns=[]))} markdown files in {FOLDER}.")
print(f"Found {len(get_markdown_files(FOLDER, exclude_dirs_patterns=EXCLUDE_DIRS))} markdown files in {FOLDER} (excluding {EXCLUDE_DIRS}).")

Found 1028 markdown files in /home/tyto/Documents/dnd/DND.SRD.Wiki-0.5.1/.
Found 931 markdown files in /home/tyto/Documents/dnd/DND.SRD.Wiki-0.5.1/ (excluding ['Templates', 'Alt', 'Changelog.md', 'README.md', 'Legal.md']).


In [22]:
df = load_documents_as_df_from_dir(FOLDER, exclude_dirs_patterns=EXCLUDE_DIRS)
print(f"Loaded {len(df)} sections from {FOLDER} across {len(df.filename.unique())} files.")
df.head()

Loaded 1954 sections from /home/tyto/Documents/dnd/DND.SRD.Wiki-0.5.1/ across 931 files.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'combined'] = df.filename + ":" + df.heading + "\n" + df.content
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['n_tokens'] = df.combined.apply(lambda x: len(encoding.encode(x)))


Unnamed: 0,filename,heading,content,combined,n_tokens
1,Paladin.md,Class Features,"As a paladin, you gain the following class fea...","Paladin.md:Class Features\nAs a paladin, you g...",19
2,Paladin.md,Hit Points,**Hit Dice:** 1d10 per paladin level\n**Hit Po...,Paladin.md:Hit Points\n**Hit Dice:** 1d10 per ...,64
3,Paladin.md,Proficiencies,"**Armor:** All armor, shields\n**Weapons:** Si...",Paladin.md:Proficiencies\n**Armor:** All armor...,60
4,Paladin.md,Equipment,"You start with the following equipment, in add...",Paladin.md:Equipment\nYou start with the follo...,821
5,Paladin.md,Divine Sense,The presence of strong evil registers on your ...,Paladin.md:Divine Sense\nThe presence of stron...,188


In [23]:
# Get the total number of tokens in the dataset
df.n_tokens.sum()

# Get the number of documents in the dataset
df.filename.nunique()

# Get the number of sections in the dataset
df.shape[0]

# Get the average number of tokens in each document
df.groupby("filename").n_tokens.sum().mean()

# the cost for embeddings is $0.0004 / 1K tokens, so we can calculate the cost of the dataset
print(f"Cost of embeddings for this dataset: ${df.n_tokens.sum() * 0.0004 / 1000:.2f}")
print(f"Dataset has {df.shape[0]} sections, {df.filename.nunique()} documents, and {df.n_tokens.sum()} tokens.")

Cost of embeddings for this dataset: $0.15
Dataset has 1954 sections, 931 documents, and 384297 tokens.


## Create and save embeddings for the content in each section

In [24]:
from pandarallel import pandarallel
from openai.embeddings_utils import get_embedding

pandarallel.initialize(progress_bar=True)

df['embedding'] = df.parallel_apply(lambda x: get_embedding(text=x.content, engine=EMBEDDINGS_MODEL), axis=1)  # takes roughly 1.5 minutes on dataset with 400k tokens
df.to_csv(f"{FOLDER}/embeddings-sections.csv", index=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=245), Label(value='0 / 245'))), HB…