# Create embeddings for a set of markdown documents given the containing folder

In [1]:
import os
import openai
import pandas as pd
import tiktoken
import glob

In [2]:
%load_ext dotenv
%dotenv

In [3]:
# Read the OpenAI API key from the process environment.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fail fast with a readable message. A truthiness check (rather than
# `is not None`) also rejects an empty value such as `OPENAI_API_KEY=`.
assert openai.api_key, (
    "OPENAI_API_KEY is not set (or is empty); "
    "define it in the environment or in a .env file before running this notebook"
)

Define a function to read all the markdown files in a folder and return their contents:

In [4]:
def read_markdown_files(folder_path):
    """Read every markdown file under a folder into a dataframe.

    The folder is searched recursively for ``*.md`` files (the folder itself
    and all of its sub-folders).

    Args:
        folder_path: Path of the folder to search.

    Returns:
        A ``pd.DataFrame`` with one row per file and two columns:
        ``file_name`` (base name only, without the directory part) and
        ``content`` (the full text of the file). Rows are ordered by file
        path. The frame is empty when no markdown file is found.
    """
    matches = glob.glob(f"{folder_path}/**/*.md", recursive=True)
    # glob order depends on the filesystem; sort so re-runs give the same rows.
    # Skip directories whose name happens to end in ".md" -- open() would fail.
    paths = sorted(path for path in matches if os.path.isfile(path))

    file_names = []
    file_contents = []
    for path in paths:
        # The context manager closes each handle promptly, and the explicit
        # encoding keeps reads independent of the platform's default locale.
        with open(path, "r", encoding="utf-8") as handle:
            file_contents.append(handle.read())
        file_names.append(os.path.basename(path))

    return pd.DataFrame({"file_name": file_names, "content": file_contents})


Prepare the data for the embedding model:

In [5]:
from openai.embeddings_utils import get_embedding

# Name of the embedding model; passed as `engine` to get_embedding.
EMBEDDINGS_MODEL = "text-embedding-ada-002"
EMBEDDINGS_ENCODING = "cl100k_base"  # tiktoken encoding name used for ada-002
# Token budget per document; longer documents are filtered out, not truncated.
# NOTE(review): presumably rounded down from the model's real input limit -- confirm.
EMBEDDINGS_MAX_TOKENS = 8000  # max tokens for ada-002

# Tokenizer instance shared by count_tokens.
encoding = tiktoken.get_encoding(EMBEDDINGS_ENCODING)

def count_tokens(df, encoder=None) -> pd.DataFrame:
    """Return a copy of ``df`` with an ``n_tokens`` column added.

    ``n_tokens`` holds, for each row, the number of tokens the encoder
    produces for that row's ``content`` string.

    Args:
        df: Dataframe with a ``content`` column of strings.
        encoder: Optional object exposing ``encode(text)`` that returns a
            sized sequence of tokens. Defaults to the module-level
            ``encoding``.

    Returns:
        A new ``pd.DataFrame``; the input frame is left untouched.
    """
    if encoder is None:
        encoder = encoding
    token_counts = df["content"].apply(lambda text: len(encoder.encode(text)))
    # assign() builds a new frame instead of writing a column into `df`, which
    # may be a filtered slice (writing there raises SettingWithCopyWarning).
    return df.assign(n_tokens=token_counts)

def filter_by_max_tokens(df, max_tokens) -> pd.DataFrame:
    """Keep only the rows whose ``n_tokens`` value is at most ``max_tokens``.

    Args:
        df: Dataframe with an ``n_tokens`` column.
        max_tokens: Inclusive upper bound on ``n_tokens``.

    Returns:
        The matching rows of ``df``, with their original index labels.
    """
    within_limit = df["n_tokens"] <= max_tokens
    return df[within_limit]

def filter_empty_content(df) -> pd.DataFrame:
    """Drop the rows whose ``content`` is exactly the empty string.

    Args:
        df: Dataframe with a ``content`` column.

    Returns:
        The rows of ``df`` with non-empty ``content``, keeping their
        original index labels.
    """
    has_content = df["content"] != ""
    return df[has_content]

def get_embeddings(df) -> pd.DataFrame:
    """Return a copy of ``df`` with an ``embedding`` column added.

    ``get_embedding`` is called once per row on that row's ``content``,
    using ``EMBEDDINGS_MODEL`` as the engine.

    Args:
        df: Dataframe with a ``content`` column of strings.

    Returns:
        A new ``pd.DataFrame``; the input frame is left untouched.
    """
    vectors = df["content"].apply(
        lambda text: get_embedding(text=text, engine=EMBEDDINGS_MODEL)
    )
    # assign() builds a new frame instead of writing a column into `df`, which
    # may be a filtered slice (writing there raises SettingWithCopyWarning).
    return df.assign(embedding=vectors)


Use the functions defined above to create the embeddings:

In [6]:
# Folder that holds the markdown notes to embed.
FOLDER = "tmp/dnd-notes-main"
# Load every markdown file under FOLDER, drop the empty ones, count the tokens
# of each document, and keep only documents within EMBEDDINGS_MAX_TOKENS.
df = (
    read_markdown_files(FOLDER)
    .pipe(filter_empty_content)
    .pipe(count_tokens)
    .pipe(filter_by_max_tokens, max_tokens=EMBEDDINGS_MAX_TOKENS)
)
df

Unnamed: 0,file_name,content,n_tokens
0,A letter between an unnamed council member and...,"Dear [[Zarina Shadowblade|Ms. Shadowblade]],\n...",233
2,Inventor McGee.md,---\ntype:\naliases: \ntags: \ntitle: Inventor...,107
3,The Heart of the Sea.md,---\ntype:\naliases: \ntags: \ntitle: The Hear...,230
4,The Pirate Queen.md,# The Pirate Queen\n\nThe Pirate Queen is a go...,1065
5,The Battle of the Bands.md,---\ntype: \naliases: \ntags: \ntitle: The Bat...,175
...,...,...,...
92,Yevelda.md,**Yevelda.** Half-orc partner of Dockmaster Kr...,26
93,Naval Combat.md,"---\ntype: notes\naliases: \ntags: dnd, combat...",839
94,Seafaring adventure ideas.md,"---\ntype: \naliases: \ntags: dnd, ideas\ntitl...",246
95,D&D Campaign Creation - DM's Lair.md,"---\ntype: video-notes\naliases: \ntags: dnd, ...",913


In [None]:
# Compute an embedding for every remaining document; get_embeddings calls
# get_embedding once per row and stores the result in an `embedding` column.
df = get_embeddings(df)
# Save the dataframe (including the embeddings) to a CSV file inside FOLDER
# for later use.
save_path = os.path.join(FOLDER, "embeddings.csv")
df.to_csv(save_path)