# Use embeddings to search the dataset with a query string

In [1]:
import ast
import os

import openai
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

In [2]:
%load_ext dotenv
%dotenv

In [3]:
# Read the OpenAI API key from the environment; the %dotenv cell above is
# expected to have loaded it from a local .env file.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fail right here with an actionable message. An empty value (e.g. a bare
# "OPENAI_API_KEY=" line) is rejected as well, since it would otherwise only
# surface later as an authentication error on the first API call.
assert openai.api_key, (
    "OPENAI_API_KEY is not set or is empty; define it in the environment "
    "or in a .env file before running this notebook"
)

Load the dataset and the embeddings:

In [4]:
import numpy as np

def load_df(folder_path, file_name) -> pd.DataFrame:
    """Load the dataset from a CSV file into a pandas DataFrame with correct types.

    The first CSV column is used as the index, ``n_tokens`` is cast to ``int``
    and every ``embedding`` cell (a list literal stored as text) is parsed
    into a NumPy array.

    Expected columns:
    - index: int
    - file_name: str
    - content: str
    - n_tokens: int
    - embedding: np.ndarray

    Args:
        folder_path: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``folder_path``.

    Returns:
        The loaded DataFrame with ``n_tokens`` and ``embedding`` converted.

    Raises:
        ValueError, SyntaxError: If an ``embedding`` cell is not a plain
            Python literal.
    """
    df = pd.read_csv(os.path.join(folder_path, file_name), index_col=0)
    df["n_tokens"] = df["n_tokens"].astype(int)
    # SECURITY: the cells are parsed with ast.literal_eval instead of eval.
    # eval would execute any expression found in the CSV, so a tampered file
    # could run arbitrary code; literal_eval only accepts literals such as a
    # list of floats and raises on anything else.
    df["embedding"] = df["embedding"].apply(
        lambda text: np.array(ast.literal_eval(text))
    )
    return df

In [5]:
# Folder containing the notes export and its pre-computed embeddings file.
FOLDER = "tmp/dnd-notes-main"
df = load_df(folder_path=FOLDER, file_name="embeddings.csv")
# Show the first rows as a quick check that the file was read.
df.head()

Unnamed: 0,file_name,content,n_tokens,embedding
0,A letter between an unnamed council member and...,"Dear [[Zarina Shadowblade|Ms. Shadowblade]],\n...",233,"[0.005814367905259132, -0.0338066890835762, 0...."
2,Inventor McGee.md,---\ntype:\naliases: \ntags: \ntitle: Inventor...,107,"[-0.018809376284480095, -0.007474252488464117,..."
3,The Heart of the Sea.md,---\ntype:\naliases: \ntags: \ntitle: The Hear...,230,"[-0.006191847380250692, -0.014760499820113182,..."
4,The Pirate Queen.md,# The Pirate Queen\n\nThe Pirate Queen is a go...,1065,"[-0.008679009974002838, -0.027751564979553223,..."
5,The Battle of the Bands.md,---\ntype: \naliases: \ntags: \ntitle: The Bat...,175,"[0.000854369776789099, -0.027519790455698967, ..."


In [6]:
EMBEDDINGS_MODEL = "text-embedding-ada-002"

def search_df(df, query, filter=None, top_n=10):
    """Rank the rows of ``df`` by embedding similarity to ``query``.

    Args:
        df: DataFrame with a ``file_name`` column and an ``embedding`` column
            holding one vector per row. It is not modified.
        query: Text to search for; it is embedded with ``EMBEDDINGS_MODEL``.
        filter: Optional predicate called with each ``file_name`` and
            returning a bool; only rows for which it returns True are ranked.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with an added ``similarity`` column, sorted from most
        to least similar and truncated to ``top_n`` rows.
    """
    query_embedding = get_embedding(query, engine=EMBEDDINGS_MODEL)
    candidates = df
    if filter is not None:
        # Narrow the rows first so similarity is only computed for matches.
        candidates = candidates[candidates["file_name"].apply(filter)]
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # and no column is set on a boolean-indexed slice (which is what made
    # pandas emit SettingWithCopyWarning whenever a filter was supplied).
    ranked = candidates.assign(
        similarity=candidates["embedding"].apply(
            lambda vector: cosine_similarity(query_embedding, vector)
        )
    )
    return ranked.sort_values("similarity", ascending=False).head(top_n)

In [8]:
# Example query: show the ten notes most similar to "Pirate Queen".
search_df(df, query="Pirate Queen", top_n=10)

Unnamed: 0,file_name,content,n_tokens,embedding,similarity
4,The Pirate Queen.md,# The Pirate Queen\n\nThe Pirate Queen is a go...,1065,"[-0.008679009974002838, -0.027751564979553223,...",0.878185
63,Temple of the Pirate Queen.md,---\ntype:\naliases: \ntags: \ntitle: Temple o...,425,"[0.005318153649568558, -0.015594431199133396, ...",0.863997
3,The Heart of the Sea.md,---\ntype:\naliases: \ntags: \ntitle: The Hear...,230,"[-0.006191847380250692, -0.014760499820113182,...",0.830742
30,Misty discovers a ship.md,---\naliases: \ncampaign: Pirates\nfc-calendar...,149,"[-0.01921083591878414, -0.020662818104028702, ...",0.81769
35,Misty dreams of a burning feywild.md,---\nfc-calendar: PirateHomeGame\nfc-date: 108...,158,"[-0.013825426809489727, -0.024526702240109444,...",0.816829
26,Misty meets Krav in Port Damali.md,---\naliases: \ncampaign: Pirates\ntags: timel...,154,"[-0.015323172323405743, -0.03628437966108322, ...",0.814339
32,The Lizard King is defeated.md,---\nfc-calendar: PirateHomeGame\nfc-date: 108...,143,"[-0.00805725622922182, -0.00715883681550622, -...",0.813848
60,The Revelry.md,---\ntype:\naliases: \ntags: \ntitle: The Reve...,2138,"[0.01417054608464241, -0.021273348480463028, 0...",0.811905
9,The Island with No Name.md,---\ntype: \naliases: \ntags: \ntitle: Adventu...,783,"[0.008201858028769493, -0.024791821837425232, ...",0.810379
8,The crew of The Labyrinth.md,---\ntype:\naliases: \ntags: \ntitle: The crew...,466,"[0.01359730027616024, -0.02842007949948311, -0...",0.809802


In [11]:
# Example query with a filter: skip every note whose file name contains "PHG".
search_df(
    df,
    "Factions",
    filter=lambda name: "PHG" not in name,
    top_n=10,
)

Unnamed: 0,file_name,content,n_tokens,embedding,similarity
62,Factions.md,- The Clovis Concord: The Clovis Concord is a ...,475,"[0.020600635558366776, -0.01712861843407154, 0...",0.794975
58,The Gilded Consortium.md,## Key people\n\n[[Zarina Shadowblade]]\n[[Jax...,17,"[-0.0019484994700178504, -0.020373281091451645...",0.787722
31,The Rightous Brand is created in response to t...,---\nfc-calendar: PirateHomeGame\nfc-date: 108...,159,"[-0.010819816030561924, -0.01890472136437893, ...",0.786723
57,The Righteous Brand.md,# The Righteous Brand\n\n- Summary\n- History\...,162,"[-0.003133234567940235, -0.020077712833881378,...",0.785974
34,The Crew rescue Voyce Fayette from Lizardfolk.md,---\nfc-calendar: PirateHomeGame\naliases: \nc...,163,"[-0.008282072842121124, -0.019093787297606468,...",0.780697
27,Voyce retires from piracy.md,---\naliases: \nfc-calendar: PirateHomeGame\nc...,166,"[-0.021369343623518944, -0.03256009519100189, ...",0.773597
7,Summary of the Menagerie Coast.md,---\ntype:\naliases: \ntags: \ntitle: Summary ...,217,"[0.013764887116849422, -0.021770406514406204, ...",0.773535
60,The Revelry.md,---\ntype:\naliases: \ntags: \ntitle: The Reve...,2138,"[0.01417054608464241, -0.021273348480463028, 0...",0.771625
33,Krav escapes imprisonment.md,---\naliases: \ncampaign: Pirates\ntags: timel...,155,"[-0.0037840991280972958, -0.024232249706983566...",0.768377
16,Celestial Solstice.md,An rare alignment of heavenly bodies and leyli...,83,"[0.019645312801003456, -0.01342153362929821, -...",0.768001
