# Use embeddings to search the dataset with a query string

In [1]:
import ast
import os
import pprint

import openai
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

In [2]:
# Load the `dotenv` IPython extension, then invoke its line magic.
# NOTE(review): presumably this reads a local .env file into the process environment so
# that OPENAI_API_KEY is visible to os.getenv in the next cell — confirm the .env location.
%load_ext dotenv
%dotenv

In [3]:
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fail here, with an actionable message, rather than on the first API call.
# The truthiness check also rejects an empty value (e.g. a bare `OPENAI_API_KEY=` line),
# which `is not None` would let through.
assert openai.api_key, (
    "OPENAI_API_KEY is not set (or is empty); "
    "define it in the environment or in a .env file before running this notebook"
)

Load the dataset and the embeddings:

In [4]:
import numpy as np

def load_df(folder_path: str, file_name: str) -> pd.DataFrame:
    """Load the embedded-sections dataset from a CSV file into a pandas DataFrame.

    Columns this loader relies on:
    - n_tokens: cast to int
    - embeddings: a Python list literal stored as text (e.g. "[0.1, -0.2]"),
      parsed into one np.ndarray per row

    Every other column is returned exactly as ``pd.read_csv`` parsed it. In this
    notebook's data those are ``Unnamed: 0`` (the saved index), ``filename``,
    ``title`` and ``content``.

    Args:
        folder_path: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``folder_path``.

    Returns:
        The loaded DataFrame with ``n_tokens`` as int and ``embeddings`` as arrays.

    Raises:
        ValueError, SyntaxError: If an ``embeddings`` cell is not a plain Python literal.
    """
    df = pd.read_csv(os.path.join(folder_path, file_name))
    df["n_tokens"] = df["n_tokens"].astype(int)
    # ast.literal_eval accepts literals only, so a crafted CSV cell cannot run
    # arbitrary code the way the builtin eval() would.
    df["embeddings"] = df["embeddings"].apply(ast.literal_eval).apply(np.array)
    return df

In [5]:
# Relative folder the embeddings CSV is read from.
FOLDER = "tmp/"
# Parse the CSV; load_df converts the `embeddings` column into NumPy arrays.
df = load_df(FOLDER, "embeddings-sections.csv")
# Preview the first rows as a quick check of the parsed columns.
df.head()

Unnamed: 0,filename,title,content,n_tokens,embeddings
0,The Astral Horde.md,[No Header],treasure in the astral sea astral sea accessed...,53,"[-0.016966022551059723, -0.03141343221068382, ..."
1,The Astral Horde.md,The Astral Horde,This is a series of adventures in which the pa...,232,"[-0.01265875156968832, -0.031234312802553177, ..."
2,The Astral Horde.md,Arc steps,1. Level Range\n - I can see this advent...,151,"[0.01715025119483471, -0.03706667199730873, -0..."
3,1 Planes Gone Wild.md,Planes Gone Wild,\[\[Pasted image 20221008163518.png\]\],16,"[-0.001117889885790646, -0.006661985535174608,..."
4,1 Planes Gone Wild.md,Background,The Planar Seal atop the Hearthstar Peaks was ...,90,"[0.006391542498022318, -0.01329746749252081, -..."


In [6]:
EMBEDDINGS_MODEL = "text-embedding-ada-002"

def search_df(df, query, filter=None, top_n=10):
    """Rank the rows of ``df`` by embedding similarity to ``query``.

    The query text is embedded with ``EMBEDDINGS_MODEL`` and compared against
    each row's ``embeddings`` value via ``cosine_similarity``. The frame passed
    in is left untouched; scoring happens on a new frame.

    Args:
        df: Frame with an ``embeddings`` column, plus a ``filename`` column
            when ``filter`` is given.
        query: Text to search for.
        filter: Optional callable applied to every ``filename``; the resulting
            boolean mask selects the rows to rank. ``None`` ranks all rows.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with an added ``similarity`` column, sorted from most
        to least similar and cut to at most ``top_n`` rows.
    """
    query_vector = get_embedding(query, engine=EMBEDDINGS_MODEL)

    def score(row_vector):
        return cosine_similarity(query_vector, row_vector)

    candidates = df if filter is None else df[df.filename.apply(filter)]
    # assign() works on a copy, so neither the caller's frame nor the filtered view is mutated.
    ranked = candidates.assign(similarity=candidates.embeddings.apply(score))
    return ranked.sort_values("similarity", ascending=False).head(top_n)

In [7]:
# try out the search: show the 10 rows ranked most similar to the query "The Pirate Queen"
search_df(df, "The Pirate Queen", top_n=10)

Unnamed: 0,filename,title,content,n_tokens,embeddings,similarity
43,The Pirate Queen.md,The Pirate Queen,The Pirate Queen is a goddess of the pirates a...,392,"[-0.00825242418795824, -0.02610553801059723, 0...",0.887743
225,Temple of the Pirate Queen.md,[No Header],**The Pirate Queen's Temple**: - The walls of ...,177,"[-0.003187410533428192, -0.00827428512275219, ...",0.863848
44,The Pirate Queen.md,Temple,A temple to the Pirate Queen on Darktow may be...,680,"[-0.0034151242580264807, -0.019468415528535843...",0.853297
106,PHG-S2-Prep.md,New Crew,**Gwendolin**. Strongest woman.\n**\[\[People/...,29,"[-0.007602601312100887, -0.020341362804174423,...",0.851806
102,PHG-S2-Prep.md,Scenes,"(1) = 15m\n- Piratical party at Darktow, cel...",194,"[-0.0015426205936819315, -0.029612548649311066...",0.850708
65,PHG-S3-Prep.md,Important NPCs,**Voyce Fayette.** Drunk dwarf pirate on a mis...,116,"[-0.004090995993465185, -0.03364014998078346, ...",0.850685
198,Pirate Home Game.md,Friendly,\[\[The Revelry\]\] - \[\[Factions/Worshippers...,26,"[0.006534031592309475, -0.024336498230695724, ...",0.84871
222,The Revelry.md,Customs and Traditions,"- Worship of the Pirate Queen, the goddess o...",399,"[-0.005146820563822985, -0.018383916467428207,...",0.846649
59,PHG-S3-Prep.md,Characters,**Misty Waters**. Gnomish pirate ranger. Capta...,211,"[1.97819772438379e-05, -0.021489519625902176, ...",0.843951
42,The Heart of the Sea.md,The Heart of the Sea,The Heart of the Sea was a powerful crystal th...,184,"[-0.006328409072011709, -0.018067825585603714,...",0.840909


In [8]:
# try out the search with a filter: only rows whose filename does not contain "PHG" are ranked
search_df(df, "merchants", filter=lambda filename: "PHG" not in filename, top_n=10)

Unnamed: 0,filename,title,content,n_tokens,embeddings,similarity
34,TGF plunder the Goodbarrels' merchant ship.md,TGF Plunder the Goodbarrels' Merchant Ship,"[ They get some loot, a new crew member in Enn...",83,"[-0.014707420021295547, -0.030710013583302498,...",0.806992
218,The Revelry.md,Relationships,- The Clovis Concord views the Revelry as a ...,273,"[0.014430521987378597, -0.03062344342470169, 0...",0.805005
207,The Righteous Brand.md,Summary,Created by the Clovis Concord - under pressure...,93,"[-0.006776900961995125, -0.027880903333425522,...",0.801414
224,Factions.md,[No Header],- The Clovis Concord: The Clovis Concord is ...,480,"[0.019089490175247192, -0.01794874109327793, 0...",0.798272
246,Summary of the Menagerie Coast.md,[No Header],The Menagerie Coast is a tropical region locat...,176,"[0.013720354065299034, -0.02707294560968876, 0...",0.796734
36,The Rightous Brand is created in response to t...,[No Header],[ A push by the guilds to counter the Revelry ...,94,"[-0.009481392800807953, -0.02470739372074604, ...",0.794077
259,Chaedi Witherthin.md,Chaedi Witherthin,Shadar-kai owner of the Screaming Compass gene...,44,"[0.005633740220218897, -0.02472827583551407, -...",0.782947
198,Pirate Home Game.md,Friendly,\[\[The Revelry\]\] - \[\[Factions/Worshippers...,26,"[0.006534031592309475, -0.024336498230695724, ...",0.782075
216,The Revelry.md,The Revelry,The Revelry is a group of pirates that operate...,298,"[0.017536960542201996, -0.03126397356390953, 0...",0.779092
21,The Isle of Shadows.md,Mayor Luuciko (he/him),- The human mayor of the town is afflicted w...,98,"[0.0014107857132330537, -0.03144240006804466, ...",0.778107


In [9]:
COMPLETION_MODEL = "gpt-3.5-turbo"

def generate_answer(prompt, context=None):
    """Send ``prompt`` to the chat model and return the text of its reply.

    Args:
        prompt: The question or instruction for the model.
        context: Optional iterable of strings. When given, the items are joined
            with newlines and appended to the prompt under a ``Context:``
            header. ``None`` sends the prompt on its own.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    if context is None:
        user_text = prompt
    else:
        joined_context = "\n".join(context)
        user_text = f"{prompt}\n\nContext:\n{joined_context}"

    # The whole request is a single user turn; no system message is sent.
    user_message = {"content": user_text, "role": "user"}
    completion = openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def generate_answer_with_search(prompt, top_n: int = 5, filter=None, source_df=None):
    """Answer ``prompt`` using the most similar dataset rows as context.

    The prompt itself is used as the search query: the ``content`` of the
    ``top_n`` best-matching rows is passed to ``generate_answer`` as context.

    Args:
        prompt: The question or instruction; also used as the search query.
        top_n: Number of rows to include as context.
        filter: Optional filename predicate forwarded to ``search_df`` to
            restrict which rows may be used as context.
        source_df: Frame to search. Defaults to the notebook-level ``df`` so
            existing calls keep working; pass a frame explicitly to avoid
            depending on that global.

    Returns:
        The model's answer as returned by ``generate_answer``.
    """
    frame = df if source_df is None else source_df
    hits = search_df(frame, prompt, filter=filter, top_n=top_n)
    return generate_answer(prompt, hits["content"].tolist())

In [10]:
query = "Summarise the factions of the Menagerie Coast"

response = generate_answer_with_search(query, top_n=5)

# Wrap the long answer at 150 columns so it stays readable in the output area.
pprint.pprint(response, width=150)

('The Menagerie Coast is ruled by the Clovis Concord, a nation of seven city-states known for trade, art, and piracy. The Gentlemen is a powerful '
 'crime syndicate, while the Gilded Consortium is a group of guilds controlling most of the trade. The Righteous Brand mercenary company fights the '
 'Revelry pirates, a group of pirates operating along the Menagerie Coast. The Revelry was formed to oppose the guild system on the coast and is led '
 'by a council of captains and quartermasters, including Astra McGee and Nicor Veras. The Institute is a secretive group of spellcasters researching '
 'magical phenomena and has damaged the Planar Seal that prevents the intersection of planes of existence.')


In [11]:
query = "Generate a villain who is part of the Gilded Consortium"

response = generate_answer_with_search(query, top_n=5)

# Wrap the long answer at 150 columns so it stays readable in the output area.
pprint.pprint(response, width=150)

("**Cornelius Van Kittensteijn** is a member of the Magicians' Guild within the Gilded Consortium. He has risen to a position of power within the "
 'organization thanks to his expertise in dark magic and his charisma. Cornelius leads a secretive cult that worships Orcus, the demon prince of '
 'undeath. He sees himself as a chosen one who will help bring about a new world order, where society is ruled by the strongest and most '
 'intelligent. \n'
 '\n'
 "Cornelius is a tabaxi with black and white fur and piercing green cat's eyes. He dresses in fine robes adorned with symbols of his dark patron. He "
 'is known for his charm and his ability to sway those around him to his cause. Despite his charming exterior, Cornelius is a ruthless leader who '
 'will do whatever it takes to achieve his goals.\n'
 '\n'
 "The PCs may encounter Cornelius as they investigate the Gilded Consortium's involvement in illicit activities. Cornelius may try to recruit them "
 'to his cause or offer them a dea

In [14]:
query = "Who is Ringlety Binch?"

response = generate_answer_with_search(query, top_n=10)

# Wrap the answer at 150 columns so it stays readable in the output area.
pprint.pprint(response, width=150)

'There is no information available about Ringlety Binch in the given context.'
