In [2]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import OpenAI
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


In [5]:
openai_api_key = os.environ["OPENAI_API_KEY"]
#azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [6]:
openai_client = OpenAI(
  api_key = openai_api_key,  
  #api_version = "2023-07-01-preview",
  #azure_endpoint = azure_openai_endpoint
)
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [7]:
collection = chroma_client.create_collection(name="titanic_small")

In [27]:
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(file_dir)
#df = pd.read_csv(file_dir, nrows=5)

In [30]:
#df.shape
df[:5]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: Process in chunks if dataset is big.

In [31]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = openai_client.embeddings.create(
        input = output_str,
        model= "text-embedding-ada-002"
    )
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({"source": "titanic_small"})
    ids.append(f"id{index}")

In [32]:
docs

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. James Moran,\nSex: male,\nAge: 27,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.4583,\n',
 'Survived: 0,\n

In [33]:
print(metadatas)
print(ids)

[{'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}]
['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id13', 'id14', 'id15', 'id16', 'i

In [34]:
embeddings[0][:10]

[-0.005519494414329529,
 -0.019919363781809807,
 -0.017487507313489914,
 -0.025138292461633682,
 0.010314896702766418,
 0.03456515073776245,
 -0.01339570339769125,
 -0.0007778866565786302,
 -0.024960683658719063,
 -0.017528492957353592]

In [36]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

Add of existing embedding ID: id0
Add of existing embedding ID: id1
Add of existing embedding ID: id2
Add of existing embedding ID: id3
Add of existing embedding ID: id4
Add of existing embedding ID: id5
Add of existing embedding ID: id6
Add of existing embedding ID: id7
Add of existing embedding ID: id8
Add of existing embedding ID: id9
Add of existing embedding ID: id10
Add of existing embedding ID: id11
Add of existing embedding ID: id12
Add of existing embedding ID: id13
Add of existing embedding ID: id14
Add of existing embedding ID: id15
Add of existing embedding ID: id16
Add of existing embedding ID: id17
Add of existing embedding ID: id18
Add of existing embedding ID: id19
Add of existing embedding ID: id20
Add of existing embedding ID: id21
Add of existing embedding ID: id22
Add of existing embedding ID: id23
Add of existing embedding ID: id24
Add of existing embedding ID: id25
Add of existing embedding ID: id26
Add of existing embedding ID: id27
Add of existing embedding ID: 

Verify the vectorDB creation

In [37]:
print("Number of vectors in vectordb:", collection.count())

Number of vectors in vectordb: 30


### RAG

In [16]:
from openai import OpenAI

In [17]:
model_name = "gpt-4o-mini"
openai_api_key = os.environ["OPENAI_API_KEY"]
#openai_endpoint = os.environ["OPENAI_API_BASE"]

In [18]:
openai_client = OpenAI(
    #api_version=os.getenv("OPENAI_API_VERSION"),
    #azure_endpoint=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

**Perform similarity search**

In [38]:
query_texts = "what's the average age of survivors"
response = openai_client.embeddings.create(
        input = query_texts,
        model= "text-embedding-ada-002"
    )
query_embeddings = response.data[0].embedding

**Load the chromaDB collection for vector search**

In [39]:
vectordb = chroma_client.get_collection(name="titanic_small")
vectordb.count()

30

In [40]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

{'ids': [['id13']],
 'distances': [[0.4679450325701855]],
 'metadatas': [[{'source': 'titanic_small'}]],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. Anders Johan Andersson,\nSex: male,\nAge: 39,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 5,\nFare: 31.275,\n']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Pass the results to an LLM

In [41]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

In [42]:
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

In [43]:
response.choices[0].message.content

"The search results do not provide a direct answer to the average age of survivors. However, if you are looking for the average age of survivors specifically from the Titanic dataset mentioned, it would typically require a calculation based on individual ages of all survivors recorded in that dataset. If you can provide me with more data on the survivors' ages or point me to specific datasets, I can help you calculate the average."

**Fact check**

In [44]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
5,0,3,Mr. James Moran,male,27,0,0,8.4583
6,0,1,Mr. Timothy J McCarthy,male,54,0,0,51.8625
7,0,3,Master. Gosta Leonard Palsson,male,2,3,1,21.075
8,1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27,0,2,11.1333
9,1,2,Mrs. Nicholas (Adele Achem) Nasser,female,14,1,0,30.0708
