In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import AzureOpenAI
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

True


In [3]:
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [4]:
azure_client = AzureOpenAI(
  api_key = azure_openai_api_key,  
  api_version = "2023-07-01-preview",
  azure_endpoint = azure_openai_endpoint
)
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

⚠️ It looks like you upgraded from a version below 0.5.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


**Create a collection for data injection**

Throws an error if the table already exists

In [5]:
collection = chroma_client.create_collection(name="titanic_small")

UniqueConstraintError: Collection titanic_small already exists

In [6]:
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(file_dir, nrows=5)

In [7]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: Process in chunks if dataset is big.

In [8]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = azure_client.embeddings.create(
        input = output_str,
        model= "text-embedding-ada-002"
    )
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({"source": "titanic_small"})
    ids.append(f"id{index}")

BadRequestError: Unsupported data type

In [9]:
docs

[]

In [10]:
print(metadatas)
print(ids)

[]
[]


In [11]:
embeddings[0][:10]

IndexError: list index out of range

In [12]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

NameError: name 'collection' is not defined

Verify the vectorDB creation

In [13]:
print("Number of vectors in vectordb:", collection.count())

NameError: name 'collection' is not defined

### RAG

In [14]:
from openai import AzureOpenAI

In [15]:
model_name = "gpt-35-turbo"
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [16]:
azure_client = AzureOpenAI(
    api_version=os.getenv("OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

**Perform similarity search**

In [18]:
query_texts = "what's the average age of survivors"
response = azure_client.embeddings.create(
        input = query_texts,
        model= "text-embedding-3-small"
    )
query_embeddings = response.data[0].embedding

BadRequestError: Unsupported data type

**Load the chromaDB collection for vector search**

In [19]:
vectordb = chroma_client.get_collection(name="titanic_small")
vectordb.count()

30

In [21]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

NameError: name 'query_embeddings' is not defined

Pass the results to an LLM

In [22]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

NameError: name 'results' is not defined

In [24]:
response = azure_client.chat.completions.create(
    model=os.getenv("gpt_deployment_name"),
    messages=messages
)

NameError: name 'messages' is not defined

In [25]:
response.choices[0].message.content

NameError: name 'response' is not defined

**Fact check**

In [None]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


: 