### RAG

In [84]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import OpenAI
import warnings
from dotenv import load_dotenv

# Suppress all Python warnings so they do not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

# Load environment variables from a .env file and show the boolean that
# load_dotenv() returns.
dotenv_loaded = load_dotenv()
print(dotenv_loaded)

True


In [85]:
# OpenAI client, authenticated with the key held in the OPENAI_API_KEY
# environment variable.
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# ChromaDB client that persists its data on disk under the data/chroma path
# resolved by pyprojroot's here().
chroma_path = here("data/chroma")
chroma_client = chromadb.PersistentClient(path=str(chroma_path))

In [86]:
# Create the collection, or reuse it when an earlier run left it on disk.
# The Chroma client is persistent, so a plain create_collection() fails on a
# re-run whenever the cleanup cell at the end of the notebook was not reached.
# NOTE: a reused collection may already contain records from that earlier run.
collection_name = "titanic_small"
collection = chroma_client.get_or_create_collection(name=collection_name)

In [87]:
# Read the Titanic CSV from the project's data folder; only the first five
# rows are loaded (nrows=5) to keep the example small.
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(filepath_or_buffer=file_dir, nrows=5)

In [88]:
# Prepare documents, metadata, ids, and embeddings for ChromaDB.
# Each DataFrame row becomes one text document made of "column: value" lines.
docs, metadatas, ids, embeddings = [], [], [], []
for index, row in df.iterrows():
    docs.append("\n".join(f"{col}: {row[col]}" for col in df.columns) + "\n")
    metadatas.append({"source": collection_name})
    ids.append(f"id{index}")

# Embed the documents in batches: the embeddings endpoint accepts a list of
# inputs, so one request per batch replaces one network round trip per row.
# An empty DataFrame results in no request at all.
embedding_batch_size = 100
for start in range(0, len(docs), embedding_batch_size):
    response = client.embeddings.create(
        input=docs[start:start + embedding_batch_size],
        model="text-embedding-3-small"
    )
    # Every returned item carries the position of its input in the request;
    # sort on it so the embeddings stay aligned with docs / metadatas / ids.
    embeddings.extend(
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    )

In [89]:
# Write the prepared records (ids, texts, metadata, vectors) into the
# collection with a single add() call.
records = {
    "ids": ids,
    "embeddings": embeddings,
    "metadatas": metadatas,
    "documents": docs,
}
collection.add(**records)

In [90]:
# Report how many records the collection currently holds.
vector_count = collection.count()
print("Number of vectors in vectordb:", vector_count)

Number of vectors in vectordb: 5


In [91]:
# The user question for this example; the following cells embed it and use
# the embedding to search the collection.
query_text = "what's the average age of survivors"

In [92]:
# Embed the question with the same model that was used for the documents,
# so the query vector and the stored vectors are comparable.
response = client.embeddings.create(model="text-embedding-3-small", input=query_text)

In [93]:
# A single string was embedded, so the vector is taken from the first entry of
# the response data.
query_embedding = response.data[0].embedding

In [94]:
# Look the collection up by name and fetch the nearest stored record(s) for
# the query vector.
top_k = 1  # number of results to return
vectordb = chroma_client.get_collection(name=collection_name)
results = vectordb.query(n_results=top_k, query_embeddings=[query_embedding])

In [95]:
# Construct prompt for GPT model
system_role = "You will receive the user's question along with search results from a database. Provide the best possible answer."

# results["documents"] holds one list of document texts per query embedding;
# a single query was sent, so the first list is the one we need. Only the
# document texts go into the prompt, not the repr of the whole result dict
# (ids, distances, metadata, ...), which is noise for the model.
retrieved_docs = (results.get("documents") or [[]])[0]
if retrieved_docs:
    search_results = "\n\n".join(retrieved_docs)
else:
    # Make an empty retrieval explicit rather than sending a blank section.
    search_results = "(no matching documents found)"

prompt = f"User's question: {query_text} \n\nSearch results:\n{search_results}"

messages = [
    {"role": "system", "content": system_role},
    {"role": "user", "content": prompt}
]

In [96]:
# Send the system + user messages to the gpt-4-turbo chat model.
# Note: this rebinds `response`, which previously held the embedding result.
response = client.chat.completions.create(messages=messages, model="gpt-4-turbo")

In [97]:
# Show the text of the first completion choice.
answer = response.choices[0].message.content
print(answer)

The search results provided only include information about one individual survivor, Mrs. John Bradley (Florence Briggs Thayer) Cumings, and her age was 38. To obtain the average age of all survivors, information about the ages of all other survivors would be necessary. Based on the data provided, it is only possible to state the age of this one survivor. If you are looking for a comprehensive average, please provide more details or data on other survivors.


In [98]:
# Display the loaded rows so the model's answer can be compared with the data.
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


In [102]:
# # Clean up the collection data (delete collection)
# chroma_client.delete_collection(name=collection_name)
# print(f"Collection '{collection_name}' deleted successfully.")

In [99]:
# Clean up the collection data (delete collection).
# Errors are caught broadly and printed so the cell can be re-run even when
# the collection is already gone.
try:
    chroma_client.delete_collection(name=collection_name)
    print(f"Collection '{collection_name}' deleted successfully.")
except Exception as e:
    print(f"Error deleting collection: {e}")

# Verify collection deletion: after a successful delete, looking the
# collection up is expected to raise, and that error is printed as confirmation.
try:
    # Try to retrieve the collection after deletion
    chroma_client.get_collection(name=collection_name)
except Exception as e:
    print(f"Error retrieving collection after deletion: {e}")
else:
    # The lookup succeeded, so the collection is still there; say so instead
    # of finishing silently.
    print(f"Collection '{collection_name}' still exists after the deletion attempt.")

Collection 'titanic_small' deleted successfully.
Error retrieving collection after deletion: Collection titanic_small does not exist.


In [101]:
# Inspect the collection name; the Python variable is unaffected by the
# deletion of the collection itself.
collection_name

'titanic_small'