# [STARTER] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside folder `project/starter/games`. Each file will become a document in the collection you'll create.
Example.:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [None]:
# # Only needed for Udacity workspace

# import importlib.util
# import sys

# # Check if 'pysqlite3' is available before importing
# if importlib.util.find_spec("pysqlite3") is not None:
#     import pysqlite3
#     sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [1]:
import os
import json
from pathlib import Path

import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv


In [2]:
# Path to the environment file that stores API keys required by the project.
ENV_PATH = Path('.env')

if not ENV_PATH.exists():
    raise FileNotFoundError(
        f"Missing environment file at {ENV_PATH.resolve()}. "
        "Create it and define OPENAI_API_KEY, CHROMA_OPENAI_API_KEY, and TAVILY_API_KEY."
    )


In [3]:
load_dotenv(ENV_PATH)

required_keys = [
    'OPENAI_API_KEY',
    'TAVILY_API_KEY',
]
missing = [key for key in required_keys if not os.getenv(key)]
if missing:
    raise EnvironmentError(f"Missing required environment variables: {', '.join(missing)}")

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
CHROMA_OPENAI_API_KEY = os.getenv('CHROMA_OPENAI_API_KEY') or OPENAI_API_KEY

if not CHROMA_OPENAI_API_KEY:
    raise EnvironmentError(
        'Set CHROMA_OPENAI_API_KEY or OPENAI_API_KEY to enable embedding generation.'
    )


### VectorDB Instance

In [4]:
CHROMA_DB_PATH = Path('chromadb')
CHROMA_DB_PATH.mkdir(exist_ok=True)

chroma_client = chromadb.PersistentClient(path=str(CHROMA_DB_PATH))
print(f'ChromaDB persistent client initialized at {CHROMA_DB_PATH.resolve()}')


ChromaDB persistent client initialized at C:\Users\Ohara\Desktop\__Udacity\agentic_ai\AI_research_agent_video_game_industry\starter\chromadb


### Collection

In [5]:
EMBEDDING_MODEL = 'text-embedding-3-small'
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=CHROMA_OPENAI_API_KEY,
    model_name=EMBEDDING_MODEL,
    api_base=os.getenv('OPENAI_BASE_URL', 'https://openai.vocareum.com/v1'),
)


In [6]:
COLLECTION_NAME = 'udaplay_games'
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn,
    metadata={'description': 'Video game knowledge base for UdaPlay'},
)
print(f"Connected to collection '{COLLECTION_NAME}' with {collection.count()} existing documents.")


Connected to collection 'udaplay_games' with 0 existing documents.


### Add documents

In [9]:
data_dir = (ENV_PATH.parent / 'games').resolve()
if not data_dir.exists():
    raise FileNotFoundError(f'Missing data directory: {data_dir}')

existing_ids = set()
if collection.count() > 0:
    stored = collection.get()
    existing_ids = set(stored.get('ids', []))

records_added = 0
for file_path in sorted(data_dir.glob('*.json')):
    with file_path.open('r', encoding='utf-8') as f:
        game = json.load(f)

    content = (
        f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - "
        f"{game['Description']}"
    )
    doc_id = file_path.stem

    if doc_id in existing_ids:
        continue

    collection.add(
        ids=[doc_id],
        documents=[content],
        metadatas=[game],
    )
    records_added += 1

print(
    f'Indexed {records_added} new documents. '
    f'Collection now contains {collection.count()} documents.'
)


Indexed 7 new documents. Collection now contains 22 documents.


In [10]:
sample_query = 'Who developed Gran Turismo? '
results = collection.query(
    query_texts=[sample_query],
    n_results=3,
    include=['metadatas', 'documents', 'distances'],
)

top_docs = results.get('documents', [[]])[0]
top_metadata = results.get('metadatas', [[]])[0]
top_distances = results.get('distances', [[]])[0]

for rank, (doc, meta, distance) in enumerate(zip(top_docs, top_metadata, top_distances), 1):
    similarity = 1 - distance if distance is not None else None
    similarity_str = f"{similarity:.3f}" if similarity is not None else 'N/A'
    print(
        f"{rank}. {meta.get('Name')} - Publisher: {meta.get('Publisher')} "
        f"(sim={similarity_str})"
    )


1. Gran Turismo - Publisher: Sony Computer Entertainment (sim=0.650)
2. Gran Turismo 5 - Publisher: Sony Computer Entertainment (sim=0.601)
3. Grand Theft Auto VI - Publisher: Rockstar Games (sim=0.392)
