In [None]:
import chromadb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Open a ChromaDB persistent client rooted at ./chromadb. The path is relative,
# so it resolves against the notebook's current working directory.
client = chromadb.PersistentClient(path="./chromadb")

# Get the "recipes" collection, creating it if it does not exist yet.
# NOTE(review): a freshly created collection is presumably empty, in which case
# the sampling/plotting cells below have nothing to work with.
collection = client.get_or_create_collection(name="recipes")

In [None]:
def query_with_metadata(collection, sample_size=500):
    """
    Randomly sample embeddings whose 'category' metadata is defined and non-empty.

    Candidates are listed with a single metadata-only query, sampled without
    replacement on the client, and then fetched (with embeddings) in one more
    query. The function therefore always terminates, even when fewer than
    ``sample_size`` records qualify.

    Args:
        collection: ChromaDB collection, or any object exposing a compatible
            ``get(ids=..., where=..., include=...)`` method.
        sample_size: Maximum number of records to return. If fewer records
            qualify, all of them are returned.

    Returns:
        tuple: ``(embeddings, categories)`` where ``embeddings`` is a NumPy
        array with one row per sampled record and ``categories`` is the list of
        matching 'category' values in the same order. Both are empty when
        nothing qualifies or ``sample_size`` is not positive.
    """
    if sample_size <= 0:
        return np.array([]), []

    # List every candidate once instead of issuing one query per sample.
    listing = collection.get(
        where={"category": {"$ne": ""}},  # Skip records with an empty 'category'
        include=["metadatas"],
    )

    # Re-check on the client: depending on the Chroma version, "$ne" may also
    # match records that lack the key entirely (TODO confirm for the installed
    # version), and those would break the category lookup below.
    candidate_ids = [
        record_id
        for record_id, metadata in zip(listing["ids"], listing["metadatas"])
        if metadata and metadata.get("category") not in (None, "")
    ]
    if not candidate_ids:
        return np.array([]), []

    # Sample distinct records; uses NumPy's global RNG, so np.random.seed()
    # makes the draw reproducible.
    n_samples = min(sample_size, len(candidate_ids))
    picked = np.random.choice(len(candidate_ids), size=n_samples, replace=False)
    sampled_ids = [candidate_ids[index] for index in picked]

    result = collection.get(
        ids=sampled_ids,
        include=["embeddings", "metadatas"],
    )

    # Read both fields from the same response so rows and labels stay aligned
    # regardless of the order in which the records come back.
    embeddings = np.array(result["embeddings"])
    categories = [metadata["category"] for metadata in result["metadatas"]]

    return embeddings, categories


In [None]:

def visualize_embeddings_by_category(collection, sample_size=500):
    """
    Plot a 2-D t-SNE projection of sampled embeddings, colored by category.

    Args:
        collection: Collection handed to ``query_with_metadata`` to draw the
            sample from.
        sample_size: Number of records to request from ``query_with_metadata``.

    Raises:
        ValueError: If fewer than two embeddings are available, since t-SNE
            cannot be fitted on them.
    """
    # Fetch embeddings and their category labels
    print("Querying database...")
    embeddings, categories = query_with_metadata(collection, sample_size)

    n_samples = len(categories)
    if n_samples < 2:
        raise ValueError(
            f"Need at least 2 categorized embeddings to run t-SNE, got {n_samples}."
        )

    print("Creating t-SNE...")
    # t-SNE requires perplexity < n_samples, so shrink it for small samples.
    perplexity = min(35, n_samples - 1)
    # The iteration count is left at the library default (1000, the value used
    # before) because its keyword was renamed from n_iter to max_iter across
    # scikit-learn releases; omitting it works on both.
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_embeddings = tsne.fit_transform(embeddings)

    # Assign one color per category. Sorting keeps the assignment stable across
    # kernel sessions (plain set order is not); key=str tolerates mixed types.
    unique_categories = sorted(set(categories), key=str)
    cmap = plt.get_cmap("tab20b")  # Use a colormap with many colors
    # Spread categories evenly over the colormap. With more categories than the
    # colormap has colors, some categories necessarily share a color.
    positions = np.linspace(0.0, 1.0, len(unique_categories))
    category_colors = {
        category: cmap(float(position))
        for category, position in zip(unique_categories, positions)
    }
    point_colors = [category_colors[category] for category in categories]

    # Plot using matplotlib
    plt.figure(figsize=(12, 8))
    plt.scatter(
        reduced_embeddings[:, 0],
        reduced_embeddings[:, 1],
        c=point_colors,
        alpha=0.7,
    )

    # Build legend handles from the very same colors used for the points, so
    # the legend always matches the plot.
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=category_colors[category],
            markersize=10,
        )
        for category in unique_categories
    ]
    plt.legend(
        legend_handles,
        unique_categories,
        loc="upper right",
        title="Category",
        bbox_to_anchor=(1.3, 1),
    )

    plt.title("t-SNE Visualization of Recipe Embeddings by Category")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.grid(True)
    plt.show()

In [None]:
# Draw the category-colored t-SNE plot for a random sample of 500 records.
visualize_embeddings_by_category(collection=collection, sample_size=500)