In [2]:
# Install packages
!pip install chromadb langchain openai beautifulsoup4 langchain-community langchain-openai sentence_transformers -q

In [3]:
# Import required modules
import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
import requests
import os
import pandas as pd

In [4]:
# Get OpenAI API key from environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

if not openai_api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

In [5]:
# Setup Langchain and OpenAI Embeddings

from langchain_openai import OpenAIEmbeddings
import os

# Ensure OpenAI API key is set
if "OPENAI_API_KEY" not in os.environ:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Initialize OpenAI Embeddings
embed_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

# Test the embeddings
test_text = "Hello, world!"
test_embedding = embed_model.embed_query(test_text)

print(f"Embedding dimension: {len(test_embedding)}")
print(f"First 5 values of the embedding: {test_embedding[:5]}")

# Function to get embeddings for a list of texts
def get_embeddings(texts):
    return embed_model.embed_documents(texts)

print("Embeddings setup complete!")


Embedding dimension: 1536
First 5 values of the embedding: [-0.019184619188308716, -0.025279032066464424, -0.0017195191467180848, 0.01884828321635723, -0.033795066177845]
Embeddings setup complete!


In [6]:
# Jupyter Notebook Cell: ChromaDB Initialization

import chromadb
from chromadb.config import Settings
import os

# Set up ChromaDB with a persistent directory
PERSISTENCE_DIRECTORY = "./chroma_db"
os.makedirs(PERSISTENCE_DIRECTORY, exist_ok=True)

# Initialize ChromaDB client with file-based storage
chroma_client = chromadb.PersistentClient(path=PERSISTENCE_DIRECTORY)

# Create or get the 'babynames' collection
collection_name = "babynames"
collection = chroma_client.get_or_create_collection(name=collection_name)

print(f"ChromaDB initialized with persistence directory: {PERSISTENCE_DIRECTORY}")
print(f"Collection '{collection_name}' is ready.")
print(f"Current document count in collection: {collection.count()}")

# Function to get the collection (useful for other cells)
def get_babynames_collection():
    return chroma_client.get_collection(collection_name)

print("\nChromaDB and collection initialization complete!")

ChromaDB initialized with persistence directory: ./chroma_db
Collection 'babynames' is ready.
Current document count in collection: 0

ChromaDB and collection initialization complete!


In [7]:
def format_enriched_baby_names(file='./EnrichedBabyNamesData.csv'):
    # Read the CSV file
    df = pd.read_csv(file)
    
    # Filter rows where meaning is not null/none
    df_filtered = df.dropna(subset=['meaning'])
    
    # Function to format each row
    def format_row(row):
        return f"Name: {row['name']}|Sex: {row['sex']}|Nation: {row['nationality']}|Meaning: {row['meaning']}"
    
    # Apply the formatting function to each row and join the results
    formatted_strings = df_filtered.apply(format_row, axis=1).tolist()
    
    # Join all formatted strings with newlines
    result = '\n'.join(formatted_strings)
    
    print(f"Processed {len(df_filtered)} names with meanings.")
    print("First few formatted entries:")
    print('\n'.join(formatted_strings[:5]))  # Print first 5 entries as a sample
    
    return result

# Example usage
formatted_data = format_enriched_baby_names()

Processed 23007 names with meanings.
First few formatted entries:
Name: AARON|Sex: Male|Nation: Hebrew|Meaning: Exalted, high, or mountain of strength
Name: ABBAS|Sex: Male|Nation: Arabic|Meaning: The name Abbas means 'lion' or 'stern' in Arabic. It is often associated with strength and bravery.
Name: ABD|Sex: Male|Nation: Arabic|Meaning: The name 'Abd' means 'servant' or 'slave' in Arabic, often used in combination with one of the names of God, such as 'Abdullah' meaning 'servant of God'.
Name: ABDEL|Sex: Male|Nation: Arabic|Meaning: Servant of God
Name: ABDUL|Sex: Male|Nation: Arabic|Meaning: Servant of God


  df = pd.read_csv(file)


In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import chromadb
from chromadb.utils import embedding_functions

def batch_process_baby_names_to_chroma(file='./EnrichedBabyNamesData.csv', batch_size=100):
    # Initialize Chroma client and collection
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    collection = get_babynames_collection()

    # Read the CSV file in chunks
    for chunk in tqdm(pd.read_csv(file, chunksize=batch_size), desc="Processing batches"):
        # Filter rows where meaning is not null/none
        chunk_filtered = chunk.dropna(subset=['meaning'])
        
        if chunk_filtered.empty:
            continue
        
        # Format each row
        formatted_data = chunk_filtered.apply(lambda row: f"Name: {row['name']}|Sex: {row['sex']}|Nation: {row['nationality']}|Meaning: {row['meaning']}", axis=1).tolist()
        
        # Use names as IDs
        ids = chunk_filtered['name'].tolist()
        
        # Generate embeddings
        embeddings = embedding_function(formatted_data)
        
        # Prepare metadata
        metadatas = chunk_filtered.to_dict('records')
        
        # Add to Chroma collection
        collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=metadatas,
            documents=formatted_data
        )
    
    print(f"Processed and added {collection.count()} baby names to Chroma collection.")

# Example usage
batch_process_baby_names_to_chroma()

Processing batches: 0it [00:00, ?it/s]

Insert of existing embedding ID: AARON
Insert of existing embedding ID: ABBAS
Insert of existing embedding ID: ABD
Insert of existing embedding ID: ABDEL
Insert of existing embedding ID: ABDUL
Insert of existing embedding ID: ABDULKADIR
Insert of existing embedding ID: ABDULLAH
Insert of existing embedding ID: ABDULRAHMAN
Insert of existing embedding ID: ABDURRAHMAN
Insert of existing embedding ID: ADALBERT
Insert of existing embedding ID: ADAM
Insert of existing embedding ID: ADEL
Insert of existing embedding ID: ADEM
Insert of existing embedding ID: ADIL
Insert of existing embedding ID: ADIN
Insert of existing embedding ID: ADIS
Insert of existing embedding ID: ADMIR
Insert of existing embedding ID: ADNAN
Insert of existing embedding ID: ADOLF
Insert of existing embedding ID: ADRIAN
Insert of existing embedding ID: ADRIANO
Insert of existing embedding ID: AHMED
Insert of existing embedding ID: AJDIN
Insert of existing embedding ID: ALAA
Insert of existing embedding ID: ALAN
Insert of

In [13]:
print(collection.count())

23007


In [101]:
import chromadb
from chromadb.utils import embedding_functions

def query_baby_names(query_text, n_results=50):
    # Initialize Chroma client and get the collection
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    collection = get_babynames_collection()

    # Query the collection
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
        where={"sex": "Male"} 
    )

    # Process and print results
    print(f"Top {n_results} results for query: '{query_text}'\n")
    for i, (doc, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0]), 1):
        print(f"{i}. Distance: {distance:.4f}")
        print(f"   {doc}")

    return results

# Example usage
query_text = "intentional, punctual, timely, english, german, irish"
results = query_baby_names(query_text)

Top 50 results for query: 'intentional, punctual, timely, english, german, irish'

1. Distance: 1.1560
   Name: EION|Sex: Male|Nation: Irish|Meaning: God is gracious
2. Distance: 1.1689
   Name: DEVIN|Sex: Male|Nation: Irish|Meaning: Poet or bard
3. Distance: 1.1909
   Name: ERNIST|Sex: Male|Nation: German|Meaning: Earnest or serious
4. Distance: 1.1931
   Name: FINNEAN|Sex: Male|Nation: Irish|Meaning: Fair or white
5. Distance: 1.2037
   Name: EON|Sex: Male|Nation: Irish|Meaning: God is gracious
6. Distance: 1.2041
   Name: FINN|Sex: Male|Nation: Irish, Scandinavian|Meaning: Fair, white, or blessed
7. Distance: 1.2108
   Name: EIAN|Sex: Male|Nation: Irish|Meaning: God is gracious
8. Distance: 1.2181
   Name: SINCERE|Sex: Male|Nation: English|Meaning: Genuine, honest, or truthful
9. Distance: 1.2193
   Name: EAN|Sex: Male|Nation: Irish|Meaning: God is gracious
10. Distance: 1.2194
   Name: SHONE|Sex: Male|Nation: Irish|Meaning: God is gracious
11. Distance: 1.2273
   Name: ALAN|Sex: Ma