Vector Basics

In [8]:
import numpy as np

# Unit vectors along the first and second axis of the plane.
v1 = np.asarray((1, 0))
v2 = np.asarray((0, 1))

NOTE: download the static English model embeddings first:
`python -m spacy download en_core_web_md`

In [9]:
import spacy
# Load the `en_core_web_md` pipeline (it must already be downloaded).
nlp = spacy.load('en_core_web_md')

# Look up the vector stored in the pipeline's vocabulary for the word "dog".
dog_embedding = nlp.vocab['dog'].vector

In [10]:
# Inspect the dimensionality of the "dog" vector.
dog_embedding.shape

(300,)

In [11]:
# Peek at the first ten components instead of displaying the whole vector.
dog_embedding[0:10]

array([  1.233  ,   4.2963 ,  -7.9738 , -10.121  ,   1.8207 ,   1.4098 ,
        -4.518  ,  -5.2261 ,  -0.29157,   0.95234], dtype=float32)

In [12]:
def compute_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))``.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine of the angle between ``v1`` and ``v2``. If either vector
        has zero norm the angle is undefined; ``0.0`` is returned in that
        case instead of dividing by zero and producing NaN.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # A zero vector has no direction, so guard the 0/0 division that would
    # otherwise yield NaN together with a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

In [13]:
# Fetch the vocabulary vectors for three words so they can be compared pairwise.
dog_embedding = nlp.vocab['dog'].vector  # repeats the lookup from the earlier cell
# NOTE(review): `cat_empedding` is a misspelling of "embedding"; the name is
# left unchanged here because the similarity cell below references it.
cat_empedding = nlp.vocab['cat'].vector
truck_embedding = nlp.vocab['truck'].vector

In [14]:
# Cosine similarity between the "dog" and "cat" word vectors.
compute_cosine_similarity(dog_embedding, cat_empedding)

0.8220817

In [15]:
# Cosine similarity between the "dog" and "truck" word vectors, for contrast.
compute_cosine_similarity(dog_embedding, truck_embedding)

0.25462714

Take it to the next level and analyze the similarity between sentences and documents.

In [16]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two pairs of sentences: the first two describe a dog barking, the last two
# describe someone eating a lot of pizza.
texts = [
         "The canine barked loudly.",
         "The dog made a noisy bark.",
         "He ate a lot of pizza.",
         "He devoured a large quantity of pizza pie.",
]

# Encode all sentences in a single call.
text_embeddings = model.encode(texts)

# Displayed as the cell output: one row per sentence.
text_embeddings.shape



(4, 384)

In [17]:
# Map each sentence to its embedding row so vectors can be looked up by text.
text_embeddings_dict = {
    sentence: vector for sentence, vector in zip(texts, text_embeddings)
}

In [18]:
# Keys must match the entries of `texts` exactly, since they are used to look
# up the embeddings in `text_embeddings_dict`.
dog_text_1 = "The canine barked loudly."
dog_text_2 = "The dog made a noisy bark."
# Cosine similarity between the two dog sentences' embeddings.
compute_cosine_similarity(text_embeddings_dict[dog_text_1], text_embeddings_dict[dog_text_2])

0.77686167

## ChromaDB

In [19]:
import chromadb
from chromadb.utils import embedding_functions

# Directory handed to the persistent client below.
CHROMA_DATA_PATH = "chroma_data/"
# Model name passed to the collection's embedding function in the next cell.
EMBEDED_MODEL = "all-MiniLM-L6-v2"
# Name of the collection created in the next cell.
COLLECTION_NAME = "intro_docs"

# Client whose storage location is CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [20]:
# Embedding function backed by the sentence-transformers model named in
# EMBEDED_MODEL; it is attached to the collection below.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDED_MODEL
)

# The client persists to disk, so on any run after the first the collection
# already exists and `create_collection` would raise. Using
# `get_or_create_collection` keeps this cell idempotent under
# Restart Kernel -> Run All.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},  # distance space used by the index
)



In [21]:
# Ten short documents, each about a different topic.
documents = [
    "The latest iPhone model comes with impressive features and a powerful camera.",
    "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
    "Einstein's theory of relativity revolutionized our understanding of space and time.",
    "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
    "The American Revolution had a profound impact on the birth of the United States as a nation.",
    "Regular exercise and a balanced diet are essential for maintaining good physical health.",
    "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
    "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
    "Startup companies often face challenges in securing funding and scaling their operations.",
    "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
]

# One genre label per document, in the same order as `documents`.
genres = [
    "technology",
    "travel",
    "science",
    "food",
    "history",
    "fitness",
    "art",
    "climate change",
    "business",
    "music",
]

# The two lists are paired by position, so fail early if they drift apart.
assert len(documents) == len(genres), "each document needs exactly one genre"

# `upsert` instead of `add`: the ids are fixed ("id0".."id9") and the store is
# persistent, so re-running this cell would otherwise try to add ids that
# already exist. Upserting inserts new ids and overwrites existing ones.
collection.upsert(
    documents=documents,
    ids=[f"id{i}" for i in range(len(documents))],
    metadatas=[{"genre": g} for g in genres],
)

Now you're ready to run some semantic queries:

In [22]:
# Retrieve the single closest document (n_results=1) for a free-text query.
query_results = collection.query(
    query_texts = ["Find me some delicious food!"],
    n_results=1,
)

In [23]:
# List the fields present in the query result.
query_results.keys()

dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data'])

In [25]:
# Text of the matched document(s).
query_results["documents"]

[['Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.']]

In [24]:
# Distance of each match from the query (the collection was configured with
# "hnsw:space": "cosine").
query_results["distances"]

[[0.7638265072366848]]

Why metadata is important: without a filter, the top hit for a music query can come from an unrelated genre.

In [27]:
# Unfiltered query: the two closest documents across all genres.
collection.query(
    query_texts=["Tech me about music history."],
    n_results=2,
)

{'ids': [['id2', 'id9']],
 'distances': [[0.7489849268550266, 0.8206911901679086]],
 'metadatas': [[{'genre': 'science'}, {'genre': 'music'}]],
 'embeddings': None,
 'documents': [["Einstein's theory of relativity revolutionized our understanding of space and time.",
   "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'"]],
 'uris': None,
 'data': None}

You can filter on the metadata to restrict the search to more relevant documents.

In [29]:
# Same query, restricted via `where` to documents whose genre equals "music".
collection.query(
    query_texts=["Tech me about music history."],
    where={"genre": {"$eq": "music"}},
    n_results=1,
)

{'ids': [['id9']],
 'distances': [[0.8206911901679086]],
 'metadatas': [[{'genre': 'music'}]],
 'embeddings': None,
 'documents': [["Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'"]],
 'uris': None,
 'data': None}

You can fine-tune the metadata filter, e.g. match any of several genres with `$in`:

In [30]:

# Same query, restricted to documents whose genre is one of the listed values.
collection.query(
    query_texts=["Tech me about music history."],
    where={"genre": {"$in": ["music", "history"]}},
    n_results=2,
)

{'ids': [['id9', 'id4']],
 'distances': [[0.8206911901679086, 0.8229154649777397]],
 'metadatas': [[{'genre': 'music'}, {'genre': 'history'}]],
 'embeddings': None,
 'documents': [["Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
   'The American Revolution had a profound impact on the birth of the United States as a nation.']],
 'uris': None,
 'data': None}

In [37]:
# Fetch the stored records, requesting only the document text.
collection.get(
    include=["documents"],
)

{'ids': ['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9'],
 'embeddings': None,
 'metadatas': None,
 'documents': ['The latest iPhone model comes with impressive features and a powerful camera.',
  'Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.',
  "Einstein's theory of relativity revolutionized our understanding of space and time.",
  'Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.',
  'The American Revolution had a profound impact on the birth of the United States as a nation.',
  'Regular exercise and a balanced diet are essential for maintaining good physical health.',
  "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
  "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
  'Startup companies often face challenges in securing funding and scaling their operations.',
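  "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'"],
 'uris': None,
 'data': None}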

In [38]:
# Fetch a single record by its id.
collection.get(
    ids=["id1"],
)

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [{'genre': 'travel'}],
 'documents': ['Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.'],
 'uris': None,
 'data': None}

In [39]:
# Number of records currently stored in the collection.
collection.count()

10