## Generate Embeddings

In [1]:
import ast
from pathlib import Path

import pandas as pd

SENTENCES_FILE = Path("sentences.csv")

# Raise instead of calling exit(): inside a notebook exit() asks the kernel to
# shut down, which discards all state instead of just stopping this cell.
if not SENTENCES_FILE.exists():
    raise FileNotFoundError(f"{SENTENCES_FILE} file not found.")

# Load the sentences and preview the last two rows.
dataset = pd.read_csv(SENTENCES_FILE)
dataset.tail(2)

Unnamed: 0,sentence
998,A man is on a rock high above some trees and i...
999,A man is jumping onto a low wall


Generate embeddings with the OpenAI client library, pointed at a free, locally running Ollama model.

In [2]:
from openai import OpenAI
import numpy as np

# Base URL the OpenAI client is pointed at (a local Ollama server).
OLLAMA_URL = "http://localhost:11434/v1"
# CSV file used to cache the sentences together with their embeddings.
EMBEDDED_SENTENCES_FILE = Path("embedded_sentences.csv")

# The OpenAI client insists on an api_key; Ollama itself does not need one,
# so a placeholder value is supplied.
client = OpenAI(api_key="ollama", base_url=OLLAMA_URL)

def get_embeddings(sentence: str) -> list[float]:
    """Return the embedding vector of ``sentence``.

    The request goes through the module-level ``client`` using the
    ``nomic-embed-text:latest`` model. If the request fails, the sentence and
    the error are printed and the original exception is re-raised.
    """
    try:
        result = client.embeddings.create(
            input=sentence,
            model="nomic-embed-text:latest",
        )
    except Exception as error:
        # Report which sentence failed, then let the caller see the error.
        print(f"Failed to get embedding for sentence: {sentence}")
        print(error)
        raise
    # Only one input was sent, so the vector is on the first data entry.
    return result.data[0].embedding

# Embedding every sentence is slow, so the result is cached in a CSV file and
# reloaded on later runs.
if EMBEDDED_SENTENCES_FILE.exists():
    dataset = pd.read_csv(EMBEDDED_SENTENCES_FILE)
    # Each vector was written as the text of a Python list. literal_eval parses
    # that literal without executing arbitrary code, unlike eval().
    dataset["embedding"] = dataset["embedding"].apply(ast.literal_eval)
else:
    dataset["embedding"] = dataset["sentence"].apply(get_embeddings)
    # Write while the column still holds plain lists so the CSV text stays parseable.
    dataset.to_csv(EMBEDDED_SENTENCES_FILE, index=False)

# Both branches end with the same column type (one numpy array per row).
dataset["embedding"] = dataset["embedding"].apply(np.array)

dataset.head(3)

Unnamed: 0,sentence,embedding
0,A little girl is smiling and running outside,"[0.027937938, 0.025827052, -0.15133108, 0.0199..."
1,A man is drawing on a digital dry erase board,"[0.030545507, 0.07927829, -0.11160514, -0.0660..."
2,A black bird is sitting on a dead tree,"[0.008163251, -0.0055234055, -0.15605377, -0.0..."


In [3]:
# Give every row a sequential identifier starting at 1.
row_count = len(dataset)
dataset["id"] = range(1, row_count + 1)
dataset.head(n=5)

Unnamed: 0,sentence,embedding,id
0,A little girl is smiling and running outside,"[0.027937938, 0.025827052, -0.15133108, 0.0199...",1
1,A man is drawing on a digital dry erase board,"[0.030545507, 0.07927829, -0.11160514, -0.0660...",2
2,A black bird is sitting on a dead tree,"[0.008163251, -0.0055234055, -0.15605377, -0.0...",3
3,An elderly man is sitting on a bench,"[-0.016438434, 0.06475461, -0.13259469, -0.085...",4
4,A man and a woman are sitting comfortably on t...,"[0.055450395, 0.06100677, -0.1635873, -0.01177...",5


Check the dimensionality of a single embedding vector

In [4]:
# Measure the vector stored in the first row.
first_embedding = dataset["embedding"].iloc[0]
embedding_dimension = len(first_embedding)
embedding_dimension

768

### FAISS - Facebook library for efficient similarity search and clustering of dense vectors.

Check [Faiss Indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes) for more information.

In [5]:
# Stack the per-row vectors into one 2-D matrix (one row per sentence).
# Faiss operates on float32 data, so request that dtype explicitly instead of
# relying on numpy's float64 default.
embeddings = np.array(dataset.embedding.to_list(), dtype=np.float32)
embeddings.shape

(1000, 768)

In [6]:
query = "I love soccer"
# get_embeddings returns a plain Python list; keep the query as a float32
# vector so it has the dtype Faiss expects when it is passed to search().
query_embedding = np.asarray(get_embeddings(query), dtype=np.float32)

## IndexFlatL2 - Exact search using L2 (Euclidean) distance
> The more documents you have, the slower the search gets.

In [7]:
from faiss import IndexFlatL2, IndexIVFFlat, IndexIVFPQ

# Create a flat L2 index sized to the embedding dimension computed earlier,
# then display its `is_trained` flag.
index_l2 = IndexFlatL2(embedding_dimension)
index_l2.is_trained

True

In [8]:
# Add the whole embedding matrix to the flat index, then display how many
# vectors the index reports holding.
index_l2.add(embeddings)
index_l2.ntotal

1000

In [9]:
# Faiss searches a batch of queries, so present the single query as a 1-row matrix.
query_row = np.asarray(query_embedding)[np.newaxis, :]
# Distances are discarded; only the positions of the 5 nearest vectors are kept.
_, document_index = index_l2.search(query_row, k=5)
# Row 0 of the result belongs to the only query; look those rows up by position.
dataset.iloc[document_index[0]]

Unnamed: 0,sentence,embedding,id
137,A group of boys is playing soccer on the seashore,"[-0.0064990562, 0.038938403, -0.1637899, -0.05...",138
352,An opponent is tackling a soccer player,"[0.032394014, 0.08220376, -0.15161607, -0.0009...",353
684,A man is punching a soccer ball,"[0.036113564, 0.08290917, -0.10133908, 0.01262...",685
25,A group of men is playing soccer on the beach,"[0.010089901, 0.05418833, -0.15500218, -0.0155...",26
531,Two dogs are running and trying to catch a soc...,"[-0.015107782, 0.028427463, -0.15937233, 0.028...",532


## IndexIVFFlat - Inverted file with exact post-verification

### Voronoi Diagram

Any point within a cell of the Voronoi diagram is closest to the centroid (center) associated with that cell.

A Voronoi diagram partitions space into regions based on the distance to a specific set of points, known as centers or generators.

This algorithm allows you to efficiently find the nearest point to your search criteria without having to compare all possible embeddings—only the closest region needs to be considered.

![](./images/ivf.png)

In [10]:
# Number of centroids (the Voronoi cells described in this section's text).
n_centroids = 20
# Flat L2 index handed to IndexIVFFlat as its quantizer.
quantizer = IndexFlatL2(embedding_dimension)
index_ivf = IndexIVFFlat(quantizer, embedding_dimension, n_centroids)
# Display the flag before train() has been called on this index.
index_ivf.is_trained

False

In [11]:
# Train the IVF index on the full embedding matrix, then display the flag again.
index_ivf.train(embeddings)
index_ivf.is_trained

True

In [12]:
# Add the whole embedding matrix to the trained IVF index, then display how
# many vectors the index reports holding.
index_ivf.add(embeddings)
index_ivf.ntotal

1000

In [13]:
_, document_index = index_ivf.search(np.expand_dims(query_embedding, axis=0), k=5)
# When the probed cells hold fewer than k vectors, Faiss fills the remaining
# slots with -1. Drop those so iloc does not silently return the last row.
hit_positions = document_index[0][document_index[0] >= 0]
dataset.iloc[hit_positions]

Unnamed: 0,sentence,embedding,id
630,A group of women is playing with a ball on the...,"[0.06005692, 0.072284855, -0.18046266, -0.0082...",631
980,The cheerleaders are parading and wearing unif...,"[-0.026352186, 0.019464308, -0.19358607, 0.021...",981
88,Some runners are competing in a race,"[-0.01448306, 0.07064765, -0.19415435, 0.02815...",89
675,Some racers are swimming in a lake,"[-0.033334747, 0.07690886, -0.16447462, 0.0249...",676
162,Some cheerleaders are taking a break,"[-0.0022243182, 0.00015541063, -0.21803565, 0....",163


> By raising `nprobe` we can extend the search to the cells neighbouring the closest one.

In [14]:
# Visit 5 cells per query from now on.
index_ivf.nprobe = 5
_, document_index = index_ivf.search(np.expand_dims(query_embedding, axis=0), k=5)
# When the probed cells hold fewer than k vectors, Faiss fills the remaining
# slots with -1. Drop those so iloc does not silently return the last row.
hit_positions = document_index[0][document_index[0] >= 0]
dataset.iloc[hit_positions]

Unnamed: 0,sentence,embedding,id
137,A group of boys is playing soccer on the seashore,"[-0.0064990562, 0.038938403, -0.1637899, -0.05...",138
950,A soccer player is sitting on the field and is...,"[0.006486396, 0.07249025, -0.1620764, -0.01017...",951
630,A group of women is playing with a ball on the...,"[0.06005692, 0.072284855, -0.18046266, -0.0082...",631
292,The little kid is playing football and falling...,"[0.06897718, 0.007680238, -0.15267676, 0.02927...",293
152,A cold cyclist is celebrating,"[0.02827957, 0.08175549, -0.19929297, 0.039235...",153


# Final optimization
## IndexIVFPQ - IVF + Product Quantizer (PQ)

![IndexIVFPQ - IVF + Product Quantizer (PQ)](./images/ivf-pq.png)

In [15]:
# Passed as the 4th and 5th constructor arguments below.
# NOTE(review): in Faiss's IndexIVFPQ signature these positions appear to be
# M (number of sub-quantizers) and nbits (bits per sub-quantizer code) —
# confirm the variable names match that meaning.
code_size = 8
bits_per_centroid = 4

# `quantizer` is the same object that was handed to `index_ivf` earlier in the
# notebook, so both indexes refer to one quantizer.
# NOTE(review): confirm that sharing it is intended.
index_ifv_pq = IndexIVFPQ(quantizer, embedding_dimension, n_centroids, code_size, bits_per_centroid)

# Display the flag before train() has been called on this index.
index_ifv_pq.is_trained

False

In [16]:
# Train the IVF+PQ index on the embedding matrix, add the same vectors to it,
# then display how many vectors the index reports holding.
index_ifv_pq.train(embeddings)
index_ifv_pq.add(embeddings)
index_ifv_pq.ntotal

1000

In [17]:
# Visit 5 cells per query.
index_ifv_pq.nprobe = 5
_, document_index = index_ifv_pq.search(np.expand_dims(query_embedding, axis=0), k=5)
# When the probed cells hold fewer than k vectors, Faiss fills the remaining
# slots with -1. Drop those so iloc does not silently return the last row.
hit_positions = document_index[0][document_index[0] >= 0]
dataset.iloc[hit_positions]

Unnamed: 0,sentence,embedding,id
980,The cheerleaders are parading and wearing unif...,"[-0.026352186, 0.019464308, -0.19358607, 0.021...",981
385,A lot of people are in an ice skating park,"[-0.006656262, 0.1322621, -0.17773533, 0.04998...",386
979,The crowd is watching a football game,"[-0.007824613, 0.08482189, -0.17698936, -0.023...",980
483,Some teenage girls are dancing for the camera,"[-0.016949473, -0.0032891117, -0.1651635, 0.05...",484
702,Some teenage girls are dancing for the camera,"[-0.016949473, -0.0032891117, -0.1651635, 0.05...",703
