In [2]:
# Embedding example
def read_quotes() -> list[str]:
    """
    Read the quotes file, one quote per line.

    Returns
    -------
    list[str]
        The lines of ``data/rick_and_morty_quotes.txt`` in file order. Each
        line keeps its trailing newline, exactly as ``readlines`` returns it.
    """
    # Decode explicitly as UTF-8: the quotes contain non-ASCII characters
    # (e.g. the em dash), and the default encoding used by open() depends on
    # the platform locale.
    with open("data/rick_and_morty_quotes.txt", "r", encoding="utf-8") as fh:
        return fh.readlines()

In [3]:
# Load every quote into memory, then preview the first three entries.
rick_and_morty_quotes = read_quotes()
rick_and_morty_quotes[:3]

["Losers look stuff up while the rest of us are carpin' all them diems.\n",
 "He's not a hot girl. He can't just bail on his life and set up shop in someone else's.\n",
 "When you are an a—hole, it doesn't matter how right you are. Nobody wants to give you the satisfaction.\n"]

In [4]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [5]:
# Notebook-level model instance; the ad-hoc `model.encode(...)` calls in
# later cells use this object directly.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [8]:
# Encode the same quote with and without its trailing newline, then check
# whether the two embeddings are numerically equal (within np.allclose's
# default tolerances).
emb1, emb2 = model.encode([
 "Losers look stuff up while the rest of us are carpin' all them diems.\n",
 "Losers look stuff up while the rest of us are carpin' all them diems."
])
    
np.allclose(emb1, emb2)

True

In [9]:
# Write a function to generate embeddings from text
from functools import lru_cache
from typing import Union
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'


@lru_cache(maxsize=None)
def _load_model(model_name: str) -> "SentenceTransformer":
    """Build the SentenceTransformer for `model_name` once and reuse it on later calls."""
    return SentenceTransformer(model_name)


def generate_embeddings(input_data: Union[str, list[str]]) -> np.ndarray:
    """
    Encode text into embedding vectors with the model named by `MODEL_NAME`.

    Parameters
    ----------
    input_data : str or list[str]
        A single sentence or a list of sentences to encode.

    Returns
    -------
    np.ndarray
        The array returned by ``SentenceTransformer.encode``. For a list of
        sentences this is a 2-D array with one row per sentence.
    """
    # Constructing a SentenceTransformer is expensive, so the instance is
    # cached per model name instead of being rebuilt on every call.
    encoder = _load_model(MODEL_NAME)
    return encoder.encode(input_data)

In [10]:
# Embed the full list of quotes in a single call.
embeddings = generate_embeddings(rick_and_morty_quotes)

In [11]:
embeddings

array([[ 0.61883414,  0.06881788,  0.44374257, ..., -0.2408206 ,
         0.13083012,  0.16823111],
       [ 0.20167443, -0.01175196, -0.19724423, ..., -0.20020162,
         0.15091497,  0.2699458 ],
       [ 0.101454  , -0.04251512, -0.1338716 , ...,  0.6270057 ,
         0.41396552, -0.05816976],
       ...,
       [ 0.33933112,  0.04986557, -0.07645066, ...,  0.21975014,
         0.3401615 , -0.11520363],
       [ 0.2540498 , -0.29828006, -0.2447474 , ..., -0.03997071,
        -0.38077486,  0.08643651],
       [ 0.3520349 ,  0.16754158, -0.2339558 , ..., -0.04820175,
         0.2062692 ,  0.01106668]], dtype=float32)

In [12]:
# Show the first three quotes next to their embedding vectors
for sentence, embedding in zip(rick_and_morty_quotes[:3], embeddings[:3]):
    # A single print emits the sentence line, the embedding line, and the
    # blank separator line.
    print(f"Sentence: {sentence}\nEmbedding: {embedding}\n")

Sentence: Losers look stuff up while the rest of us are carpin' all them diems.

Embedding: [ 0.61883414  0.06881788  0.44374257 -0.4535783   0.30271524 -0.10784185
  0.4952488  -0.12448788  0.05482467 -0.04262808  0.04789143 -0.3194036
  0.18216977 -0.2719968  -0.14199582 -0.5600973  -0.35566264 -0.44555148
 -0.03909503  0.42247906 -0.4604968   0.2643653   0.16821665  0.3429576
  0.20552614  0.20994815 -0.07352563 -0.02430918 -0.07486229  0.41356152
 -0.09713844 -0.0247083   0.02246373  0.10461529  0.2520531  -0.05957118
  0.02156202  0.24379656  0.20664108 -0.40555912 -0.18285906  0.1392638
 -0.29004842  0.14936344 -0.1748422  -0.22140718 -0.01152976 -0.17155792
  0.25811     0.01463504 -0.05509435  0.02583241  0.0143066  -0.13821092
  0.16159976 -0.5648239   0.4062961   0.08129334  0.18729603 -0.06932857
 -0.17729416 -0.10064969  0.30244097 -0.22056274 -0.20505206  0.13730268
  0.32069153  0.2297926  -0.22806755  0.37576813 -0.17270249 -0.17178828
  0.16163556  0.5295059  -0.1935814

In [13]:
len(embeddings[0])

384

In [15]:
# Inspect the L2 norm of every embedding. The norms are not 1, so the
# embeddings are not unit-normalized; this cell only measures the norms,
# it does not normalize anything.
np.linalg.norm(embeddings, axis=1)

array([5.5780625, 4.802936 , 4.7402267, 5.1419   , 7.088951 , 4.5193706,
       4.1849627, 5.134197 , 4.96831  , 5.241703 , 5.5314946, 4.3252516,
       6.903126 , 5.59264  , 5.204659 , 5.81475  , 6.8994946, 5.7716293,
       6.3855247, 5.230853 , 6.576409 , 5.3486404, 6.0708685, 7.759795 ,
       4.240742 , 4.596544 , 5.9753213, 4.704994 , 5.0277934, 7.6187196,
       5.8399715, 5.674177 , 5.2255936, 6.6308603, 7.2901535, 5.076944 ,
       7.415247 , 5.501258 , 4.718482 , 5.834398 , 4.634559 , 5.4468737,
       5.3290625, 4.7717104, 5.175281 , 5.1571097, 6.241967 , 5.877233 ,
       4.9330583, 7.8839326, 4.924071 , 6.0574813, 4.257353 , 5.084045 ,
       5.624808 , 4.0615   , 5.489658 , 4.0482755], dtype=float32)

In [16]:
query_text = "Are you the cause of your parents' misery?"
# NOTE(review): this uses the notebook-level `model`, not generate_embeddings().
# Both are built from the same model name today; the query and the corpus
# must be embedded by the same model for the distances computed later to be
# meaningful.
query_embedding = model.encode(query_text)

In [17]:
import numpy as np
def euclidean_distance(v1: np.ndarray, v2: np.ndarray) -> Union[float, np.ndarray]:
    """
    Compute the Euclidean (L2) distance between vectors.

    The inputs are subtracted with NumPy broadcasting and the norm is taken
    over the last axis, so a single vector can be compared against a stack
    of vectors in one call.

    Parameters
    ----------
    v1 : np.ndarray
        First vector, or an array whose last axis holds the vector
        components. Any array-like (e.g. a list of numbers) is accepted.
    v2 : np.ndarray
        Second vector or array; must be broadcastable against `v1`.

    Returns
    -------
    float or np.ndarray
        A scalar for two 1-D vectors; otherwise an array holding one distance
        per vector (the broadcast shape without its last axis).
    """
    # np.asarray lets plain Python sequences work as well; existing arrays
    # pass through unchanged.
    diff = np.asarray(v1) - np.asarray(v2)
    # The last axis holds the vector components.
    return np.linalg.norm(diff, axis=-1)

In [18]:
def find_nearest_neighbors(query: np.ndarray,
                           vectors: np.ndarray,
                           k: int = 1) -> np.ndarray:
    """
    Find the indices of the k nearest neighbors of a query vector.

    Parameters
    ----------
    query : np.ndarray
        Query vector.
    vectors : np.ndarray
        Vectors to search, compared against `query` by Euclidean distance.
    k : int, optional
        Number of nearest neighbors to return, by default 1.

    Returns
    -------
    np.ndarray
        Indices into `vectors` of the `k` closest vectors, nearest first.
        If `k` exceeds the number of vectors, all indices are returned.

    Raises
    ------
    ValueError
        If `k` is negative.
    """
    # A negative k would otherwise slice from the end and silently return the
    # wrong set of indices.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    distances = euclidean_distance(query, vectors)
    # A stable sort keeps equally distant vectors in their original order, so
    # ties are resolved deterministically.
    return np.argsort(distances, kind="stable")[:k]

In [19]:
# Indices of the three quotes closest to the query in embedding space.
indices = find_nearest_neighbors(query_embedding, embeddings, k=3)

In [20]:
indices

array([ 9,  6, 22])

In [21]:
# Print the matched quotes, nearest first. Each quote still ends with the
# newline kept by readlines(), so print() leaves a blank line after it.
for i in indices:
    print(rick_and_morty_quotes[i])

You're not the cause of your parents' misery. You're just a symptom of it.

Having a family doesn't mean that you stop being an individual. You know the best thing you can do for the people that depend on you? Be honest with them, even if it means setting them free.

B—h, my generation gets traumatized for breakfast.

