In [1]:
import psycopg2
import psycopg2.extras
from pgvector.psycopg2 import register_vector
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from tqdm.autonotebook import tqdm
import wikipedia

  from tqdm.autonotebook import tqdm


# Motivación

Un caso de uso de bases de datos vectoriales que está en auge es implementar sistemas de *Retrieval Augmnented Generation*, donde a un LLM se le brinda contexto desde una base de conocimientos para que genere una respuesta a una `query` del usuario.

En muchos casos, esa base de conocimientos está en la forma de texto natural. Entonces debemos obtener los documentos mas relevantes (_semánticamente mas cercanos_) para alimentarlos al LLM.

In [2]:
conn = psycopg2.connect("postgresql://utdt:utdt@pgvector:5432/utdt")
conn.autocommit = True
cur = conn.cursor()

# Creamos la extensión pgvector y la registramos
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

In [3]:
# Creamos la tabla
# Notar el tipo `vector(384)`
cur = conn.cursor()

cur.execute("DROP TABLE IF EXISTS documents")

cur.execute(
    """CREATE TABLE documents (
        label text PRIMARY KEY,
        content text,
        embedding vector(384),
        metadata json
    )"""
)

In [4]:
# Cargamos un modelo de embeddings
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")



In [5]:
articles = [
    wikipedia.page("Roman Empire"),
    wikipedia.page("Byzantine Empire"),
    wikipedia.page("Ottoman Empire"),
    wikipedia.page("Roman Republic"),
    wikipedia.page("Western Roman Empire"),
    wikipedia.page("Eastern Roman Empire"),
    wikipedia.page("Constantinople"),
]

# Separamos cada articulo en chunks
splitter = SentenceSplitter(
    chunk_size=128,
    chunk_overlap=32,
)

nodes = splitter.get_nodes_from_documents(
    [
        Document(text=a.content, metadata={"pageid":a.pageid, "revision_id": a.revision_id, "title": a.title})
        for a in articles
    ]
)

In [6]:
# insertamos cada uno de los chunks
import json

for node in tqdm(nodes):
    embedding = model.encode(node.text)
    cur.execute(
        """INSERT INTO documents (
        label,
        content,
        embedding,
        metadata
    ) VALUES (%s, %s, %s, %s)
    """,
    (node.id_, node.text, embedding, json.dumps(node.metadata))
    )

100%|██████████| 1716/1716 [00:35<00:00, 48.98it/s]


Para buscar vecinos a un texto dado, tenemos vaarias opciones:
- `<#>` indica producto interno
    - Si los embeddings están normalizados, es lo mismo que distancia coseno
    - Negativo, dado que postgres solo soporta ordenamiento ascendente para `index scans`
-  `<=>` indica distancia coseno
-  `<+>` indica distancia L1
-  `<->` indica distancia L2

In [7]:
query = "What was the influence of christianity in the late Roman Empire?"
query_embedding = model.encode(query)

cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

In [8]:
%%time
cur.execute(
    """
    SELECT label, content, embedding <=> %(query_embedding)s::vector as distance, metadata
    FROM documents
    WHERE embedding <=> %(query_embedding)s::vector < 0.6
    ORDER BY embedding <=> %(query_embedding)s::vector ASC
    LIMIT 2
    """,
    {"query_embedding": query_embedding},
)

CPU times: user 72 μs, sys: 1.93 ms, total: 2 ms
Wall time: 8.72 ms


In [9]:
neighbors = cur.fetchall()

In [10]:
for neighbor in neighbors:
    print(neighbor)
    print()

RealDictRow([('label', '12d96458-a7c3-4808-8acc-c8d0733933b6'), ('content', 'Christianity emerged in Roman Judaea as a Jewish religious sect in the 1st century and gradually spread out of Jerusalem throughout the Empire and beyond. Imperially authorized persecutions were limited and sporadic, with martyrdoms occurring most often under the authority of local officials. Tacitus reports that after the Great Fire of Rome in AD 64, the emperor attempted to deflect blame from himself onto the Christians.'), ('distance', 0.33045442933386926), ('metadata', {'pageid': '25507', 'revision_id': 1228169490, 'title': 'Roman Empire'})])

RealDictRow([('label', 'd3c2b25d-0b9c-449e-940c-5f4bbb642bf5'), ('content', 'The Byzantine state inherited from pagan times the administrative and financial routine of administering religious affairs, and this was applied to the Christian Church. Following the pattern set by Eusebius of Caesarea, the Byzantines viewed the emperor as a representative or messenger of C

Hasta acá estamos haciendo búsqueda *exacta* de vecinos, que tiene *recall perfecto*.

Pero podemos agregar *indices* para hacer busqueda aproximada de vecinos para que sea más rápido, a costa de tener menor recall.

Releyendo el párrafo anterior, pensemos: cuando agregamos un índice "normal", los resultados son los mismos. ¿Es el caso acá?

In [11]:
# Se debe crear un indice especifico para la funcion de distancia que se use 
cur.execute("CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)")

In [12]:
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

In [13]:
%%time
cur.execute(
    """
    SELECT label, content, embedding <=> %(query_embedding)s::vector as distance, metadata
    FROM documents
    WHERE embedding <=> %(query_embedding)s::vector < 0.6
    ORDER BY embedding <=> %(query_embedding)s::vector ASC
    LIMIT 2
    """,
    {"query_embedding": query_embedding},
)

CPU times: user 0 ns, sys: 1.27 ms, total: 1.27 ms
Wall time: 2.25 ms


In [14]:
neighbors = cur.fetchall()

In [15]:
for neighbor in neighbors:
    print(neighbor)
    print()

RealDictRow([('label', '12d96458-a7c3-4808-8acc-c8d0733933b6'), ('content', 'Christianity emerged in Roman Judaea as a Jewish religious sect in the 1st century and gradually spread out of Jerusalem throughout the Empire and beyond. Imperially authorized persecutions were limited and sporadic, with martyrdoms occurring most often under the authority of local officials. Tacitus reports that after the Great Fire of Rome in AD 64, the emperor attempted to deflect blame from himself onto the Christians.'), ('distance', 0.33045442933386926), ('metadata', {'pageid': '25507', 'revision_id': 1228169490, 'title': 'Roman Empire'})])

RealDictRow([('label', '37c48fac-5028-4f3c-858b-d155e60cdd9b'), ('content', 'The Byzantine state inherited from pagan times the administrative and financial routine of administering religious affairs, and this was applied to the Christian Church. Following the pattern set by Eusebius of Caesarea, the Byzantines viewed the emperor as a representative or messenger of C

# Ejercicios
1. Calcular la distancia entre cada par de articulos, tomando el promedio de los embeddings de los chunks de cada articulo como su embedding.
2. Buscar los pasajes más relevantes sobre el asesinato de Julio César utilizando solamente los articulos que contengan "Roman" en su titulo.
3. Obtenga los 20 pasajes mas relevantes referidos a las rutas económicas de la República Romana. Re-rankeelos por su distancia promedio a los otros 19 pasajes.
4. Cree una nueva columna `norm_embedding` con los embeddings normalizados. Agregue un índice de producto interno y repita alguno de los puntos anteriores. ¿Cambiaron los resultados?

# Referencias
- https://github.com/pgvector/pgvector