In [2]:
import os

import psycopg2
import psycopg2.extras
import wikipedia
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from pgvector.psycopg2 import register_vector
from tqdm.autonotebook import tqdm

# Fetch the Wikipedia article by title (performs a network request).
# NOTE(review): `rome` is not referenced by the cells visible here - confirm a
# later cell uses it.
ROME_PAGE_TITLE = "Roman Empire"
rome = wikipedia.page(ROME_PAGE_TITLE)

In [3]:
# Connection string for the PostgreSQL instance that hosts the vector data.
PG_DSN = "postgresql://utdt:utdt@pgvector:5432/utdt"

conn = psycopg2.connect(PG_DSN)
conn.autocommit = True  # each statement is committed as soon as it executes

cur = conn.cursor()

# Make sure the `vector` extension is installed before the psycopg2 adapter is
# registered on this connection; the statement order below is intentional.
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

register_vector(conn)

In [4]:
# (Re)create the `documents` table that stores one row per text chunk.
#
# This cell deliberately reuses the `conn` / `cur` objects opened in the
# connection cell above instead of connecting again: opening a second
# connection here would leak the first one and would leave `conn` bound to a
# connection on which register_vector() was never called.
cur.execute("DROP TABLE IF EXISTS documents")
cur.execute(
    """CREATE TABLE documents (
    label text PRIMARY KEY,
    content text,
    embedding vector(1536),
    metadata json
)"""
)
# The vector(1536) width has to match the `dimensions` value configured on the
# embedding model, otherwise inserts are rejected by the column type.

In [31]:
# Read the API key from the environment so that it is never stored in the
# notebook file, and fail early with a clear message when it is missing
# (an empty key would only surface later as an authentication error).
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Embedding client shared by the ingestion and the search cells.
openai_embedding = OpenAIEmbedding(
    model="text-embedding-3-small",
    embed_batch_size=20,
    dimensions=1536,  # must equal the vector(1536) width of documents.embedding
    api_key=openai_api_key,
    timeout=60,
)

In [None]:
# Download the dataset into ./data; the call returns the labelled examples
# (used later for the sample query) and the source documents.
rag_dataset, documents = download_llama_dataset(
    "PaulGrahamEssayDataset", "./data"
)

# Split the documents into chunks; every chunk becomes one row of `documents`.
splitter = SentenceSplitter(
    chunk_size=256,
    chunk_overlap=20,
)

nodes = splitter.get_nodes_from_documents(documents)

# Embed all chunks through the batch API so requests are grouped according to
# the client's `embed_batch_size` instead of one HTTP call per chunk.
embeddings = openai_embedding.get_text_embedding_batch(
    [node.text for node in nodes],
    show_progress=True,
)

# Bulk-insert the rows in pages rather than one round trip per chunk. The
# embeddings are plain Python lists, so they are cast to `vector` explicitly,
# the same way the search query does.
psycopg2.extras.execute_values(
    cur,
    "INSERT INTO documents (label, content, embedding) VALUES %s",
    [
        (node.id_, node.text, embedding)
        for node, embedding in zip(nodes, embeddings)
    ],
    template="(%s, %s, %s::vector)",
)

# Build the ANN index after loading the data. The operator class has to match
# the distance operator used at query time: the search cell orders by `<=>`
# (cosine distance), which is served by vector_cosine_ops, not vector_l2_ops.
cur.execute("CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)")

In [40]:
# Use the question of the first labelled example as the sample search query.
first_example = rag_dataset[0]
query = first_example.query
print(query)

In the essay, the author mentions his early experiences with programming. Describe the first computer he used for programming, the language he used, and the challenges he faced.


In [39]:
# Embed the question with the same client that produced the stored vectors.
query_embedding = openai_embedding.get_text_embedding(query)

# Rebind `cur` to a cursor whose rows are dict-like, keyed by column name.
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

# Ten closest chunks, ranked by the `<=>` distance to the query embedding.
nearest_sql = """
    SELECT label, content, embedding <=> %(query_embedding)s::vector as distance
    FROM documents
    ORDER BY embedding <=> %(query_embedding)s::vector ASC
    LIMIT 10
    """
sql_params = {"query_embedding": query_embedding}

cur.execute(nearest_sql, sql_params)
neighbors = cur.fetchall()

for row in neighbors:
    print(row)

RealDictRow([('label', '602ec7b7-a300-4cb1-a3ef-9e71d806d3bf'), ('content', 'What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines â\x80\x94 CPU, disk drives, printer, card reader â\x80\x94 sitting up on a raised floor under bright fluorescent lights.\n\nThe language 