In [19]:
import sqlite3
import sqlite_vec
import hashlib
import json
import numpy as np
from typing import List

In [4]:
# Step 1 - Connect to SQLite and load the sqlite-vec extension
conn = sqlite3.connect("demo.db")
# Extension loading is off by default in sqlite3. Enable it only for as long
# as it takes to load sqlite-vec, then switch it back off so nothing else can
# load an extension through this connection later in the session.
conn.enable_load_extension(True)
sqlite_vec.load(conn)
conn.enable_load_extension(False)
cur = conn.cursor()

In [5]:
# Step 2 - Create tables
cur.execute("""
CREATE TABLE IF NOT EXISTS docs (
    id INTEGER PRIMARY KEY,
    content TEXT NOT NULL
);
""")

# Demo corpus as (id, content) pairs. Kept as a module-level name because the
# embedding step iterates over it as well.
docs = [
    (1, "The quick brown fox jumps over the lazy dog"),
    (2, "A fast auburn fox leaps above a sleepy canine"),
    (3, "An article about database systems and vector search"),
    (4, "Deep learning and embeddings for natural language processing"),
]

# Seed the table only when it is empty so re-running this cell is idempotent.
cur.execute("SELECT COUNT(*) FROM docs;")
docs_count = cur.fetchone()[0]
if docs_count == 0:
    cur.executemany("INSERT INTO docs(id, content) VALUES (?, ?);", docs)
    # Commit here instead of relying on a later cell: the only other commit in
    # the notebook is conditional, so without this the seed rows could remain
    # in an open transaction and be lost when the connection closes.
    conn.commit()

In [7]:
def embed_text(text: str, dim: int = 8) -> List[float]:
    """Return a repeatable pseudo-embedding of ``text`` (demo use only).

    The UTF-8 bytes of ``text`` are hashed with SHA-256. Component ``i`` is
    built from the two digest bytes at offset ``2 * i`` (wrapping around the
    digest length), read as a big-endian 16-bit integer and rescaled linearly
    from [0, 65535] to [-1, 1].

    Args:
        text: Input string to embed.
        dim: Number of components to produce. Because the offset wraps around
            the digest, components start repeating once the digest bytes are
            exhausted.

    Returns:
        A list of ``dim`` floats, each in the range [-1, 1].
    """
    digest = hashlib.sha256(text.encode("utf-8")).digest()
    size = len(digest)

    def component(index: int) -> float:
        # The offset is always even and the digest length is even, so the
        # two-byte slice never straddles the wrap-around point.
        offset = (index * 2) % size
        word = int.from_bytes(digest[offset:offset + 2], "big")
        return (word / 65535.0) * 2.0 - 1.0

    return [component(index) for index in range(dim)]

In [8]:
# Step 3 - Add vector embeddings
cur.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS vec_docs USING vec0(
    embedding FLOAT[8]
);
""")

# Populate the vector table only on first run; each vector is stored under the
# rowid of its source document and passed to SQLite as a JSON array string.
vec_count = cur.execute("SELECT COUNT(*) FROM vec_docs;").fetchone()[0]
if vec_count == 0:
    rows = [
        (doc_id, json.dumps(embed_text(content, dim=8)))
        for doc_id, content in docs
    ]
    cur.executemany("INSERT INTO vec_docs(rowid, embedding) VALUES (?, ?);", rows)
    conn.commit()

In [None]:
# Inspect a few stored vectors. Iterate the result directly instead of
# assigning it to `cur`, so the shared cursor used by the other cells is not
# silently rebound by this inspection cell.
for rowid, blob in cur.execute("SELECT rowid, embedding FROM vec_docs LIMIT 5;").fetchall():
    # The embedding column is read back as raw bytes and decoded as float32.
    vec = np.frombuffer(blob, dtype=np.float32)
    print(f"id={rowid}, vec={vec[:8]}") # Print first 8 dimensions

id=1, vec=[ 0.68484014  0.9664301  -0.93875027  0.00453193 -0.17351034  0.20888075
  0.37526512 -0.63820857]
id=2, vec=[-0.10630961  0.9881895   0.983032   -0.3600061  -0.8277867   0.31184864
 -0.8834821  -0.96121156]
id=3, vec=[-0.5786679  -0.48274967 -0.9851072   0.9522393   0.95977724  0.27382314
  0.81451136 -0.19572747]
id=4, vec=[-0.6383917   0.20823987 -0.6133974  -0.80352485  0.38023958 -0.9358206
 -0.67974365  0.78530556]


In [9]:
# Step 4 - Test query
# NOTE: embed_text is hash-based, so the distances below only demonstrate the
# query mechanics; they do not reflect semantic similarity between texts.
query = "fox dog"
text_query = query
query_vec = embed_text(text_query, dim=8)
query_vec_json = json.dumps(query_vec)
top_k = 2  # number of nearest neighbours to return

# Use the `k = ?` constraint rather than LIMIT to bound the KNN search: the
# LIMIT form is only understood by vec0 on newer SQLite builds, while `k`
# works regardless of the SQLite version bundled with Python.
res = cur.execute(
    """
    SELECT rowid, distance
    FROM vec_docs
    WHERE embedding MATCH ?
      AND k = ?
    ORDER BY distance;
    """,
    (query_vec_json, top_k)
).fetchall()

for rowid, distance in res:
    print(f"- rowid={rowid}  distance={float(distance):.12f}")

- rowid=3  distance=1.915137887001
- rowid=1  distance=2.211798191071
