In [1]:
import sqlite3
import numpy as np
from sentence_transformers import SentenceTransformer
import os

  from .autonotebook import tqdm as notebook_tqdm
  if not hasattr(np, "object"):





In [2]:
# Identifier of the sentence-transformers model used for every encode() call
# in this notebook (both the stored chunks and the search queries).
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [3]:
# Sample recipe sentences that serve as the corpus to embed and search.
# `sample_texts` is kept as a second name bound to the same list object.
texts = sample_texts = [
    "Classic chocolate chip cookies recipe.",
    "Ingredients: 2 cups flour, 1 cup butter.",
    "Add 1 cup sugar and 2 eggs.",
    "Mix in 1 tsp vanilla extract.",
    "Stir in 2 cups chocolate chips.",
    # Degree sign restored: it had been mis-decoded into the two characters
    # "Â°" (UTF-8 bytes read as Latin-1), which polluted the embedded text.
    "Preheat oven to 375°F.",
    "Scoop dough onto baking sheet.",
    "Bake for 10-12 minutes until golden.",
]

In [4]:
# Label stored with every chunk and used as the prefix of each chunk id.
document_name = "sample_doc"
# SQLite database file that the embeddings are written to.
db_path = "embeddings.db"

In [5]:
# Encode every chunk in one call, then build the rows to insert into SQLite.
embeddings = model.encode(texts)

# One row per chunk: (document_name, chunk_id, text, embedding bytes).
# Chunk ids are 1-based ("<document_name>_chunk_1", ...). Vectors are cast to
# float32 and serialised as raw bytes so they can be stored in a BLOB column.
embedding_data = [
    (
        document_name,
        f"{document_name}_chunk_{chunk_no}",
        chunk_text,
        vector.astype(np.float32).tobytes(),
    )
    for chunk_no, (chunk_text, vector) in enumerate(zip(texts, embeddings), start=1)
]
    

In [6]:
# Persist the chunk rows to SQLite. chunk_id is the primary key, so re-running
# this cell replaces existing rows instead of failing or duplicating them.
conn = sqlite3.connect(db_path)
try:
    # `with conn` wraps the statements in a transaction: it commits when the
    # block succeeds and rolls back if any statement raises. It does NOT close
    # the connection, hence the surrounding try/finally.
    with conn:
        c = conn.cursor()
        c.execute('''
CREATE TABLE IF NOT EXISTS embeddings (
    document_name TEXT,
    chunk_id TEXT PRIMARY KEY,
    text TEXT,
    embedding_vector BLOB
)
''')
        # Name the columns explicitly so the insert does not silently depend
        # on the physical column order of an already-existing table.
        c.executemany(
            'INSERT OR REPLACE INTO embeddings '
            '(document_name, chunk_id, text, embedding_vector) '
            'VALUES (?, ?, ?, ?)',
            embedding_data,
        )
finally:
    # Always release the file handle, even when table creation or insert fails.
    conn.close()

In [7]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (NumPy array or array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a Python ``float``. If either
        vector has zero norm the similarity is undefined, and ``0.0`` is
        returned instead of dividing by zero.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    # Compute each norm once; they are needed for both the zero check and
    # the final division.
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Coerce to a built-in float so both branches return the same type
    # (the zero-norm branch already returns a plain float).
    return float(np.dot(a, b) / (norm_a * norm_b))

In [8]:
def find_similar_chunks(query_text, db_path='embeddings.db', top_k=3):
    """Return the stored chunks most similar to ``query_text``.

    The query is encoded with the module-level ``model``. Every row of the
    ``embeddings`` table is then loaded and scored against it with
    ``cosine_similarity``, so the cost grows linearly with the table size.

    Args:
        query_text: Text to search for.
        db_path: Path of the SQLite database holding the ``embeddings`` table.
        top_k: Maximum number of matches to return. ``None`` returns every
            row; zero or a negative value returns an empty list.

    Returns:
        A list of ``(similarity, text)`` tuples sorted by similarity, highest
        first, with at most ``top_k`` entries.

    Raises:
        sqlite3.Error: If the database cannot be read (for example, the
            ``embeddings`` table does not exist).
    """
    # A negative top_k would otherwise be treated as a slice bound and drop
    # rows from the end of the ranking instead of returning nothing.
    if top_k is not None and top_k <= 0:
        return []

    query_emb = model.encode([query_text])[0]

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            'SELECT text, embedding_vector FROM embeddings'
        ).fetchall()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    # Blobs are decoded as float32, matching how the vectors were serialised
    # (astype(np.float32).tobytes()) before being inserted.
    similarities = [
        (cosine_similarity(query_emb, np.frombuffer(emb_blob, dtype=np.float32)), text)
        for text, emb_blob in rows
    ]

    # Sort on the score only (stable), so ties keep their database order.
    similarities.sort(key=lambda scored: scored[0], reverse=True)
    return similarities[:top_k]

In [9]:
# In-memory (text, embedding) pairs, one per chunk, in corpus order.
stored_data = list(zip(texts, embeddings))

In [10]:
# Free-text search query used in the similarity lookup below.
query = "sweet baking ingredients"

In [11]:
# Look up the best-matching stored chunks for `query` and print one line per
# match, with the similarity score formatted to four decimal places.
top_similar = find_similar_chunks(query)
print(f"\nExample 2: Top similar chunks to '{query}':")
for score, chunk_text in top_similar:
    print(f"  Similarity: {score:.4f} | Text: {chunk_text}")


Example 2: Top similar chunks to 'sweet baking ingredients':
  Similarity: 0.5065 | Text: Ingredients: 2 cups flour, 1 cup butter.
  Similarity: 0.4835 | Text: Classic chocolate chip cookies recipe.
  Similarity: 0.4317 | Text: Bake for 10-12 minutes until golden.
