# Notebook: JSON Vector Search Books Example

Adapted from a [notebook](https://github.com/singlestore-labs/devrel-notebook-examples/blob/main/json-vector-search-books-example/json-vector-search-books-example.ipynb) on GitHub by [Yaroslav Demenskyi](https://github.com/demenskyi).

## 1. Install and import packages

In [None]:
!pip install openai numpy pymongo --quiet

In [None]:
import openai
import json
import struct
import requests
from bson import ObjectId
from pymongo import MongoClient
from typing import TypedDict, Optional

## 2. Set constants

In [None]:
import os
import getpass

# Prompt for the key (getpass does not echo the input) and keep it in the
# process environment, where the constants cell reads it.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [None]:
# OpenAI API key, read from the process environment.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Database and collection names used throughout the notebook.
DB_NAME = "bookstore"
SOURCE_COLLECTION_NAME = "books"
EMBEDDINGS_COLLECTION_NAME = "books_with_embedding"
# Number of source books that get an embedding. load_embeddings() rebuilds the
# embeddings collection when this differs from the value stored in "meta".
BOOK_EMBEDDINGS_NUMBER = 50

## 3. Set variables

In [None]:
openai.api_key = OPENAI_API_KEY
# NOTE(review): connection_url_mongo is not defined anywhere in this file —
# presumably it is provided by the notebook environment; confirm before
# running this elsewhere.
client = MongoClient(connection_url_mongo)
db = client.get_database(DB_NAME)
# Collection handles; load_embeddings() rebinds embeddingsCollection after it
# drops and recreates that collection.
sourceCollection = db[SOURCE_COLLECTION_NAME]
embeddingsCollection = db[EMBEDDINGS_COLLECTION_NAME]

## 4. Set helper classes

In [None]:
class Book(TypedDict):
    """Shape of a document in the source books collection."""

    # NOTE(review): create_embedding() parses _id with json.loads and reads
    # its "$oid" key, i.e. it treats the stored value as a JSON string —
    # confirm which representation the source collection really holds.
    _id: dict
    title: str
    subjects: list[str]
    description: str
    price: float
    type: str
    createdAt: str
    updatedAt: str


class BookWithEmbedding(Book):
    """A book document extended with its embedding, as stored for vector search."""

    # Packed floats produced by data_to_binary() (struct.pack), so this is
    # bytes rather than text.
    embedding: bytes


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``ObjectId`` values as their string form."""

    def default(self, o):
        """Return ``str(o)`` for an ``ObjectId``; defer to the base encoder otherwise."""
        return str(o) if isinstance(o, ObjectId) else super().default(o)

## 5. Define helper functions

Function to create an embedding using the OpenAI Embedding API

In [None]:
create_embedding_retries = 0

def create_embedding(data: str | dict) -> list[float]:
    global create_embedding_retries

    if type(data) is dict:
        parsed_id = json.loads(data["_id"])["$oid"]
        data["_id"] = ObjectId(parsed_id)
        data = json.dumps(data, cls=JSONEncoder)

    try:
        data = data.replace("\n", " ")
        response = openai.Embedding.create(input=data, model="text-embedding-ada-002")
        create_embedding_retries = 0
        return response["data"][0]["embedding"]
    except Exception as e:
        if create_embedding_retries < 5:
            print("An error occurred while creating the embedding. Retrying...", "\n", e)
            create_embedding_retries = create_embedding_retries + 1
            return create_embedding(data)
        else:
            print("Maximum retries reached.", "\n", e)

Function to convert a list of floats into binary data

In [None]:
def data_to_binary(data: list[float]):
    """Pack *data* into bytes, one ``struct`` ``"f"`` (C float) per value, in order."""
    packer = struct.Struct(f"{len(data)}f")
    return packer.pack(*data)

Function to validate that a collection exists and is not empty before loading

In [None]:
def validate_collection(name: str):
    """Return True when collection *name* exists in ``db`` and its estimated document count is not zero."""
    exists = name in db.list_collection_names()
    # The count is only requested for a collection that exists.
    return exists and db[name].estimated_document_count() != 0

Function to retrieve the metadata document from the database. Its `last_book_embeddings_number` value is compared with `BOOK_EMBEDDINGS_NUMBER` to decide whether the embeddings need to be reloaded.

In [None]:
def get_meta():
    """Return the result of ``find_one()`` on the ``meta`` collection."""
    meta_collection = db["meta"]
    return meta_collection.find_one()

Function to update the metadata

In [None]:
def update_meta(book_embeddings_number: int):
    """Store *book_embeddings_number* as ``last_book_embeddings_number`` on the document returned by get_meta()."""
    meta_filter = {"_id": get_meta()["_id"]}
    change = {"$set": {"last_book_embeddings_number": book_embeddings_number}}
    db["meta"].update_one(meta_filter, change)

Function for retrieving books from the source collection to be used to create embeddings

In [None]:
def get_books(limit: Optional[int] = None) -> list[Book]:
    """Return the documents of the source collection as a list.

    *limit* is applied to the query only when it is an ``int``; with any other
    value (including the default ``None``) the query is left unlimited.
    """
    cursor = sourceCollection.find()

    if type(limit) is int:
        # Cursor.limit() configures the cursor and hands the same cursor back.
        cursor = cursor.limit(limit)

    return list(cursor)

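Function to prepare the source collection: if it exists and is not empty, the metadata document used to track the embeddings load is inserted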

In [None]:
def load_source():
    """Prepare the source collection for the embeddings load.

    When the source collection exists and is non-empty, make sure the ``meta``
    collection holds a document with ``last_book_embeddings_number``. The
    document is inserted only when none exists: the original inserted a new
    one on every run, so duplicates piled up even though get_meta() and
    update_meta() work on a single document picked by an unfiltered
    ``find_one()``.
    """
    if not validate_collection(SOURCE_COLLECTION_NAME):
        return

    print("Source loading")

    if get_meta() is None:
        # 0 never equals a useful BOOK_EMBEDDINGS_NUMBER, so the first
        # load_embeddings() run builds the embeddings collection.
        db["meta"].insert_one({"last_book_embeddings_number": 0})

Function to create (or reset) the collection with embeddings that is searched with `$dotProduct`. The collection is rebuilt when it is missing or empty, or when the value of `BOOK_EMBEDDINGS_NUMBER` has changed since the last load.

In [None]:
def load_embeddings():
    """Rebuild the embeddings collection when it is missing, empty or stale.

    Nothing happens when the collection already holds documents and the stored
    ``last_book_embeddings_number`` equals ``BOOK_EMBEDDINGS_NUMBER``.
    Otherwise the collection is dropped and recreated, the books returned by
    ``get_books(BOOK_EMBEDDINGS_NUMBER)`` are stored together with their binary
    embeddings, and the meta document is updated.
    """
    global embeddingsCollection

    stored_number = get_meta()["last_book_embeddings_number"]
    is_populated = validate_collection(EMBEDDINGS_COLLECTION_NAME)
    if is_populated and stored_number == BOOK_EMBEDDINGS_NUMBER:
        return

    print("Embeddings loading")

    # Start from an empty collection with a binary "embedding" column.
    embeddingsCollection.drop()
    db.create_collection(EMBEDDINGS_COLLECTION_NAME, columns=[{"id": "embedding", "type": "LONGBLOB NOT NULL"}])
    embeddingsCollection = db[EMBEDDINGS_COLLECTION_NAME]

    documents = []
    for book in get_books(BOOK_EMBEDDINGS_NUMBER):
        # create_embedding() rewrites book["_id"] in place, so it has to run
        # before the book is copied into the new document.
        vector = create_embedding(book)
        packed = data_to_binary(vector)
        documents.append({**book, "embedding": packed})

    embeddingsCollection.insert_many(documents)
    update_meta(BOOK_EMBEDDINGS_NUMBER)

Function to search for books with a string query and `$dotProduct`

In [None]:
def search(query: str):
    """Print and return the five stored books whose embeddings have the highest dot product with *query*.

    The query text is embedded and packed like the stored books; the
    ``embedding`` field is dropped from the returned documents and the score is
    exposed as ``dot``.
    """
    print("Searching:", query)

    query_binary = data_to_binary(create_embedding(query))

    score_stage = {"$addFields": {"dot": {"$dotProduct": ["$embedding", query_binary]}}}
    pipeline = [
        score_stage,
        {"$project": {"embedding": 0}},
        {"$sort": {"dot": -1}},
        {"$limit": 5},
    ]
    result = list(embeddingsCollection.aggregate(pipeline))

    print(json.dumps(result, cls=JSONEncoder, indent=4))

    return result

## 6. Prepare and load collections

In [None]:
# Order matters: load_source() inserts the meta document that
# load_embeddings() reads.
load_source()
load_embeddings()

## 7. Use search

In [None]:
# Prints and returns the five best matches by dot product.
search("Recommend books about the horror genre")