<img src = "https://github.com/VeryFatBoy/notebooks/blob/main/common/images/img_github_singlestore-jupyter_featured_2.png?raw=true">

<div id="singlestore-header" style="display: flex; background-color: rgba(235, 249, 245, 0.25); padding: 5px;">
    <div id="icon-image" style="width: 90px; height: 90px;">
        <img width="100%" height="100%" src="https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/browser.png" />
    </div>
    <div id="text" style="padding: 5px; margin-left: 10px;">
        <div id="badge" style="display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%">SingleStore Notebooks</div>
        <h1 style="font-weight: 500; margin: 8px 0 0 4px;">SingleStore Kai support for MongoDB $vectorSearch</h1>
    </div>
</div>

In [4]:
# Empty pip's local cache before installing anything.
# NOTE(review): presumably this forces fresh downloads for the install below — confirm it is still needed.
!pip cache purge

Files removed: 8


In [5]:
!pip install tabulate --quiet

In [6]:
from tabulate import tabulate
import pandas as pd
import pymongo

In [7]:
# Location of the iris CSV; the raw URL embeds a specific gist revision hash.
url = (
    "https://gist.githubusercontent.com/VeryFatBoy/9af771d443f5ec4dd6eec8d69a062638"
    "/raw/c03ef25a97f23a48ee408ac02114195b663a2364/iris.csv"
)

# Download and parse the CSV straight into a DataFrame.
pandas_df = pd.read_csv(url)

In [8]:
def _feature_vector(row):
    """Collect the four measurement columns of one row into a list."""
    return [
        row[name]
        for name in ("sepal_length", "sepal_width", "petal_length", "petal_width")
    ]

# Adds a "vector" column to pandas_df itself: one 4-element list per row.
pandas_df["vector"] = pandas_df.apply(_feature_vector, axis = 1)

# Keep only the vector and its species label.
new_df = pandas_df[["vector", "species"]]

new_df.head()

Unnamed: 0,vector,species
0,"[5.1, 3.5, 1.4, 0.2]",Iris-setosa
1,"[4.9, 3.0, 1.4, 0.2]",Iris-setosa
2,"[4.7, 3.2, 1.3, 0.2]",Iris-setosa
3,"[4.6, 3.1, 1.5, 0.2]",Iris-setosa
4,"[5.0, 3.6, 1.4, 0.2]",Iris-setosa


In [9]:
# One dict per row ({"vector": [...], "species": ...}), the shape insert_many takes.
records = new_df.to_dict("records")

In [10]:
# Vector width, measured on the row labelled 0 (four iris measurements per vector).
dimensions = len(new_df.loc[0, "vector"])

In [11]:
%%sql
-- Start from a clean slate: remove any existing iris_db, then create it again.
DROP DATABASE IF EXISTS iris_db;
CREATE DATABASE IF NOT EXISTS iris_db;

<div class="alert alert-block alert-warning">
    <b class="fa fa-solid fa-exclamation-circle"></b>
    <div>
        <p><b>Action Required</b></p>
        <p>Select the <b>iris_db</b> database from the drop-down menu at the top of this notebook. Doing so updates the <b>connection_url_kai</b> variable, which is used to connect to the selected database.</p>
    </div>
</div>

In [14]:
# connection_url_kai is not defined in this notebook's code; the notebook
# environment supplies it once a database has been selected.
client = pymongo.MongoClient(connection_url_kai)

# Handles for the database and the collection used by the remaining cells.
db = client.get_database("iris_db")
collection = db.get_collection("iris")

In [15]:
# Remove any existing "iris" collection so it can be created afresh below.
collection.drop()

In [16]:
# Show which collections the database holds right after the drop.
db.list_collection_names()

[]

In [17]:
# Declare "vector" as a typed, non-nullable VECTOR column sized to the data.
vector_column = {
    "id": "vector",
    "type": f"VECTOR({dimensions}) NOT NULL",
}

# The trailing semicolon suppresses the returned object's repr in the output.
db.create_collection("iris", columns = [vector_column]);

In [18]:
# Confirm that the "iris" collection now exists.
db.list_collection_names()

['iris']

In [19]:
# Insert every record in a single bulk call; the return value is kept in `result`.
result = collection.insert_many(records)

In [20]:
# An empty filter matches every document in the collection.
total_count = collection.count_documents(filter = {})
print(f"Total number of documents in the collection: {total_count}")

Total number of documents in the collection: 150


In [21]:
# Fetch five documents, excluding the _id field from each.
cursor = collection.find(projection = {"_id": 0}).limit(5)

# One table row per document: the vector rounded to two decimals, then the species.
table = [
    [[round(value, 2) for value in document["vector"]], document["species"]]
    for document in cursor
]

print(tabulate(table, headers = ["vector", "species"]))

vector                species
--------------------  ---------------
[5.0, 3.2, 1.2, 0.2]  Iris-setosa
[4.9, 2.4, 3.3, 1.0]  Iris-versicolor
[6.6, 2.9, 4.6, 1.3]  Iris-versicolor
[4.9, 2.5, 4.5, 1.7]  Iris-virginica
[5.4, 3.4, 1.5, 0.4]  Iris-setosa


In [22]:
# Definition of the index on the "vector" field: Euclidean distance metric,
# index type left to AUTO, dimensionality taken from the data.
vector_index = {
    "key": {"vector": "vector"},
    "name": "vector_index",
    "kaiIndexOptions": {
        "index_type": "AUTO",
        "metric_type": "EUCLIDEAN_DISTANCE",
        "dimensions": dimensions
    }
}

# Issue the raw createIndexes command; the trailing semicolon hides its reply.
db.command({"createIndexes": "iris", "indexes": [vector_index]});

In [24]:
# Query point, in the same column order used to build the stored vectors:
# sepal_length, sepal_width, petal_length, petal_width.
query_vector = [
    5.2,
    3.6,
    1.5,
    0.3,
]

In [25]:
# Stage 1: search the "vector" field through vector_index, returning 5 matches.
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "path": "vector",
        "queryVector": query_vector,
        "limit": 5
    }
}

# Stage 2: keep only the species and the search score exposed as metadata.
project_stage = {
    "$project": {
        "_id": 0,
        "species": 1,
        "score": {
            "$meta": "vectorSearchScore"
        }
    }
}

pipeline = [vector_search_stage, project_stage]

# Run the aggregation pipeline against the collection.
cursor = collection.aggregate(pipeline)

# One table row per returned document: score first, then species.
table = [[document["score"], document["species"]] for document in cursor]

print(tabulate(table, headers = ["score", "species"]))

   score  species
--------  -----------
0.141421  Iris-setosa
0.173205  Iris-setosa
0.173205  Iris-setosa
0.173205  Iris-setosa
0.2       Iris-setosa
