Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions src/python_bindings/HNSW_iterator_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "86973d44",
"metadata": {},
"outputs": [],
"source": [
"from VecSim import *\n",
"import numpy as np\n",
"\n",
"dim = 100\n",
"num_elements = 100000\n",
"M = 32\n",
"efConstruction = 200\n",
"efRuntime = 200\n",
"\n",
"# Create a hnsw index for vectors of 100 floats. Use 'L2' as the distance metric\n",
"hnswparams = HNSWParams()\n",
"hnswparams.M = M\n",
"hnswparams.efConstruction = efConstruction\n",
"hnswparams.initialCapacity = num_elements\n",
"hnswparams.efRuntime = efRuntime\n",
"hnswparams.dim = dim\n",
"hnswparams.type = VecSimType_FLOAT32\n",
"hnswparams.metric = VecSimMetric_L2\n",
"\n",
"hnsw_index = HNSWIndex(hnswparams)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03ff62d4",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Add 100k random vectors and insert then to the index\n",
"data = np.float32(np.random.random((num_elements, dim)))\n",
"vectors = []\n",
"\n",
"for i, vector in enumerate(data):\n",
" hnsw_index.add_vector(vector, i)\n",
" vectors.append((i, vector))\n",
"\n",
"print(f'Index size: {hnsw_index.index_size()}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc831b57",
"metadata": {},
"outputs": [],
"source": [
"# Create a random query vector\n",
"hnsw_index.set_ef(300)\n",
"query_data = np.float32(np.random.random((1, dim)))\n",
"\n",
"# Create batch iterator for this query vector\n",
"batch_iterator = hnsw_index.create_batch_iterator(query_data)\n",
"returned_results_num = 0\n",
"accumulated_labels = []\n",
"total_time = 0\n",
"\n",
"from scipy import spatial\n",
"\n",
"# Sort distances of every vector from the target vector and get the actual order\n",
"dists = [(spatial.distance.euclidean(query_data, vec), key) for key, vec in vectors]\n",
"dists = sorted(dists)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3c3fbee",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Get the next best results\n",
"import time\n",
"\n",
"start = time.time()\n",
"batch_size = 100\n",
"labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE)\n",
"total_time += time.time()-start\n",
"\n",
"print (f'Results in rank {returned_results_num}-{returned_results_num+len(labels[0])} are: \\n')\n",
"print (f'scores: {distances}\\n')\n",
"print (f'labels: {labels}')\n",
"\n",
"returned_results_num += len(labels[0])\n",
"accumulated_labels.extend(labels[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d79d0bb9",
"metadata": {},
"outputs": [],
"source": [
"# Measure recall and time\n",
"\n",
"keys = [key for _, key in dists[:returned_results_num]]\n",
"correct = len(set(accumulated_labels).intersection(set(keys)))\n",
"\n",
"print(f'Total search time: {total_time}')\n",
"print(f'Recall for {returned_results_num} results in index of size {num_elements} with dim={dim} is: ', correct/returned_results_num)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8328da4b",
"metadata": {},
"outputs": [],
"source": [
"# Comapre to \"stadnrd\" KNN search\n",
"\n",
"start = time.time()\n",
"labels_knn, distances_knn = hnsw_index.knn_query(query_data, returned_results_num)\n",
"print(f'Total search time: {time.time() - start}')\n",
"\n",
"keys = [key for _, key in dists[:returned_results_num]]\n",
"correct = len(set(labels_knn[0]).intersection(set(keys)))\n",
"print(f'Recall for {returned_results_num} results in index of size {num_elements} with dim={dim} is: ', correct/returned_results_num)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7925ecf6",
"metadata": {},
"outputs": [],
"source": [
"# Run batches until depleted\n",
"batch_iterator.reset()\n",
"returned_results_num = 0\n",
"batch_size = 100\n",
"start = time.time()\n",
"while(batch_iterator.has_next()):\n",
" labels, distances = batch_iterator.get_next_results(batch_size, BY_ID)\n",
" returned_results_num += len(labels[0])\n",
"\n",
"print(f'Total results returned: {returned_results_num}\\n')\n",
"print(f'Total search time: {time.time() - start}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:vsim] *",
"language": "python",
"name": "conda-env-vsim-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}