# Milvus DB

## Initialize a connection to Milvus

First, you need to pull and run a Milvus Docker image:
```
docker pull ...
```

Then we define the `SimpleVectorDB` wrapper class that the rest of this notebook uses to talk to Milvus.


In [1]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    MilvusClient
)
import numpy as np

import time

In [2]:
# This exists only so that the class code can be split across several
# notebook cells.
def add_method(cls):
    """Return a decorator that attaches a function to ``cls``.

    The decorated function is stored on ``cls`` under its own ``__name__``
    and is handed back unchanged, so the module-level name keeps pointing
    at the plain function.
    """
    def decorator(func):
        method_name = func.__name__
        setattr(cls, method_name, func)
        return func

    return decorator

class SimpleVectorDB:
    """Small wrapper around one Milvus collection of vectors with string metadata.

    The collection has three fields: an auto-generated INT64 primary key
    ``id``, a FLOAT_VECTOR field ``vector`` and a VARCHAR(512) field
    ``metadata``.
    """

    def __init__(
        self,
        host="localhost",
        port="19530",
        vector_dim=128,
        collection_name="simple_vectors",
        index_params=None,
    ):
        """Initialize connection to Milvus server and set up the collection.

        Args:
            host: host passed to ``connections.connect``.
            port: port passed to ``connections.connect``.
            vector_dim: dimension of the ``vector`` field in the schema.
            collection_name: name of the collection to open or create.
            index_params: index configuration handed to ``create_index`` when
                the collection is newly created. When ``None``, an IVF_FLAT
                index with the COSINE metric and ``nlist=1024`` is used.
        """
        if index_params is None:
            # Built per call so that instances never share (and cannot
            # accidentally mutate) one default dict object.
            index_params = {
                "metric_type": "COSINE",
                "index_type": "IVF_FLAT",
                "params": {"nlist": 1024},
            }

        self.collection_name = collection_name

        # Connect to Milvus server
        connections.connect(host=host, port=port)

        # Vector dimension used in the schema below
        self.dim = vector_dim

        print("self.dim", self.dim)

        # Define the fields for the collection
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=self.dim),
            FieldSchema(name="metadata", dtype=DataType.VARCHAR, max_length=512),
        ]

        schema = CollectionSchema(fields=fields, description="Simple vector database")

        if utility.has_collection(self.collection_name):
            # An existing collection is opened as it is: neither the schema
            # built above nor index_params is applied to it.
            self.collection = Collection(self.collection_name)
        else:
            self.collection = Collection(self.collection_name, schema)

            # Create an index for vector field
            self.collection.create_index(field_name="vector", index_params=index_params)





In [3]:
@add_method(SimpleVectorDB)
def insert(self, vectors, metadata_list):
    """
    Store vectors in the collection, each paired with one metadata string.

    Args:
        vectors: array-like of shape (n, dim); converted to a float32
            numpy array before insertion
        metadata_list: sequence of n strings, one per vector

    Raises:
        ValueError: if ``vectors`` and ``metadata_list`` differ in length
    """
    if len(vectors) != len(metadata_list):
        raise ValueError("Number of vectors and metadata entries must match")

    # Coerce anything that is not already an ndarray, then normalise the dtype.
    as_array = vectors if isinstance(vectors, np.ndarray) else np.array(vectors)
    as_array = as_array.astype(np.float32)

    # One row dict per vector; the "id" field is omitted because the schema
    # declares it as auto_id.
    rows = []
    for row_vector, row_meta in zip(as_array, metadata_list):
        rows.append({"vector": row_vector, "metadata": row_meta})

    # Insert, then flush straight away.
    self.collection.insert(rows)
    self.collection.flush()


In [4]:
@add_method(SimpleVectorDB)
def search(self, query_vector, top_k=5, search_params=None):
    """
    Search for the closest vectors to the query vector.

    Args:
        query_vector: numpy array (or array-like) of shape (dim,) containing
            the query vector; converted to float32
        top_k: number of closest vectors to return
        search_params: dict passed as ``param`` to ``Collection.search``;
            an empty dict is used when omitted

    Returns:
        List of dicts with the keys "id", "distance" and "metadata", one per
        hit, in the order the hits are returned by the search call
    """
    # A fresh dict per call instead of a mutable default argument shared by
    # every invocation.
    if search_params is None:
        search_params = {}

    self.collection.load()

    # Ensure query vector is in the correct format
    if not isinstance(query_vector, np.ndarray):
        query_vector = np.array(query_vector)
    query_vector = query_vector.astype(np.float32)

    results = self.collection.search(
        data=[query_vector],
        anns_field="vector",
        param=search_params,
        limit=top_k,
        output_fields=["metadata"],
    )

    # Flatten the per-query hit lists (only one query is sent) into plain dicts.
    return [
        {
            "id": hit.id,
            "distance": hit.distance,
            "metadata": hit.entity.get("metadata"),
        }
        for hits in results
        for hit in hits
    ]


In [5]:
@add_method(SimpleVectorDB)
def delete_by_ids(self, ids):
    """Delete vectors by their IDs.

    Args:
        ids: iterable of integer primary-key values (list, tuple, set,
            numpy integer array, ...).

    Raises:
        TypeError: if an element is not an integer (e.g. a float or str).
    """
    from operator import index

    # Build the list literal explicitly. Formatting ``ids`` directly only
    # produces a valid "[1, 2, 3]" literal for a plain list of Python ints;
    # a tuple renders as "(1, 2)", a set as "{1, 2}" and a numpy array as
    # "[1 2]". operator.index also rejects non-integer values, so no
    # arbitrary text can end up inside the expression.
    id_literal = ", ".join(str(index(pk)) for pk in ids)
    expr = f"id in [{id_literal}]"
    self.collection.delete(expr)

@add_method(SimpleVectorDB)
def count(self):
    """Report the number of vectors in the database.

    Returns:
        The collection's ``num_entities`` value.
    """
    total = self.collection.num_entities
    return total

@add_method(SimpleVectorDB)
def close(self):
    """Disconnect from the Milvus server.

    Only the connection registered under the "default" alias is closed.
    NOTE(review): __init__ connects without naming an alias; this assumes
    that connection is the "default" one - confirm.
    """
    alias = "default"
    connections.disconnect(alias)
    
@add_method(SimpleVectorDB)
def delete_collection(self):
    """Drop the collection held by this instance."""
    collection = self.collection
    collection.drop()


In [6]:

## FLAT Index

# Index configuration for this run: FLAT index with the COSINE metric.
FLAT_INDEX_PARAMS = dict(index_type="FLAT", metric_type="COSINE")

# Dimension used for the collection schema and for the generated vectors.
VECTOR_DIM = 32

db = SimpleVectorDB(
    collection_name="flat_index_collection",
    vector_dim=VECTOR_DIM,
    index_params=FLAT_INDEX_PARAMS,
)


def insertRandomVectors(db, num_vectors = 99999, vector_size = 128):
    """Generate random float32 vectors with labels and insert them via ``db``.

    Args:
        db: object exposing ``insert(vectors, metadata)``.
        num_vectors: how many vectors to generate.
        vector_size: dimension of each generated vector.
    """
    # Uniform samples scaled by 10000, kept as float32.
    shape = (num_vectors, vector_size)
    random_vectors = np.random.random(shape).astype(np.float32) * 10000

    # One label per vector: "Vector_0", "Vector_1", ...
    labels = []
    for i in range(num_vectors):
        labels.append(f"Vector_{i}")
    label_array = np.array(labels)

    db.insert(random_vectors, label_array)



# Benchmark for the FLAT index: insert three equally sized batches of random
# vectors, run the same query ten times, print the last result set, and drop
# the collection afterwards.
NUM_BATCHES = 3
BATCH_SIZE = 333333

try:
    # perf_counter is a monotonic clock intended for measuring intervals.
    t1 = time.perf_counter()
    for _ in range(NUM_BATCHES):
        insertRandomVectors(db, num_vectors=BATCH_SIZE, vector_size=VECTOR_DIM)
    # The reported total is derived from the same constants the loop uses, so
    # the message cannot drift from what was actually inserted.
    print(f"Inserted {NUM_BATCHES * BATCH_SIZE} vectors in {time.perf_counter() - t1:.2f}s")

    # Perform a search. Each db.search call also calls collection.load(), so
    # that is included in the measured time.
    t2 = time.perf_counter()
    query = np.random.random(VECTOR_DIM).astype(np.float32).tolist()  # Ensure float32 type
    for _ in range(10):
        results = db.search(query, top_k=3)
    print(f"Search time: {time.perf_counter() - t2:.2f}s")

    print("\nSearch results:", results)
    for hit in results:
        print(f"ID: {hit['id']}, Distance: {hit['distance']}, Metadata: {hit['metadata']}")
finally:
    # Drop the collection even when the run is interrupted; otherwise a later
    # run would find the collection already present and reuse its old data.
    db.delete_collection()




self.dim 32


KeyboardInterrupt: 

In [7]:
## HNSW Index

# Index configuration for this run: HNSW index with the COSINE metric.
HNSW_INDEX_PARAMS = dict(
    index_type="HNSW",
    metric_type="COSINE",
    params=dict(M=64, efConstruction=64),
)

db = SimpleVectorDB(
    collection_name="hnsw_index_collection",
    vector_dim=VECTOR_DIM,
    index_params=HNSW_INDEX_PARAMS,
)


def insertRandomVectors(db, num_vectors = 9999, vector_size = 128):
    """Generate random float32 vectors with labels and insert them via ``db``.

    Args:
        db: object exposing ``insert(vectors, metadata)``.
        num_vectors: how many vectors to generate.
        vector_size: dimension of each generated vector.
    """
    # Uniform samples scaled by 10000, kept as float32.
    shape = (num_vectors, vector_size)
    random_vectors = np.random.random(shape).astype(np.float32) * 10000

    # One label per vector: "Vector_0", "Vector_1", ...
    labels = []
    for i in range(num_vectors):
        labels.append(f"Vector_{i}")
    label_array = np.array(labels)

    db.insert(random_vectors, label_array)



# Benchmark for the HNSW index: insert three equally sized batches of random
# vectors, run the same query ten times, print the last result set, and drop
# the collection afterwards.
NUM_BATCHES = 3
BATCH_SIZE = 33333

try:
    # perf_counter is a monotonic clock intended for measuring intervals.
    t1 = time.perf_counter()
    for _ in range(NUM_BATCHES):
        insertRandomVectors(db, num_vectors=BATCH_SIZE, vector_size=VECTOR_DIM)
    # The reported total is derived from the same constants the loop uses, so
    # the message cannot drift from what was actually inserted.
    print(f"Inserted {NUM_BATCHES * BATCH_SIZE} vectors in {time.perf_counter() - t1:.2f}s")

    # Perform a search. Each db.search call also calls collection.load(), so
    # that is included in the measured time.
    t2 = time.perf_counter()
    query = np.random.random(VECTOR_DIM).astype(np.float32).tolist()  # Ensure float32 type
    for _ in range(10):
        results = db.search(query, top_k=3)
    print(f"Search time: {time.perf_counter() - t2:.2f}s")

    print("\nSearch results:", results)
    for hit in results:
        print(f"ID: {hit['id']}, Distance: {hit['distance']}, Metadata: {hit['metadata']}")
finally:
    # Drop the collection even when the run is interrupted; otherwise a later
    # run would find the collection already present and reuse its old data.
    db.delete_collection()


self.dim 32
Inserted 99999 vectors in 23.42s
Search time: 3.57s

Search results: [{'id': 455325402355499568, 'distance': 0.9310328960418701, 'metadata': 'Vector_14870'}, {'id': 455325402355505849, 'distance': 0.9293422102928162, 'metadata': 'Vector_21151'}, {'id': 455325402355546111, 'distance': 0.9263074398040771, 'metadata': 'Vector_28077'}]
ID: 455325402355499568, Distance: 0.9310328960418701, Metadata: Vector_14870
ID: 455325402355505849, Distance: 0.9293422102928162, Metadata: Vector_21151
ID: 455325402355546111, Distance: 0.9263074398040771, Metadata: Vector_28077
