## 对比HNSW索引性能

In [1]:
# 连接部署在Docker上的Standalone版本Milvus
from pymilvus import connections, db

conn = connections.connect(host="127.0.0.1", port=19530)

# 查看当前已有数据库
db.list_database()

['HNSW_100k_128',
 'IVF_PQ_100k_128',
 'default',
 'wwf_mac_docker',
 'RESH_100k_128',
 'FLAT_100k_128']

In [3]:
# 创建测试RESH的数据库
database = db.create_database("HNSW_100k_128")

# 列出所有数据库
db.list_database()

['default',
 'wwf_mac_docker',
 'RESH_100k_128',
 'FLAT_100k_128',
 'HNSW_100k_128']

In [2]:
# 创建Collection
from pymilvus import MilvusClient, DataType

client = MilvusClient(
    uri="http://localhost:19530"
)

# client.create_collection(
#     collection_name="HNSW_100k_128_collection",
#     dimension=128,
#     index_params="HNSW",
#     # index_params="IVF_FLAT",
#     # index_params="IVF_SQ8",
#     # index_params="IVF_PQ",
#     # index_params="HNSW",
#     
#     metric_type="L2"
#     # metric_type="IP"
#     # metric_type="COSINE"
# )
# 
# res = client.get_load_state(
#     collection_name="HNSW_100k_128_collection"
# )
# 
# print(res)

In [5]:
import numpy as np

# 加载向量数据
ResNet50_features = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Original_data\\128_100k_ResNet50_vector.npy')

# 打印每个特征向量的形状
print("Features shape:", ResNet50_features.shape)

# 为每个特征生成唯一的 ID 列表
ResNet50_features_ids = list(range(len(ResNet50_features)))

# 随机选择 n 个向量
n = 100  # 你可以修改 n 的值
random_indices = np.random.choice(len(ResNet50_features), size=n, replace=False)
random_vectors = ResNet50_features[random_indices]

# 将随机选出的向量转为 [[ ]] 的形式
random_vectors_list = random_vectors.tolist()

# 打印随机选出的向量
print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Features shape: (100000, 128)
Randomly selected 100 vectors in [[ ]] format: [[-0.2541620433330536, 0.21408526599407196, 0.2415148764848709, -0.8798109889030457, 0.05302658677101135, 0.09620831161737442, -0.534636914730072, -0.26768770813941956, 0.6316637992858887, -0.16742633283138275, -0.31270527839660645, -0.1722744107246399, -0.46167728304862976, 0.17374691367149353, -0.08287922292947769, -0.008955635130405426, 0.18264976143836975, 0.7549436092376709, -0.31868186593055725, -0.6897777915000916, 0.7990290522575378, 0.01724894344806671, 0.5142604112625122, -0.27558475732803345, 0.6478550434112549, -0.043308187276124954, -0.5940326452255249, -0.3689795434474945, -0.4533664882183075, 0.016151202842593193, -0.24602364003658295, 0.5047791600227356, -0.041066162288188934, 0.5830411911010742, 0.31983816623687744, 0.5640528202056885, 0.35257652401924133, -0.645290195941925, -0.21118414402008057, 0.03828875720500946, -0.19780458509922028, 0.497911274433136, 0.4708804786205292, 0.1207419782876

In [6]:
# 通用的插入函数，用于将高维向量插入到Collection
def insert_to_milvus(collection_name, features, ids):
    # features 是一个 NumPy 数组，直接使用
    # 构建 Milvus 所需的插入数据结构
    data = [{"id": ids[i], "vector": features[i].tolist()} for i in range(len(features))]

    # 插入数据到集合中 (假设 client 是正确连接的 Milvus 客户端实例)
    res = client.insert(collection_name=collection_name, data=data)

    # 打印插入结果
    print(f"Inserting into {collection_name} completed with result: {len(res)}")

# 插入特征到 _collection
insert_to_milvus("HNSW_100k_128_collection", ResNet50_features, ResNet50_features_ids)

Inserting into HNSW_100k_128_collection completed with result: 3


In [3]:
import numpy as np

# 加载查询负载--完全随机负载100
Sample_features = np.load('D:\Python_Project\Learned_Index\Milvus\RESH\Query\Completely_random\sampled_100_vectors.npy')

# 打印每个特征向量的形状
print("Sample Features Shape:", Sample_features.shape)

# # 为每个特征生成唯一的 ID 列表
# Sample_features_ids = list(range(len(Sample_features)))
# 
# # 随机选择 n 个向量
# n = 100  # 你可以修改 n 的值
# random_indices = np.random.choice(len(Sample_features), size=n, replace=False)
# random_vectors = Sample_features[random_indices]

# 将随机选出的向量转为 [[ ]] 的形式
Sample_features_list = Sample_features.tolist()

# 打印随机选出的向量
# print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Sample Features Shape: (100, 128)


In [48]:
# 批量向量搜索 10NN
# import json
# res = client.search(
#     collection_name="FLAT_100k_128_collection",
#     data=Sample_features_list,
#     limit=10,
#     search_params={"metric_type": "L2",} 
# )
# result = json.dumps(res, indent=4)
# result = json.dumps(res)
# print(result)
import time

# 记录开始时间
start_time = time.time()

# 执行搜索操作
res = client.search( 
    collection_name="HNSW_100k_128_collection",
    data=Sample_features_list,
    limit=10,
    search_params={"metric_type": "L2",}
)

# 记录结束时间
end_time = time.time()

# 计算并打印运行时间（单位：毫秒）
elapsed_time = (end_time - start_time) * 1000  # 转换为毫秒
print(f"运行时间: {elapsed_time:.2f} 毫秒")


运行时间: 53.55 毫秒
