## 对比IVF_PQ索引性能

In [21]:
# 连接部署在Docker上的Standalone版本Milvus
from pymilvus import connections, db

conn = connections.connect(host="127.0.0.1", port=19530)

# 查看当前已有数据库
db.list_database()
db.using_database("IVF_PQ_100k_128")

In [2]:
# # 创建测试RESH的数据库
# database = db.create_database("IVF_PQ_100k_128")
# 
# # 列出所有数据库
# db.list_database()

['wwf_mac_docker',
 'RESH_100k_128',
 'FLAT_100k_128',
 'HNSW_100k_128',
 'IVF_PQ_100k_128',
 'default']

In [22]:
# 创建Collection
from pymilvus import MilvusClient, DataType

client = MilvusClient(
    uri="http://localhost:19530"
)

client.create_collection(
    collection_name="IVF_PQ_100k_128_collection",
    dimension=128,
    index_params="IVF_PQ",
    # index_params="IVF_FLAT",
    # index_params="IVF_SQ8",
    # index_params="IVF_PQ",
    # index_params="HNSW",

    metric_type="L2"
    # metric_type="IP"
    # metric_type="COSINE"
)

res = client.get_load_state(
    collection_name="IVF_PQ_100k_128_collection"
)

print(res)

{'state': <LoadState: Loaded>}


In [23]:
import numpy as np

# 加载向量数据
ResNet50_features = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Original_data\\128_100k_ResNet50_vector.npy')

# 打印每个特征向量的形状
print("Features shape:", ResNet50_features.shape)

# 为每个特征生成唯一的 ID 列表
ResNet50_features_ids = list(range(len(ResNet50_features)))

# # 随机选择 n 个向量
# n = 100  # 你可以修改 n 的值
# random_indices = np.random.choice(len(ResNet50_features), size=n, replace=False)
# random_vectors = ResNet50_features[random_indices]
# 
# # 将随机选出的向量转为 [[ ]] 的形式
# random_vectors_list = random_vectors.tolist()
# 
# # 打印随机选出的向量
# print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Features shape: (100000, 128)


In [4]:
# 通用的插入函数，用于将高维向量插入到Collection
def insert_to_milvus(collection_name, features, ids):
    # features 是一个 NumPy 数组，直接使用
    # 构建 Milvus 所需的插入数据结构
    data = [{"id": ids[i], "vector": features[i].tolist()} for i in range(len(features))]

    # 插入数据到集合中 (假设 client 是正确连接的 Milvus 客户端实例)
    res = client.insert(collection_name=collection_name, data=data)

    # 打印插入结果
    print(f"Inserting into {collection_name} completed with result: {len(res)}")

# 插入特征到 _collection
insert_to_milvus("IVF_PQ_100k_128_collection", ResNet50_features, ResNet50_features_ids)

Inserting into IVF_PQ_100k_128_collection completed with result: 3


In [31]:
import numpy as np

# 加载查询负载--完全随机负载100
Sample_features = np.load('D:\Python_Project\Learned_Index\Milvus\RESH\Query\Completely_random\sampled_100_vectors.npy')

# 打印每个特征向量的形状
print("Sample Features Shape:", Sample_features.shape)

# # 为每个特征生成唯一的 ID 列表
# Sample_features_ids = list(range(len(Sample_features)))
# 
# # 随机选择 n 个向量
# n = 100  # 你可以修改 n 的值
# random_indices = np.random.choice(len(Sample_features), size=n, replace=False)
# random_vectors = Sample_features[random_indices]

# 将随机选出的向量转为 [[ ]] 的形式
Sample_features_list = Sample_features.tolist()

# 打印随机选出的向量
# print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Sample Features Shape: (100, 128)


In [107]:
import time
import json
import numpy as np
from decimal import Decimal, getcontext

# 设置计算精度
getcontext().prec = 50  # 可根据需求调整精度

# 记录开始时间
start_time = time.time()

# 执行搜索操作
res = client.search(
    collection_name="IVF_PQ_100k_128_collection",
    data=Sample_features_list,
    limit=50,
    search_params={"metric_type": "L2"}
)

# 记录结束时间
end_time = time.time()

# 将 res 转换为 JSON 字符串（使用缩进格式）
result = json.dumps(res, indent=4)

# 将 JSON 字符串解析回 Python 对象
parsed_result = json.loads(result)

# 提取所有 id，并保持每个查询结果的 id 在一个子列表中
ids_list = []
distance_sums = []

for result in parsed_result:
    # 提取 id 列表
    ids_list.append([item['id'] for item in result])
    
    # 计算距离总和 (使用 Decimal 提高精度)
    distance_sum = sum(Decimal(item['distance']) for item in result)
    distance_sums.append([float(distance_sum)])  # 将 Decimal 转为 float 存储

# 打印 id 列表和距离总和
print("查询结果的 IDs 列表:", ids_list)
print("每个查询的距离总和:", distance_sums)

# 将 ids_list 和 distance_sums 保存为 .npy 文件
np.save('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Query\\Completely_random\\ids_list_IVF_PQ.npy', ids_list)
np.save('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Query\\Completely_random\\distance_sum_IVF_PQ.npy', distance_sums)

print("IDs 和距离总和已成功保存到 .npy 文件中！")

# 计算并打印运行时间（单位：毫秒）
elapsed_time = (end_time - start_time) * 1000  # 转换为毫秒
print(f"运行时间: {elapsed_time:.2f} 毫秒")

查询结果的 IDs 列表: [[94854, 2975, 37855, 34868, 5910, 69651, 56576, 47248, 7882, 94187, 70945, 83102, 58251, 25111, 50738, 35111, 52003, 93027, 1232, 44625, 48674, 91145, 31543, 95568, 70527, 7403, 94006, 52097, 86851, 39407, 18308, 59210, 97575, 80666, 92781, 80289, 65278, 42146, 42856, 30259, 2532, 52041, 60206, 60868, 68008, 37620, 67191, 92964, 29484, 24914], [75452, 66168, 49161, 85738, 88048, 56889, 78427, 93119, 39587, 59377, 72112, 68408, 70447, 68932, 57215, 56094, 89024, 63090, 47743, 96607, 69099, 33983, 75550, 41527, 80302, 76931, 9306, 37142, 94017, 73278, 44915, 77709, 79977, 55052, 83366, 65532, 48808, 70652, 75293, 60680, 39970, 76194, 89502, 78758, 96465, 43288, 82553, 62697, 43433, 60316], [85447, 34360, 36541, 44071, 53027, 67205, 62504, 94142, 44104, 95299, 63295, 60270, 56593, 86933, 44939, 34291, 40162, 55892, 45779, 95718, 82088, 62177, 48727, 81500, 56915, 87037, 33545, 93738, 80436, 97714, 66774, 53680, 73189, 34822, 93126, 36464, 56880, 57556, 85513, 76063, 87621, 