## 对比FLAT索引性能

In [2]:
# 连接部署在Docker上的Standalone版本Milvus
from pymilvus import connections, db

conn = connections.connect(host="127.0.0.1", port=19530)

# 查看当前已有数据库
db.list_database()

['wwf_mac_docker',
 'RESH_100k_128',
 'FLAT_100k_128',
 'HNSW_100k_128',
 'IVF_PQ_100k_128',
 'default']

In [2]:
# 创建测试RESH的数据库
database = db.create_database("FLAT_100k_128")

# 列出所有数据库
db.list_database()

RPC error: [create_database], <MilvusException: (code=65535, message=database already exist: FLAT_100k_128)>, <Time:{'RPC start': '2024-11-05 21:03:30.569458', 'RPC error': '2024-11-05 21:03:30.571816'}>


MilvusException: <MilvusException: (code=65535, message=database already exist: FLAT_100k_128)>

In [3]:
# 创建Collection
from pymilvus import MilvusClient, DataType

client = MilvusClient(
    uri="http://localhost:19530"
)

# client.create_collection(
#     collection_name="FLAT_100k_128_collection",
#     dimension=128,
#     index_params="FLAT",
#     # index_params="IVF_FLAT",
#     # index_params="IVF_SQ8",
#     # index_params="IVF_PQ",
#     # index_params="HNSW",
#     
#     metric_type="L2"
#     # metric_type="IP"
#     # metric_type="COSINE"
# )
# 
# res = client.get_load_state(
#     collection_name="FLAT_100k_128_collection"
# )
# 
# print(res)

In [4]:
import numpy as np

# 加载向量数据
ResNet50_features = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Original_data\\128_100k_ResNet50_vector.npy')

# 打印每个特征向量的形状
print("Features shape:", ResNet50_features.shape)

# 为每个特征生成唯一的 ID 列表
ResNet50_features_ids = list(range(len(ResNet50_features)))

# # 随机选择 n 个向量
# n = 1  # 你可以修改 n 的值
# random_indices = np.random.choice(len(ResNet50_features), size=n, replace=False)
# random_vectors = ResNet50_features[random_indices]
# 
# # 将随机选出的向量转为 [[ ]] 的形式
# random_vectors_list = random_vectors.tolist()
# 
# # 打印随机选出的向量
# print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Features shape: (100000, 128)


In [6]:
# 通用的插入函数，用于将高维向量插入到Collection
def insert_to_milvus(collection_name, features, ids):
    # features 是一个 NumPy 数组，直接使用
    # 构建 Milvus 所需的插入数据结构
    data = [{"id": ids[i], "vector": features[i].tolist()} for i in range(len(features))]

    # 插入数据到集合中 (假设 client 是正确连接的 Milvus 客户端实例)
    res = client.insert(collection_name=collection_name, data=data)

    # 打印插入结果
    print(f"Inserting into {collection_name} completed with result: {len(res)}")

# 插入特征到 _collection
insert_to_milvus("FLAT_100k_128_collection", ResNet50_features, ResNet50_features_ids)

Inserting into FLAT_100k_128_collection completed with result: 3


In [5]:
import numpy as np

# 加载查询负载--完全随机负载100
Sample_features = np.load('D:\Python_Project\Learned_Index\Milvus\RESH\Query\Completely_random\sampled_100_vectors.npy')

# 打印每个特征向量的形状
print("Sample Features Shape:", Sample_features.shape)

# 为每个特征生成唯一的 ID 列表
Sample_features_ids = list(range(len(Sample_features)))

# 随机选择 n 个向量
# n = 100  # 你可以修改 n 的值
# random_indices = np.random.choice(len(Sample_features), size=n, replace=False)
# random_vectors = Sample_features[random_indices]

# 将随机选出的向量转为 [[ ]] 的形式
Sample_features_list = Sample_features.tolist()
# Sample_features_list = random_vectors.tolist()
# Sample_features_ids = list(range(len(Sample_features_list)))

# 打印随机选出的向量
# print(f"Randomly selected {n} vectors in [[ ]] format:", random_vectors_list)

Sample Features Shape: (100, 128)


In [6]:
# 批量向量搜索 10NN
# import json
# res = client.search(
#     collection_name="FLAT_100k_128_collection",
#     data=Sample_features_list,
#     limit=10,
#     search_params={"metric_type": "L2",} 
# )
# result = json.dumps(res, indent=4)
# result = json.dumps(res)
# print(result)
import time
import json
import ast

# 记录开始时间
start_time = time.time()

# 执行搜索操作
res = client.search( 
    collection_name="FLAT_100k_128_collection",
    data=Sample_features_list,
    limit=10,
    search_params={"metric_type": "L2",}
)

# 记录结束时间
end_time = time.time()

# 将 res 转换为 JSON 字符串（使用缩进格式）
result = json.dumps(res, indent=4)

# 打印转换后的 JSON 字符串
# print(f"JSON 字符串：\n{result}")

# 将 JSON 字符串解析回 Python 对象
parsed_result = json.loads(result)

# 提取所有 id，并保持每个查询结果的 id 在一个子列表中
ids_list = []
for result in parsed_result:
    ids_list.append([item['id'] for item in result])  # 将每个查询结果的 id 作为一个子列表

# 打印每个子列表，每个查询的 ids 在一行显示
# for ids in ids_list:
#     print(f"查询结果的 IDs: {', '.join(map(str, ids))}")

print(ids_list)
# 将 ids_list 保存为 .npy 文件
np.save('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Query\\Completely_random\\ids_list_right.npy', ids_list)

print("IDs 已成功保存到 ids_list_right.npy 文件中！")

# 计算并打印运行时间（单位：毫秒）
elapsed_time = (end_time - start_time) * 1000  # 转换为毫秒
print(f"运行时间: {elapsed_time:.2f} 毫秒")

[[94854, 2975, 37855, 34868, 5910, 69651, 56576, 47248, 7882, 94187], [75452, 66168, 49161, 85738, 88048, 56889, 78427, 93119, 39587, 59377], [85447, 34360, 36541, 44071, 53027, 67205, 62504, 94142, 44104, 95299], [19386, 26547, 12441, 81460, 20174, 48677, 5331, 17765, 80859, 60028], [43843, 46271, 33786, 87007, 50078, 46405, 59393, 53349, 61119, 36619], [45944, 91499, 35287, 80838, 94079, 40019, 33895, 88755, 62869, 50777], [18322, 12876, 1686, 11767, 6212, 31180, 72143, 41185, 15788, 65672], [65115, 80289, 52874, 81793, 25729, 38785, 20618, 70925, 42574, 17379], [74942, 85531, 79203, 96527, 52519, 88567, 91942, 33910, 36190, 79975], [12397, 9459, 86463, 59279, 32823, 37112, 72891, 23885, 74857, 12959], [67761, 65158, 95706, 99282, 78585, 46475, 73321, 52476, 81501, 74552], [38359, 71193, 39847, 97464, 80744, 74079, 68863, 72601, 77243, 74310], [6122, 13505, 3116, 15818, 8264, 6993, 16555, 24806, 17493, 17400], [79422, 72936, 8893, 75597, 32743, 79181, 84111, 50248, 87835, 61391], [29

In [7]:
# 计算召回率
import numpy as np

# 加载三个 .npy 文件
ids_list_HNSW = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\\Query\\Completely_random\\ids_list_HNSW.npy', allow_pickle=True)
ids_list_IVF_PQ = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\Query\\Completely_random\\ids_list_IVF_PQ.npy', allow_pickle=True)
ids_list_right = np.load('D:\\Python_Project\\Learned_Index\\Milvus\\RESH\Query\\Completely_random\\ids_list_right.npy', allow_pickle=True)

# 计算召回率函数
def calculate_recall(predicted, ground_truth):
    recalls = []
    for pred, gt in zip(predicted, ground_truth):
        correct_count = len(set(pred) & set(gt))  # 计算预测结果与标准答案的交集
        recall = correct_count / len(gt)  # 召回率
        recalls.append(recall)
    return np.mean(recalls)  # 返回平均召回率

# 计算两个算法的平均召回率
recall_HNSW = calculate_recall(ids_list_HNSW, ids_list_right)
recall_IVF_PQ = calculate_recall(ids_list_IVF_PQ, ids_list_right)

# 输出召回率结果
print(f"HNSW 的平均召回率: {recall_HNSW:.2f}")
print(f"IVF_PQ 的平均召回率: {recall_IVF_PQ:.2f}")

HNSW 的平均召回率: 0.92
IVF_PQ 的平均召回率: 0.90


In [None]:
# 计算Overall Ratio
