# 基础index可以组合使用
## 数据准备

In [1]:
import faiss
import numpy as np 


# Synthetic dataset shared by the examples in this notebook: the database
# vectors and the query vectors are both drawn i.i.d. from N(mu, sigma^2).
d = 512          # dimensionality of each vector
n_data = 2000    # number of database vectors
mu = 3           # mean of the normal distribution
sigma = 0.1      # standard deviation of the normal distribution

# Draw the whole (n_data, d) matrix in a single call rather than appending
# one row at a time; with the same seed this consumes the random stream in
# the same order, so the resulting values are unchanged. Stored as float32.
np.random.seed(0)
data = np.random.normal(mu, sigma, (n_data, d)).astype('float32')

# query
n_query = 10     # number of query vectors
# A different seed keeps the queries distinct from the database vectors.
np.random.seed(12)
query = np.random.normal(mu, sigma, (n_query, d)).astype('float32')

## 乘积量化作为cell-probe方法的粗量化器
乘积量化也可以作为粗量化器。其中有两个参数：每个划分的聚类中心数c和维度划分数m。每个划分都有c个聚类中心，组合起来一共有$c^m$个倒排表（下面代码中的`nbits_mi`是每个划分的比特数，即$c = 2^{nbits\_mi}$）。实际使用中，一般直接让$m=2$。  
MultiIndexQuantizer也经常与IndexFlat对比，以便选取合适的参数。

In [2]:
# Build an IndexIVFFlat whose coarse quantizer is a MultiIndexQuantizer,
# train it on `data`, add `data`, then search the 10 nearest neighbours of
# every vector in `query` and print the distances and the indices.
nbits_mi = 5   # exponent behind c: ncentroids_mi below uses 2 ** nbits_mi per split
M_mi = 2       # m: number of splits
coarse_quantizer_mi = faiss.MultiIndexQuantizer(d, M_mi, nbits_mi) # no data needs to be added to it
# Total list count passed to the IVF index: (2 ** nbits_mi) ** M_mi.
ncentroids_mi = 2 ** (M_mi * nbits_mi)

index = faiss.IndexIVFFlat(coarse_quantizer_mi, d, ncentroids_mi)
index.quantizer_trains_alone = True  # original note: flag marking this as the coarse quantizer; presumably lets the quantizer train itself — confirm against the Faiss docs
index.train(data)
index.add(data)
index.nprobe = 50  # presumably the number of inverted lists visited per query — confirm
dis, ind = index.search(query, 10)
print(dis)
print(ind)

[[8.61838   8.9166355 8.998905  9.01289   9.019705  9.188372  9.192066
  9.236515  9.236515  9.238319 ]
 [9.164592  9.201885  9.344341  9.34485   9.416972  9.513818  9.5142
  9.573903  9.605167  9.605826 ]
 [8.211029  8.373353  8.459253  8.459894  8.498556  8.631897  8.658703
  8.697348  8.71368   8.735945 ]
 [8.45299   8.513636  8.545483  8.597126  8.705633  8.7258835 8.7599125
  8.761805  8.781776  8.80736  ]
 [8.369623  8.760081  8.928711  8.93695   8.960407  9.022133  9.035181
  9.041813  9.088304  9.187949 ]
 [8.299071  8.432397  8.457373  8.562357  8.579033  8.6983185 8.794185
  8.794858  8.79899   8.835644 ]
 [8.860753  8.885756  8.922768  8.928856  8.9446945 8.959139  8.972376
  8.977709  9.020763  9.039915 ]
 [8.763845  8.7686    8.846914  8.846914  8.9460125 8.97376   8.976009
  9.007911  9.009394  9.033215 ]
 [8.488056  8.662771  8.701336  8.741288  8.8490505 8.857254  8.893715
  8.933592  8.933592  8.938933 ]
 [8.684618  8.767246  8.903692  8.903692  8.917681  8.940119  8.9

## Pre-filtering PQ codes with polysemous codes

In [10]:
# IndexPQ with polysemous codes: enable polysemous training before train(),
# switch the search type and set the Hamming threshold before search(), then
# search the 10 nearest neighbours of every query and print the results.
index = faiss.IndexPQ(d, 16, 8)  # presumably 16 sub-quantizers of 8 bits each (128-bit codes) — confirm
# before train
index.do_polysemous_training = True
index.train(data)

index.add(data)
# before searching
index.search_type = faiss.IndexPQ.ST_polysemous
index.polysemous_ht = 54  # the Hamming threshold

dis, ind = index.search(query, 10)
print(dis)
print(ind)

[[5.974182  6.10614   6.1224976 6.126343  6.204773  6.2459717 6.2525024
  6.2615356 6.2651367 6.2732544]
 [6.5718384 6.638489  6.6399536 6.6640625 6.6744385 6.6782837 6.746521
  6.751709  6.756714  6.769348 ]
 [5.9680786 5.9692383 5.979309  6.0097046 6.039795  6.1034546 6.1087036
  6.1325684 6.1377563 6.140991 ]
 [5.486389  5.761841  5.781189  5.7894897 5.8067017 5.831421  5.8376465
  5.840637  5.8604126 5.875305 ]
 [5.8931885 6.109314  6.147461  6.1534424 6.185974  6.21875   6.2248535
  6.243225  6.2452393 6.2542725]
 [5.776123  6.0252686 6.0493774 6.0758057 6.093445  6.0980225 6.1068115
  6.114258  6.139099  6.17218  ]
 [6.023987  6.0289917 6.043213  6.04834   6.055298  6.091736  6.1139526
  6.1154785 6.140625  6.1411133]
 [6.0039062 6.060547  6.0739136 6.130188  6.179138  6.1831665 6.22876
  6.249756  6.2651367 6.2764893]
 [5.9506226 6.124695  6.152466  6.159851  6.164917  6.1794434 6.1951904
  6.20166   6.215149  6.2680664]
 [6.0117188 6.022705  6.185547  6.2302856 6.256653  6.2704

In [31]:
# for IndexIVFPQ
# Same polysemous setup as the IndexPQ example, applied to an IndexIVFPQ built
# on a flat L2 coarse quantizer; searches the 10 nearest neighbours of every
# query and prints the distances and the indices.
coarse_quantizer = faiss.IndexFlatL2(d)
# NOTE(review): ncentroids_mi and M_mi are reused from the MultiIndexQuantizer
# cell. If the 4th argument is the number of PQ sub-quantizers, M_mi = 2 gives
# 2 * 8 = 16-bit codes, so the Hamming threshold of 54 set below would exceed
# the code length; the accompanying prose assumes 128-bit codes (16 * 8).
# Confirm whether 16 was intended here.
index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids_mi, M_mi, 8)
# before training
index.do_polysemous_training = True
index.train(data)

index.add(data)

# before searching
index.polysemous_ht = 54  # the Hamming threshold

dis, ind = index.search(query, 10)
print(dis)
print(ind)

[[5.5944242 5.762803  5.81783   5.826597  5.8281116 5.853023  5.8883333
  5.9103804 5.9335566 5.9981723]
 [6.017277  6.0180216 6.1391654 6.1525793 6.1690145 6.285906  6.367799
  6.3924723 6.483139  6.5108876]
 [5.1201916 5.155489  5.1658216 5.2309318 5.2639446 5.27475   5.284258
  5.3065944 5.332886  5.40935  ]
 [5.0789447 5.1293364 5.129437  5.2446656 5.2639885 5.297005  5.3017607
  5.332982  5.33683   5.3529315]
 [5.5589275 5.579212  5.659722  5.6908636 5.7286787 5.7511234 5.7699876
  5.7724476 5.7734365 5.8821025]
 [5.2813945 5.3259473 5.353539  5.38878   5.3981133 5.4004183 5.4237156
  5.4381495 5.4439116 5.4494076]
 [5.431495  5.4494677 5.548911  5.555647  5.6319094 5.634611  5.638289
  5.6782656 5.7148957 5.720402 ]
 [5.3285294 5.335212  5.4976454 5.5008645 5.525567  5.5554523 5.555627
  5.5836873 5.5910864 5.593465 ]
 [5.4126625 5.4127774 5.491379  5.5269804 5.5783095 5.578781  5.619939
  5.622184  5.637552  5.666792 ]
 [5.3548384 5.5728064 5.635675  5.6637754 5.703406  5.703953

要设置合理的阈值，请记住：
* 阈值(threshold)应介于0和每个编码的位数之间（在这种情况下为128 = 16 * 8），并且编码遵循二项分布
* 将阈值设置为编码位数的1/2只能省去约1/2的编码比较，这还不够；应将其设置为更低的值（因此对于128位编码取54）。

## IndexIVFPQR: refining IVFPQ search results with an additional product quantizer
`IndexIVFPQR`在`IndexIVFPQ`之上又增加了一个额外的量化器，与`IndexRefineFlat`类似，它对`IndexIVFPQ`计算出的距离进行精化，并据此对结果重新排序。