# faiss基础模块
faiss中的索引基于几个基础算法构建，只不过在faiss中是一种高效的实现。他们分别是k-means聚类、PCA降维、PQ编码、解码。

## k-means聚类

In [2]:
import sys
import faiss
import numpy as np 

d = 512          #维数
n_data = 2000   
np.random.seed(0) 
data = []
mu = 3
sigma = 0.1
for i in range(n_data):
    data.append(np.random.normal(mu, sigma, d))
data = np.array(data).astype('float32')

# 聚类
ncentroids = 1024
niter = 20
verbose = True
d = data.shape[1]
kmeans = faiss.Kmeans(d, ncentroids)
kmeans.train(data)

#输出聚类中心
print(kmeans.centroids)

[[3.046999  3.0121088 3.0124333 ... 3.0203993 3.011947  2.9342847]
 [2.7589808 3.0725713 2.9360871 ... 3.0773525 2.902585  2.995511 ]
 [3.1167192 2.9537685 2.9987445 ... 3.0199993 2.9278672 3.050025 ]
 ...
 [2.9502757 3.0440164 2.9121387 ... 2.9652288 3.2078865 3.009649 ]
 [2.9459333 3.0297534 2.9002755 ... 2.9255435 2.8951385 2.9468067]
 [2.9947238 3.1082706 2.9418213 ... 3.0144033 3.046606  2.9184723]]


In [3]:
#计算某个向量属于哪一个子类，返回聚类中心次序和L2距离
D, I = kmeans.index.search(data[:5], 1)
print(D)
print(I)

[[4.899538 ]
 [2.2404225]
 [3.0874515]
 [4.472025 ]
 [2.1018007]]
[[ 61]
 [767]
 [393]
 [415]
 [175]]


In [4]:
#返回距离某个聚类中心最近的5个向量
index = faiss.IndexFlatL2(d)
index.add (data)
D, I = index.search(kmeans.centroids, 5)
print(D)
print(I)

[[3.2480469 4.0878906 4.2226562 4.2304688 4.3759766]
 [0.        8.475586  8.520508  8.709961  8.7421875]
 [0.        8.439453  8.458008  8.519531  8.62207  ]
 ...
 [0.        8.826172  8.837891  8.891602  8.931641 ]
 [2.5205078 3.046875  3.2646484 5.705078  5.932617 ]
 [2.2167969 2.2207031 6.609375  6.6591797 6.665039 ]]
[[1083  472  356 1892   34]
 [1411  414  198  620 1129]
 [ 140  317 1686   24  402]
 ...
 [ 753 1776  331  389  279]
 [ 432 1096  240  879  329]
 [ 625 1211  751  106 1318]]


## PCA降维

In [5]:
mat = faiss.PCAMatrix (512, 64)  # 从512维降为64维
mat.train(data)
assert mat.is_trained
tr = mat.apply_py(data)
print(tr.shape)

(2000, 64)


## PQ编码/解码
ProductQuantizer对象可以将向量编码为code。

In [6]:
d = 512  # 数据维度
cs = 4  # code size (bytes)

# 训练数据集
xt = data  #训练集

# dataset to encode (could be same as train)
x = data

pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

# encode编码 
codes = pq.compute_codes(x)

# decode解码
x2 = pq.decode(codes)

# 编码-解码后与原始数据的差
avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
print(avg_relative_error)

0.0008765541


## 标量量化器（scalar quantizer）与之类似。

In [7]:
d = 512  # 数据维度

# 训练集
xt = data

# dataset to encode (could be same as train)
x = data

# QT_8bit allocates 8 bits per dimension (QT_4bit also works)
sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
sq.train(xt)

# encode 编码
codes = sq.compute_codes(x)

# decode 解码
x2 = sq.decode(codes)

# 计算编码-解码后与原始数据的差
avg_relative_error = ((x - x2)**2).sum() / (x ** 2).sum()
print(avg_relative_error)

6.7287445e-08
