# faiss基础模块
faiss中的索引建立在几个基础算法之上，faiss为这些算法提供了高效的实现。它们分别是：
* k-means聚类
* PCA降维
* PQ编码
* PQ解码

## k-means聚类

In [4]:
# 导入faiss
import sys
import faiss

# Data: n_data random vectors of dimension d, drawn i.i.d. from N(mu, sigma^2)
import numpy as np

d = 512          # dimensionality
n_data = 2000    # number of vectors
np.random.seed(0)  # fixed seed so the generated data is reproducible
mu = 3
sigma = 0.1
# Draw the whole (n_data, d) matrix in one vectorized call instead of
# appending one row at a time; with the seeded global generator this yields
# the same values as the row-by-row loop, in the same order.
data = np.random.normal(mu, sigma, (n_data, d)).astype('float32')

# Clustering: k-means with 100 centroids, 20 iterations, progress logging on
ncentroids, niter, verbose = 100, 20, True
d = data.shape[-1]  # vector dimensionality, read from the data matrix
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(data)

# Show the learned cluster centroids, then the shape of the centroid matrix
print(kmeans.centroids, kmeans.centroids.shape, sep='\n')

[[3.0195723 3.0102541 3.0154822 ... 3.0012543 2.983653  2.9833684]
 [2.963544  2.9899907 2.9942687 ... 2.989136  2.9432213 2.9964879]
 [3.038704  2.9829412 3.0110557 ... 2.9955654 3.0334764 3.0621648]
 ...
 [3.1335988 3.0283628 3.0185556 ... 2.9817123 2.8893135 2.846763 ]
 [3.0277164 2.9488263 2.9712245 ... 3.046851  3.0523338 2.9560134]
 [2.9731872 2.9688954 3.0551343 ... 3.0077536 3.038112  3.0422602]]
(100, 512)


In [5]:
# Objective value recorded during training, one entry per iteration
# (niter = 20 above, 20 values in the output); presumably the total squared
# error, which stops decreasing once the clustering has converged.
kmeans.obj

array([16907.16796875,  9558.58007812,  9549.16894531,  9546.1015625 ,
        9544.56542969,  9544.17089844,  9544.15234375,  9543.54589844,
        9543.54589844,  9543.54589844,  9543.54589844,  9543.54589844,
        9543.54589844,  9543.54589844,  9543.54589844,  9543.54589844,
        9543.54589844,  9543.54589844,  9543.54589844,  9543.54589844])

In [7]:
# NOTE: with the faiss build used here the Kmeans object has no
# `iteration_stats` attribute, so this cell raises AttributeError (see the
# output below). Other faiss releases may expose it — verify before relying on it.
kmeans.iteration_stats

AttributeError: 'Kmeans' object has no attribute 'iteration_stats'

In [8]:
# Assign vectors to clusters: for each of the first 5 vectors, look up its
# single nearest centroid in the index built by k-means.
# D holds the distances (squared L2, as faiss L2 indexes report them) and
# I holds the index of the matching centroid.
D, I = kmeans.index.search(data[:5], 1)
print(D)
print(I)

[[5.027096 ]
 [4.712036 ]
 [4.7764983]
 [4.5557623]
 [4.403679 ]]
[[61]
 [44]
 [61]
 [49]
 [83]]


In [14]:
# For every cluster centroid, retrieve the 5 nearest data vectors:
# index all data in a flat L2 index, then query it with the centroids.
index = faiss.IndexFlatL2(d)
index.add(data)
D, I = index.search(kmeans.centroids, 5)
# D: distances per centroid (one row each), I: ids of the matching data vectors
print(D, I, sep='\n')

[[3.8203125e+00 4.3466797e+00 4.3652344e+00 4.3691406e+00 4.3789062e+00]
 [3.6269531e+00 4.0800781e+00 4.1953125e+00 4.2304688e+00 4.2705078e+00]
 [3.7167969e+00 4.0781250e+00 4.1015625e+00 4.1796875e+00 4.2685547e+00]
 [3.4667969e+00 3.8759766e+00 4.1923828e+00 4.4365234e+00 4.5742188e+00]
 [0.0000000e+00 9.0585938e+00 9.0957031e+00 9.1376953e+00 9.1484375e+00]
 [3.7412109e+00 4.3515625e+00 4.4199219e+00 4.5517578e+00 4.5849609e+00]
 [3.8154297e+00 4.2207031e+00 4.4433594e+00 4.5625000e+00 4.5986328e+00]
 [3.5927734e+00 3.9169922e+00 4.3535156e+00 4.4375000e+00 4.4453125e+00]
 [3.9101562e+00 4.3496094e+00 4.3554688e+00 4.4189453e+00 4.4384766e+00]
 [0.0000000e+00 8.6308594e+00 8.8496094e+00 8.9980469e+00 9.0107422e+00]
 [3.8427734e+00 4.2568359e+00 4.3339844e+00 4.3779297e+00 4.3789062e+00]
 [3.5175781e+00 4.0654297e+00 4.1015625e+00 4.2148438e+00 4.2558594e+00]
 [2.3457031e+00 2.3486328e+00 6.8046875e+00 6.8427734e+00 6.9658203e+00]
 [9.7656250e-04 8.8378906e+00 8.9287109e+00 8.99414

## PCA降维

In [15]:
# PCA: learn a linear projection from the data's own dimensionality
# (512 for this dataset) down to 64 dimensions. The input dimension is read
# from the data instead of being hard-coded, so the cell keeps working if the
# dataset changes.
mat = faiss.PCAMatrix(data.shape[1], 64)
mat.train(data)
print(data.shape)
assert mat.is_trained  # sanity check that training completed
# Project the data into the reduced space
tr = mat.apply_py(data)
print(tr.shape)

(2000, 512)
(2000, 64)


## PQ编码/解码
ProductQuantizer对象可以将向量编码为code。

In [16]:
d = 512  # vector dimensionality
cs = 4   # code size (bytes)

# Train and encode on the same dataset (the set to encode could differ
# from the training set)
xt = data  # training set
x = data   # vectors to encode

# Product quantizer built with cs sub-quantizers of 8 bits each
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(xt)

# Round trip: encode to compact codes, then decode back to approximate vectors
codes = pq.compute_codes(x)
x2 = pq.decode(codes)

# Relative reconstruction error: total squared error divided by the total
# squared magnitude of the original vectors
avg_relative_error = np.square(x - x2).sum() / np.square(x).sum()
print(avg_relative_error)

0.0008764966


标量量化器（scalar quantizer）与之类似。

In [17]:
d = 512  # vector dimensionality

# Train and encode on the same dataset (the set to encode could differ
# from the training set)
xt = data  # training set
x = data   # vectors to encode

# Scalar quantizer: QT_8bit allocates 8 bits per dimension (QT_4bit also works)
sq = faiss.ScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
sq.train(xt)

# Round trip: encode to codes, then decode back to approximate vectors
codes = sq.compute_codes(x)
x2 = sq.decode(codes)

# Relative reconstruction error: total squared error divided by the total
# squared magnitude of the original vectors
avg_relative_error = np.square(x - x2).sum() / np.square(x).sum()
print(avg_relative_error)

6.7287445e-08
