In [2]:
import numpy as np
import faiss
from matplotlib import pyplot

Loading faiss with AVX2 support.


Whitening the data and then searching with L2 distance is equivalent to searching with a Mahalanobis distance (see https://en.wikipedia.org/wiki/Mahalanobis_distance).

In [10]:
# Synthetic data: Gaussian samples pushed through a random affine map
rs = np.random.RandomState(123)

d, n = 10, 10000

# Standard normal samples: mean 0, identity covariance
xnorm = rs.randn(n, d)

# Random linear mixing and offset give a Gaussian with general covariance.
# The mixing matrix is drawn before the offset so the random stream is
# consumed in a fixed order.
mix = rs.rand(d, d)
shift = rs.rand(d)

# Faiss expects float32 input
x = (xnorm @ mix + shift).astype('float32')

In [13]:
# Fit a d -> d PCA transform that also whitens the data.
# An eigen power of -0.5 is the whitening setting used throughout this notebook.
eigen_power = -0.5
whiten = faiss.PCAMatrix(d, d, eigen_power)
whiten.train(x)

In [14]:
# Inspect the eigenvalues of the trained PCA as a NumPy array
eigenvalues = faiss.vector_to_array(whiten.eigenvalues)
eigenvalues

array([2.7029884e+05, 2.0929883e+04, 1.5178835e+04, 9.8838896e+03,
       8.2295615e+03, 4.9401885e+03, 3.4985391e+03, 1.5123438e+03,
       1.7264275e+02, 7.5637016e+01], dtype=float32)

In [16]:
# Apply the trained whitening transform to the data.
# `xt` is the whitened counterpart of `x` and is what the whitened-space search indexes.
xt = whiten.apply_py(x)

In [18]:
# Baseline: L2 nearest neighbors in the original (un-whitened) space
index = faiss.IndexFlatL2(d)
index.add(x)

# Query with the first 3 vectors, asking for 4 neighbors each; show only the ids
distances, neighbor_ids = index.search(x[:3], 4)
neighbor_ids

array([[   0, 5289, 5793, 9512],
       [   1, 1570, 6202, 3440],
       [   2, 2601, 8350, 7803]])

In [17]:
# L2 nearest neighbors in the whitened space
index = faiss.IndexFlatL2(d)
index.add(xt)

# Query with the first 3 whitened vectors, asking for 4 neighbors each; show only the ids
distances, neighbor_ids = index.search(xt[:3], 4)
neighbor_ids

array([[   0, 1908, 1604, 4671],
       [   1,  742, 3440, 3498],
       [   2,  318, 2879, 9319]])

In [20]:
# Reference: L2 nearest neighbors among the identity-covariance Gaussian samples
xnorm32 = xnorm.astype('float32')  # single float32 conversion, reused for add and query

index = faiss.IndexFlatL2(d)
index.add(xnorm32)

# Query with the first 3 samples, asking for 4 neighbors each; show only the ids
distances, neighbor_ids = index.search(xnorm32[:3], 4)
neighbor_ids

array([[   0, 1908, 1604, 4671],
       [   1,  742, 3440, 3498],
       [   2,  318, 2879, 9319]])

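The results should be the same in the whitened space as in the identity-covariance space: whitening undoes the linear mixing up to a rotation (and sampling noise in the estimated covariance), and a rotation leaves L2 distances unchanged.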

## Degenerate example

In [25]:
# Build a rank-deficient mixing matrix by copying row 0 into row 1
M = rs.rand(d, d)
M[1, :] = M[0, :]

# Affine map of the identity-covariance samples through the singular matrix
# (the offset is drawn after M, keeping the random stream order unchanged)
offset = rs.rand(d)
x = (xnorm @ M + offset).astype('float32')

# Retrain the whitening transform on the degenerate data
whiten = faiss.PCAMatrix(d, d, -0.5)
whiten.train(x)

# Print the eigenvalue spectrum, then whiten the data
spectrum = faiss.vector_to_array(whiten.eigenvalues)
print(spectrum)
xt = whiten.apply_py(x)

[2.9622341e+05 2.0840871e+04 1.6759289e+04 9.2783945e+03 8.6274736e+03
 5.2020957e+03 3.0101091e+03 1.3651075e+03 1.4849922e+02 1.0448833e-02]


In [26]:
# L2 nearest neighbors in the whitened space of the rank-deficient data
index = faiss.IndexFlatL2(d)
index.add(xt)

# Query with the first 3 whitened vectors, asking for 4 neighbors each; show only the ids
distances, neighbor_ids = index.search(xt[:3], 4)
neighbor_ids

array([[   0,  118, 5289, 1908],
       [   1,  742, 3498, 7666],
       [   2, 9319,  318, 4849]])

Note that the results are now different. The last eigenvalue should be exactly 0, but because of numerical approximation it comes out as ±epsilon. If its sign happens to be negative, the result is even worse. In practice, it is best to restrict the output dimension so that eigenvalues that are too small are excluded: build the `PCAMatrix` with an output dimension smaller than `d`.