In [12]:
import sys
import numpy as np

 
def ivecs_read(fname):
  a = np.fromfile(fname, dtype='int32')
  d = a[0]
  return a.reshape(-1, d + 1)[:, 1:].copy()
 
def fvecs_read(fname):
  return ivecs_read(fname).view('float32')

def fbin_read(fname):
  data = np.fromfile(fname, dtype='int32')
  n, d = data[0], data[1]
  return data[2:].reshape(n, d).view('float32')


In [13]:
path_sift1m = "/var/lib/docker/anns/dataset/sift1m/base.fvecs"
path_gist1m = "/var/lib/docker/anns/dataset/gist1m/base.fvecs"
path_deep1m = "/var/lib/docker/anns/dataset/deep1b/base.1M.fbin"
path_text2image1m = "/home/dbcloud/big-ann-benchmarks/data/text2image1B/base.1B.fbin.crop_nb_1000000"
path_turing1m = "/home/dbcloud/big-ann-benchmarks/data/MSTuringANNS/base1b.fbin.crop_nb_1000000"

Hopkins Statistic

In [16]:
from scipy.spatial.distance import cdist

def hopkins_statistic(data, sample_size=None):
  n = data.shape[0]
  if sample_size is None:
      sample_size = n

  # Randomly Select Sample Points
  indices = np.random.permutation(n)
  sample_data = data[indices[:sample_size]]
  
  # Randomly Generate Background Points
  min_vals = data.min(axis=0)
  max_vals = data.max(axis=0)
  background_data = np.random.uniform(min_vals, max_vals, (sample_size, data.shape[1]))
  
  # Distance Computation
  d_x = np.mean(np.min(cdist(sample_data, background_data), axis=1))
  d_u = np.mean(np.min(cdist(background_data, sample_data), axis=1))

  # COmputation for Hopkins Statistic
  H = d_x / (d_x + d_u)
  
  return H

sample_size = 5000



H_sift1m = hopkins_statistic(fvecs_read(path_sift1m), sample_size)
print("Hopkins Statistic for SIFT1M: ", H_sift1m)

H_gist1m = hopkins_statistic(fvecs_read(path_gist1m), sample_size)
print("Hopkins Statistic for GIST1M: ", H_gist1m)

H_deep1m = hopkins_statistic(fbin_read(path_deep1m), sample_size)
print("Hopkins Statistic for DEEP1M: ", H_deep1m)

H_text2image1m = hopkins_statistic(fbin_read(path_text2image1m), sample_size)
print("Hopkins Statistic for Text2image1M: ", H_text2image1m)

H_turing1m = hopkins_statistic(fbin_read(path_turing1m), sample_size)
print("Hopkins Statistic for Turing1M: ", H_turing1m)

H_random_clustered_data = hopkins_statistic(fvecs_read(path_random_clustered_data), sample_size)
print("Hopkins Statistic for Random Clustered Data: ", H_random_clustered_data)

Hopkins Statistic for SIFT1M:  0.49618556213019954
Hopkins Statistic for GIST1M:  0.5275797203532487
Hopkins Statistic for DEEP1M:  0.4792842712513281
Hopkins Statistic for Text2image1M:  0.4871102759793496
Hopkins Statistic for Turing1M:  0.48044122831340663
