## Create TileDB groups for a TileDB-Vector-Search vamana index: one populated with metadata only, and one test group with metadata and arrays

### Structure of TileDB group holding the index

* constituent arrays
  * `feature_vectors`
  * `adjacency_scores`
  * `adjacency_ids`
  * `adjacency_row_index`
  * `medoids` (Anticipating ivf_vamana)
* metadata
  * base (for any TileDB-Vector-Search index group)
    * `dataset_type`
    * `storage_version`
    * `dtype`
    * `feature_type`
    * `id_type`
    * `base_sizes`
    * `ingestion_timestamps`
    * `temp_size`
    * `dimension`
    * `feature_datatype`
    * `id_datatype`
  * vamana specific
    * `index_type`
    * `adjacency_scores_type`
    * `adjacency_row_index_type`
    * `num_edges_history`
    * `adjacency_scores_datatype`
    * `adjacency_row_index_datatype`

In [None]:
import tiledb
import numpy as np
import tempfile
import json
import os
import shutil

### Initialize metadata

In [None]:
# --- Base metadata (common to any TileDB-Vector-Search index group) ---
dataset_type = "vector_search"
storage_version = "0.3"
dtype = "float32"
feature_type = "float32"
id_type = "uint64"
# List-valued entries are stored as JSON strings.
base_sizes = json.dumps([0, 10000])
ingestion_timestamps = json.dumps([0, 1704946748930])
temp_size = np.uint64(0)
dimension = np.uint32(128)
# Numeric datatype codes mirroring the string type names above.
feature_datatype = np.uint32(2)  # float32
id_datatype = np.uint32(10)  # uint64

# --- Vamana-specific metadata ---
index_type = "Vamana"
adjacency_scores_type = "float32"
adjacency_row_index_type = "uint64"
num_edges_history = json.dumps([0, 40000])
adjacency_scores_datatype = np.uint32(2)  # same code as feature_datatype (float32)
adjacency_row_index_datatype = np.uint32(10)  # same code as id_datatype (uint64)

### Define paths to test group

In [None]:
# Root directory for the nano vamana test data (machine-specific absolute path).
test_data_root = '/Users/lums/TileDB/TileDB-Vector-Search-complete-index-vamana/external/test_data/nano/vamana'
# Group that receives metadata only.
test_index_name_metadata = f"{test_data_root}/vamana_test_index_metadata"
# Group that receives metadata plus the constituent arrays.
test_index_name = f"{test_data_root}/vamana_test_index"
print(f"Temporary index group: {test_index_name}")

#### Create TileDB group

In [None]:
def open_group_for_test_write(name):
    """Create a fresh TileDB group at `name` and return it opened for writing.

    If something already exists at `name` it is removed with
    `shutil.rmtree` first, so the returned group always starts empty.
    """
    already_present = os.path.exists(name)
    if already_present:
        # Start from scratch: drop whatever a previous run left behind.
        shutil.rmtree(name)

    tiledb.Group.create(name)
    writable_group = tiledb.Group(name, "w")
    return writable_group

def open_group_for_test_read(name):
    """Return the existing TileDB group at `name`, opened in read mode."""
    readable_group = tiledb.Group(name, "r")
    return readable_group

def open_group_for_test(name, rw):
    """Open the test group at `name` in the mode selected by `rw`.

    `rw == "w"` recreates the group and opens it for writing, `rw == "r"`
    opens the existing group for reading. Any other value returns None.
    """
    if rw == "w":
        return open_group_for_test_write(name)
    if rw == "r":
        return open_group_for_test_read(name)
    # Unrecognised mode: signal it to the caller with None rather than raising.
    return None

### Populate metadata

In [None]:
def populate_metadata(vamana_index):
    """Write the base and vamana-specific metadata entries to `vamana_index.meta`.

    The values come from the module-level variables of the same names; each
    one is stored under its own name as the metadata key.
    """
    entries = {
        # Base metadata
        'dataset_type': dataset_type,
        'storage_version': storage_version,
        'dtype': dtype,
        'feature_type': feature_type,
        'id_type': id_type,
        'base_sizes': base_sizes,
        'ingestion_timestamps': ingestion_timestamps,
        'temp_size': temp_size,
        'dimension': dimension,
        'feature_datatype': feature_datatype,
        'id_datatype': id_datatype,
        # Vamana-specific metadata
        'index_type': index_type,
        'adjacency_scores_type': adjacency_scores_type,
        'adjacency_row_index_type': adjacency_row_index_type,
        'num_edges_history': num_edges_history,
        'adjacency_scores_datatype': adjacency_scores_datatype,
        'adjacency_row_index_datatype': adjacency_row_index_datatype,
    }
    # Dict insertion order keeps the writes in the order listed above.
    for key, value in entries.items():
        vamana_index.meta[key] = value

In [None]:
### Create arrays in the group

In [None]:
# Create an array and add as a member to the group
def add_array_to_group(vamana_index, array_name, data_in):
    """Create a dense TileDB array from `data_in` and add it to the group.

    The array is created at ``test_index_name + "/" + array_name`` (any
    existing directory there is removed first), registered in `vamana_index`
    under `array_name` with a relative URI, filled with the data, and then
    passed to `verify_array`.

    Args:
        vamana_index: Open, writable group object; only its `add` method is
            used here.
        array_name: Used both as the directory name under `test_index_name`
            and as the member name inside the group.
        data_in: NumPy array, expected to be 1-D or 2-D (any non-2-D input
            gets a single "rows" dimension). C-contiguous input is written
            as is; Fortran-contiguous input is transposed before writing.

    Raises:
        ValueError: If `data_in` is neither C- nor Fortran-contiguous.
    """
    flags = data_in.flags

    if flags.c_contiguous:
        data = data_in
    elif flags.f_contiguous:
        # The NumPy flag is `f_contiguous`; the transpose of a
        # Fortran-contiguous array is a C-contiguous view.
        data = data_in.transpose()
    else:
        raise ValueError("Invalid orientation -- this should never happen")

    # Each dimension uses a single tile spanning its whole domain.
    dim = np.int32(data.shape[0])
    if data.ndim == 2:
        num_v = np.int32(data.shape[1])
        domain = tiledb.Domain(
            tiledb.Dim(name="rows", domain=(0, dim - 1), tile=dim, dtype=np.int32),
            tiledb.Dim(name="cols", domain=(0, num_v - 1), tile=num_v, dtype=np.int32),
        )
    else:
        domain = tiledb.Domain(
            tiledb.Dim(name="rows", domain=(0, dim - 1), tile=dim, dtype=np.int32)
        )

    values = tiledb.Attr("values", dtype=data.dtype)

    # NOTE(review): open question from the original author -- do we need to
    # specify orders if the array is fortran order? Col-major is kept as is.
    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(values,),
        cell_order='col-major',
        tile_order='col-major',
    )

    array_path = test_index_name + "/" + array_name
    if os.path.exists(array_path):
        shutil.rmtree(array_path)
    tiledb.Array.create(array_path, schema)

    vamana_index.add(array_name, name=array_name, relative=True)

    with tiledb.DenseArray(array_path, mode='w') as A:
        A[:] = data

    # Verify only after the write handle has been closed.
    verify_array(array_name, data)

def verify_array(verify_array_name, verify_data):
    """Read back a member array and report if it differs from `verify_data`.

    The array is read from ``test_index_name + "/" + verify_array_name``,
    the same location `add_array_to_group` writes to. On a mismatch (in
    shape or in any element) a failure message is printed; nothing is
    raised and nothing is returned.

    Args:
        verify_array_name: Name of the array directory under `test_index_name`.
        verify_data: NumPy array holding the expected contents.
    """
    verify_array_path = test_index_name + "/" + verify_array_name
    with tiledb.DenseArray(verify_array_path, mode='r') as A:
        written_data = A[:]["values"]
    # array_equal also treats a shape mismatch as "not equal" instead of raising.
    if not np.array_equal(verify_data, written_data):
        print(f"verifying add_array_to_group failed for {verify_array_path}")

### Write just the metadata for the group

In [None]:
# Create the metadata-only group, fill in its metadata, and close it again.
vamana_test_index_metadata = open_group_for_test(test_index_name_metadata, "w")
try:
    populate_metadata(vamana_test_index_metadata)
finally:
    # Always release the write handle, even if populating the metadata fails.
    vamana_test_index_metadata.close()

### Reopen group

In [None]:
# Reopen the metadata-only group read-only through the shared test helper.
verify_index = open_group_for_test(test_index_name_metadata, "r")

#### For now just dump the metadata for visual inspection -- @todo Compare values, types, etc

# verify_index.meta.dump()

In [None]:
# Release the read handle on the metadata-only group.
verify_index.close()

## Test group

### Create arrays

In [None]:
num_vectors = np.uint32(231)

# Synthetic feature vectors: a (dimension x num_vectors) ramp plus the
# transpose of a (num_vectors x dimension) ramp.
num_values = dimension * num_vectors
row_numbers = np.arange(num_values, dtype=np.float32).reshape(dimension, num_vectors)
column_numbers = np.arange(num_values, dtype=np.float32).reshape(num_vectors, dimension)
feature_vectors = row_numbers + column_numbers.T

# Synthetic adjacency data in row-index form with four entries per vector:
# squared ramp for scores, ids wrapped into [0, num_vectors), offsets in steps of 4.
num_edges = num_vectors * 4
adjacency_scores = np.square(np.arange(num_edges, dtype=np.float32))
adjacency_ids = np.mod(np.arange(num_edges, dtype=np.uint64), num_vectors)
adjacency_row_index = 4 * np.arange(num_vectors + 1, dtype=np.uint64)
# medoids = ...


In [None]:
# Recreate the full test group and make sure the write handle is usable.
vamana_test_index = open_group_for_test(test_index_name, "w")
group_is_open = vamana_test_index.isopen
if not group_is_open:
    raise IOError(f"{test_index_name} is not open")

In [None]:
# Write the base and vamana-specific metadata onto the full test group.
populate_metadata(vamana_test_index)

In [None]:
# Vamana specific: write each constituent array into the group, then verify it.
for member_name, member_data in (
    ('feature_vectors', feature_vectors),
    ('adjacency_scores', adjacency_scores),
    ('adjacency_ids', adjacency_ids),
    ('adjacency_row_index', adjacency_row_index),
):
    add_array_to_group(vamana_test_index, member_name, member_data)
    verify_array(member_name, member_data)

In [None]:
# Close the write handle on the full test group.
vamana_test_index.close()