In [2]:
import numpy as np
import os
import sys
import tiledb

In [3]:
os.system('pwd');

/Users/lums/TileDB/feature-vector-prototype/python


In [4]:
# cfg = tiledb.Ctx().config()

# cfg["py.init_buffer_bytes"] = 1024**2 * 50
# cfg["vfs.s3.scheme"] = "https" 
# cfg["vfs.s3.region"] = "us-west-2"
# cfg["vfs.s3.endpoint_override"] = ""
# cfg["vfs.s3.use_virtual_addressing"] = "true"
# cfg["vfs.s3.aws_access_key_id"] = "AKIA2HZNSCDDICHRA6P2";
# cfg["vfs.s3.aws_secret_access_key"] = "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";

tiledb.default_ctx({"vfs.s3.region": "us-west-2"});

In [5]:
def get_data_info(filename):
    # Check if filename exists
    if (not os.path.exists(filename)):
        raise Exception(f'{filename} does not exist.')

    file_size = os.path.getsize(filename)
    print(f'The size of {filename} is {file_size} bytes.')

    f = open(filename, 'rb')
    dimension = np.fromfile(f, dtype=np.int32, count=1)[0]
    num_vectors = file_size // (4 + dimension * 4)   # Four bytes for float

    print(f'num_vectors is {num_vectors}, dimension is {dimension}')
    f.close()
    
    return num_vectors, dimension

In [16]:
def fvecs_generator(filename, num_vectors, dimension, block_size):

    base, ext = os.path.splitext(filename)
    if (ext == '.fvecs'):
        element_type = np.float32
    elif (ext == '.ivecs'):
        element_type = np.int32
    elif (ext == '.bvecs'):
        element_type = np.uint8
    else:
        raise Exception(f'Unkown extension {ext}')
    
    f = open(filename, 'rb')
    dim = np.fromfile(f, dtype=np.int32, count=1)[0]
    if (dimension != dim):
        raise Exception(f'{dimension} != {dim}')
    
    chunk_size = block_size * dimension
    # print(f'chunk_size is {chunk_size}, block_size is {block_size}, dimension is {dimension}')
    while True:
        chunk = np.fromfile(f, dtype=element_type, count=chunk_size)
        if chunk.size == 0:
            break
        # print(f'before reshape chunk is {type(chunk)} and size {chunk.shape}')
        b = chunk.reshape(block_size, dimension)
        b = np.transpose(b)

        # print(f'after reshape chunk is {type(b)} and size {b.shape}')

        yield b
    f.close()

In [17]:
def create_array(array_name, num_vectors, dimension, tile_size):
    
    print(f'Creating array {array_name}: {dimension} by {num_vectors}')
    
    # The array will be dimension by num_vectors                                                  
    dom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(0, dimension-1), tile=dimension, dtype=np.int32),
        tiledb.Dim(name="cols", domain=(0, num_vectors-1), tile=tile_size, dtype=np.int32),
    )

    # The array will be dense with a single attribute "a" so each (i,j) cell can store a float.                                  
    schema = tiledb.ArraySchema(
        domain=dom, sparse=False, attrs=[tiledb.Attr(name="a", dtype=np.float32)], 
        cell_order='col-major', tile_order='col-major'
    )

    # Create the (empty) array on disk.   
    if (tiledb.object_type(array_name) == "array"):
        print(f"Array {array_name} already exists.  Deleting")
        tiledb.remove(array_name)
    tiledb.DenseArray.create(array_name, schema)
    
    # Check size
    with tiledb.DenseArray(array_name, mode="r") as A:
        data = A[:]
        print(f'After creation, array shape is {data["a"].shape}')
    

In [18]:
def write_array(array_name, num_vectors, dimension, block_size, tile_size):

    print(f'Opening array {array_name} for writing')
    
    # Open the TileDB array for writing
    A = tiledb.DenseArray(array_name, mode='w')

    # Read data blocks from the file and write them to the TileDB array
    generator = fvecs_generator(filename, num_vectors, dimension, block_size)

    begin = 0
    for block in generator:
        # print(f'Read block number {begin} : block is {type(block)} with shape {block.shape}')
        A[0:dimension, begin*block_size:(begin+1)*block_size] = block
        begin = begin + 1
        if begin*block_size >= num_vectors:
            break

    # Close the TileDB array
    A.close()

In [19]:
filename = '/Users/lums/TileDB/feature-vector-prototype/external/data/sift/sift_base.fvecs'
# array_name = 'sift_base'
array_name = 's3://tiledb-lums/sift_base'
# array_name = 'tiledb://lums/sift_base'
# array_name = 'https://tiledb-lums.s3.amazonaws.com/sift_base'
# array_name = 'https://tiledb-lums.s3-us-west-2.amazonaws.com/sift_base'

block_size = 10000
const_dim = 128
#tile_size = block_size
tile_size = const_dim

num_vectors, dimension = get_data_info(filename)
create_array(array_name, num_vectors, dimension, tile_size)
write_array(array_name, num_vectors, dimension, block_size, tile_size)

The size of /Users/lums/TileDB/feature-vector-prototype/external/data/sift/sift_base.fvecs is 516000000 bytes.
num_vectors is 1000000, dimension is 128
Creating array s3://tiledb-lums/sift_base: 128 by 1000000
Array s3://tiledb-lums/sift_base already exists.  Deleting
After creation, array shape is (128, 1000000)
Opening array s3://tiledb-lums/sift_base for writing


---

### Some testing below here

---

In [20]:
import tiledb

filename = '/Users/lums/TileDB/feature-vector-prototype/external/data/sift/sift_base.fvecs'
array_name = 's3://tiledb-lums/sift_base'
# array_name = 'sift_base'

A = tiledb.open(array_name)
f = open(filename, 'rb')
dim = np.fromfile(f, dtype=np.int32, count=1)[0]

chunk = np.fromfile(f, dtype=np.float32, count=dim * 1000000)
B = chunk.reshape(1000000, dim)
B = np.transpose(B)

In [21]:
print(f'A is {type(A)} with shape {A.shape}')
print(f'B is {type(A)} with shape {B.shape}')

A is <class 'tiledb.libtiledb.DenseArrayImpl'> with shape (128, 1000000)
B is <class 'tiledb.libtiledb.DenseArrayImpl'> with shape (128, 1000000)


In [22]:
print(chunk[0:10])
print(A[0:10,0:5]['a'])
print(B[0:10,0:5])

A.close()
f.close()

[ 0. 16. 35.  5. 32. 31. 14. 10. 11. 78.]
[[0.0e+00 1.8e-43 3.3e+01 2.3e+01 2.7e+01]
 [1.6e+01 1.4e+01 1.8e-43 1.0e+01 2.9e+01]
 [3.5e+01 3.5e+01 0.0e+00 1.8e-43 2.1e+01]
 [5.0e+00 1.9e+01 1.0e+00 1.2e+01 1.8e-43]
 [3.2e+01 2.0e+01 5.0e+00 4.7e+01 1.0e+00]
 [3.1e+01 3.0e+00 3.0e+00 1.4e+01 1.0e+00]
 [1.4e+01 1.0e+00 4.4e+01 2.5e+01 0.0e+00]
 [1.0e+01 1.3e+01 4.0e+01 2.0e+00 0.0e+00]
 [1.1e+01 1.1e+01 2.0e+01 3.0e+00 1.4e+01]
 [7.8e+01 1.6e+01 1.4e+01 4.0e+00 1.6e+01]]
[[0.0e+00 1.8e-43 3.3e+01 2.3e+01 2.7e+01]
 [1.6e+01 1.4e+01 1.8e-43 1.0e+01 2.9e+01]
 [3.5e+01 3.5e+01 0.0e+00 1.8e-43 2.1e+01]
 [5.0e+00 1.9e+01 1.0e+00 1.2e+01 1.8e-43]
 [3.2e+01 2.0e+01 5.0e+00 4.7e+01 1.0e+00]
 [3.1e+01 3.0e+00 3.0e+00 1.4e+01 1.0e+00]
 [1.4e+01 1.0e+00 4.4e+01 2.5e+01 0.0e+00]
 [1.0e+01 1.3e+01 4.0e+01 2.0e+00 0.0e+00]
 [1.1e+01 1.1e+01 2.0e+01 3.0e+00 1.4e+01]
 [7.8e+01 1.6e+01 1.4e+01 4.0e+00 1.6e+01]]


---

#### Cruft below

---

In [None]:
# Name of the array to create.                                                                                                      
array_name = "writing_dense_multiple"

def create_array():
    # The array will be 4x4 with dimensions "rows" and "cols", with domain [1,4].                                                   
    dom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(0, 3), tile=2, dtype=np.int32),
        tiledb.Dim(name="cols", domain=(0, 4), tile=2, dtype=np.int32),
    )

    # The array will be dense with a single attribute "a" so each (i,j) cell can store an integer.                                  
    schema = tiledb.ArraySchema(
        domain=dom, sparse=False, attrs=[tiledb.Attr(name="a", dtype=np.int32)]
    )

    # Create the (empty) array on disk.                                                                                             
    tiledb.DenseArray.create(array_name, schema)


def write_array():
    # Open the array and write to it.                                                                                               
    with tiledb.DenseArray(array_name, mode="w") as A:
        # First write                                                                                                               
        data = np.array(([0, 22, 33], [2, 44, 77]))
        A[0:2, 0:3] = data

        # Second write                                                                                                              
        data = np.array(([5, 6, 7, 8], [9, 10, 11, 12]))
        A[1:3, 1:5] = data


def read_array():
    # Open the array and read from it.                                                                                              
    with tiledb.DenseArray(array_name, mode="r") as A:
        # Slice the entire array                                                                                                    
        data = A[:]
        print(data["a"])
    return data


if tiledb.object_type(array_name) != "array":
    create_array()
    write_array()

data = read_array()

print(f'data is {type(data["a"])} with shape {data["a"].shape}')

In [None]:


# Name of the array to create.                                                                                                      
array_name = "writing_dense_multiple"


def create_array():
    # The array will be 4x4 with dimensions "rows" and "cols", with domain [1,4].                                                   
    dom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(1, 4), tile=2, dtype=np.int32),
        tiledb.Dim(name="cols", domain=(1, 4), tile=2, dtype=np.int32),
    )

    # The array will be dense with a single attribute "a" so each (i,j) cell can store an integer.                                  
    schema = tiledb.ArraySchema(
        domain=dom, sparse=False, attrs=[tiledb.Attr(name="a", dtype=np.int32)]
    )

    # Create the (empty) array on disk.                                                                                             
    tiledb.DenseArray.create(array_name, schema)
    
            # First write                                                                                                               
        data = np.array(([1, 2], [3, 4]))
        A[1:3, 1:3] = data

        # Second write                                                                                                              
        data = np.array(([5, 6, 7, 8], [9, 10, 11, 12]))
        A[2:4, 1:5] = data


def write_array():
    # Open the array and write to it.                                                                                               
    with tiledb.DenseArray(array_name, mode="w") as A:
        # First write                                                                                                               
        data = np.array(([1, 2], [3, 4]))
        A[1:3, 1:3] = data

        # Second write                                                                                                              
        data = np.array(([5, 6, 7, 8], [9, 10, 11, 12]))
        A[2:4, 1:5] = data


def read_array():
    # Open the array and read from it.                                                                                              
    with tiledb.DenseArray(array_name, mode="r") as A:
        # Slice the entire array                                                                                                    
        data = A[:]
        print(data["a"])


if tiledb.object_type(array_name) != "array":
    create_array()
    write_array()

read_array()


In [None]:
filename = '/Users/lums/TileDB/feature-vector-prototype/external/data/sift/sift_base.fvecs'
# array_name = 'sift_base'
array_name = 's3://tiledb-lums/sift_base'
tile_size = 100000
block_size = 10000
create_fvecs_array(filename, array_name, tile_size, block_size)

In [None]:
def create_fvecs_array(filename, array_name, tile_size, block_size):
    with tiledb.from_numpy(array_name, np.zeros((0,))) as A:
        dim = None
        offset = 0
        for block in fvecs_generator(filename, block_size):
            if dim is None:
                dim = block.shape[1]
                A.schema.set_domain((0, None), (0, dim))
                A.schema.set_tile((tile_size, dim))
                A.schema.set_cell_order(tiledb.Layout.ROW_MAJOR)
                A.schema.set_sparse(False)
                A.schema.set_attrs(tiledb.Attr("features", dtype=np.float32, var=False))
            num_rows = block.shape[0]
            A[offset:offset+num_rows] = block
            offset += num_rows
    A.schema.set_capacity(offset)

In [None]:
filename = '/Users/lums/TileDB/feature-vector-prototype/external/data/sift/sift_base.fvecs'
block_size = 1000
dimension = 128
gen = fvecs_generator(filename, block_size, dimension)
a = next(gen)
print(f'a is {type(a)} with shape {a.shape}')

In [None]:
import numpy as np
import tiledb

def fvecs_generator(filename, block_size):
    f = open(filename, 'rb')
    dim = np.fromfile(f, dtype=np.int32, count=1)[0]
    chunk_size = block_size * dim
    while True:
        chunk = np.fromfile(f, dtype=np.float32, count=chunk_size)
        if chunk.size == 0:
            break
        yield chunk.reshape((-1, dim))
    f.close()

def create_fvecs_array(filename, array_name, tile_size, block_size):
    with tiledb.from_numpy(array_name, np.zeros((0,))) as A:
        dim = None
        offset = 0
        for block in fvecs_generator(filename, block_size):
            if dim is None:
                dim = block.shape[1]
                A.schema.set_domain((0, None), (0, dim))
                A.schema.set_tile((tile_size, dim))
                A.schema.set_cell_order(tiledb.Layout.ROW_MAJOR)
                A.schema.set_sparse(False)
                A.schema.set_attrs(tiledb.Attr("features", dtype=np.float32, var=False))
            num_rows = block.shape[0]
            A[offset:offset+num_rows] = block
            offset += num_rows
    A.schema.set_capacity(offset)

In [None]:
filename = 'external/data/sift/sift_base.fvecs'
array_name = 'sift_base'
tile_size = 100000
block_size = 10000
create_fvecs_array(filename, array_name, tile_size, block_size)