In [1]:
import names

In [2]:
import syft as sy

In [3]:
import pyarrow as pa
import numpy as np
from typing import Optional

In [4]:
from syft.core.adp.entity import DataSubject

In [5]:
from syft.core.adp.vm_private_scalar_manager import VirtualMachinePrivateScalarManager

In [6]:
# Arrow types used to describe the tensor fields: a 64-bit integer scalar,
# a variable-length list of those integers, and a string for the entity.
np_dtype = pa.int64()
np_array_type = pa.list_(np_dtype)
entity_type = pa.string()

In [7]:
# (name, type) pairs for the struct: three list-typed columns followed by
# the string-typed entity column.
sept_field_tuples = [
    (field_name, np_array_type)
    for field_name in ('child', 'min_vals', 'max_vals')
] + [('entity', entity_type)]

In [8]:
SEPTA_struct = pa.struct(sept_field_tuples)

In [9]:
SEPTA_struct_list = pa.list_(SEPTA_struct)

In [10]:
SEPTA_struct_list

ListType(list<item: struct<child: list<item: int64>, min_vals: list<item: int64>, max_vals: list<item: int64>, entity: string>>)

In [11]:
sept_schema_field_tuples = [
    ('child', np_dtype),
    ('min_vals', np_dtype),
    ('max_vals', np_dtype),
    ('entity', entity_type)
]

In [12]:
# NOTE(review): this schema is built from `sept_field_tuples` (list-typed
# columns). `sept_schema_field_tuples`, defined in the previous cell, is not
# used anywhere in the visible notebook — confirm which one was intended.
SEPTA_schema = pa.schema(sept_field_tuples)

In [13]:
class SingleEntityPhiArrowTensor:
    """Holds a value array, its `min_vals`/`max_vals` arrays and one entity.

    The constructor only stores its arguments; no validation or conversion
    is performed.
    """

    # A SEPT contains exactly one entity by definition.
    n_entities = 1

    def __init__(
        self,
        child: np.ndarray,
        entity: DataSubject,
        min_vals: np.ndarray,
        max_vals: np.ndarray,
        scalar_manager: Optional[VirtualMachinePrivateScalarManager] = None,
    ) -> None:
        """Store the supplied arrays, entity and scalar manager.

        Args:
            child: The wrapped data array.
            entity: The data subject the array belongs to.
            min_vals: Stored on the private ``_min_vals`` attribute.
            max_vals: Stored on the private ``_max_vals`` attribute.
            scalar_manager: Manager to attach; when ``None`` a fresh
                ``VirtualMachinePrivateScalarManager`` is created.
        """
        self.entity = entity
        self.child = child
        self._min_vals = min_vals
        self._max_vals = max_vals

        # Only an explicit None triggers the default; any other value is
        # stored as given.
        self.scalar_manager = (
            VirtualMachinePrivateScalarManager()
            if scalar_manager is None
            else scalar_manager
        )

In [14]:
def make_sept_data(highest: int = 50, dims: int = 3):
    """Generate random sample inputs for building a single-entity tensor.

    Args:
        highest: Magnitude of the value range. Values are drawn from the
            half-open interval ``[-highest, highest)``.
        dims: Side length of the square ``(dims, dims)`` array.

    Returns:
        A tuple ``(child, max_values, min_values, entity_name)``. Note the
        order: the max array comes *before* the min array. ``child`` is an
        int32 array; the two bound arrays have the same shape and are filled
        with ``highest`` and ``-highest``; ``entity_name`` is whatever
        ``names.get_first_name()`` returns.
    """
    child = np.random.randint(
        low=-highest, high=highest, size=(dims, dims), dtype=np.int32
    )
    max_values = np.ones_like(child) * highest
    min_values = np.ones_like(child) * -highest
    return child, max_values, min_values, names.get_first_name()

In [15]:
child, max_values, min_values, entity_name = make_sept_data()

In [16]:
def make_sept_dict(child, max_values, min_values, entity):
    """Build one row dict whose array fields are flattened 1-D copies.

    Args:
        child: Array stored (flattened) under ``'child'``.
        max_values: Array stored (flattened) under ``'max_vals'``.
        min_values: Array stored (flattened) under ``'min_vals'``.
        entity: Value stored unchanged under ``'entity'``.

    Returns:
        Dict with keys ``'child'``, ``'min_vals'``, ``'max_vals'``, ``'entity'``.
    """
    flat_child = child.flatten()
    flat_min = min_values.flatten()
    flat_max = max_values.flatten()
    return {
        'child': flat_child,
        'min_vals': flat_min,
        'max_vals': flat_max,
        'entity': entity,
    }

In [17]:
def make_sept_dict_py(child, max_values, min_values, entity):
    """Build one row dict whose array fields are plain Python lists.

    ``ndarray.tolist()`` is used rather than ``list(ndarray)`` so that the
    list elements are native Python ints, not NumPy scalar objects.

    Args:
        child: Array stored (flattened) under ``'child'``.
        max_values: Array stored (flattened) under ``'max_vals'``.
        min_values: Array stored (flattened) under ``'min_vals'``.
        entity: Value stored unchanged under ``'entity'``.

    Returns:
        Dict with keys ``'child'``, ``'min_vals'``, ``'max_vals'``, ``'entity'``.
    """
    return {
        'child': child.flatten().tolist(),
        'min_vals': min_values.flatten().tolist(),
        'max_vals': max_values.flatten().tolist(),
        'entity': entity,
    }

In [18]:
shape_arr = pa.array(child.shape)

In [19]:
shape_arr

<pyarrow.lib.Int64Array object at 0x155414be0>
[
  3,
  3
]

In [20]:
child_arr = pa.array(child.flatten())

In [21]:
child_arr

<pyarrow.lib.Int32Array object at 0x15542a160>
[
  8,
  -28,
  40,
  -13,
  -18,
  -44,
  -16,
  49,
  -21
]

In [22]:
max_values_arr = pa.array(max_values.flatten())

In [23]:
min_values_arr = pa.array(min_values.flatten())

In [24]:
entity = DataSubject("Hawkeye")

In [25]:
entity_np = np.array(entity.name, dtype=str)

In [26]:
entity_arr = pa.array(entity_np.flatten())

In [27]:
entity_arr

<pyarrow.lib.StringArray object at 0x15528a820>
[
  "Hawkeye"
]

In [28]:
SEPTA = SingleEntityPhiArrowTensor

In [29]:
first = SEPTA(child, entity, min_values, max_values)

In [30]:
first

<__main__.SingleEntityPhiArrowTensor at 0x15528b8e0>

In [31]:
# Build the row through the shared helper instead of repeating its dict
# literal, so the field layout is defined in one place.
sept_data = make_sept_dict(child, max_values, min_values, entity.name)

In [32]:
sept_arrow_data = {
    'child': child_arr,
    'min_vals':min_values_arr,
    'max_vals':max_values_arr,
    'entity':entity_arr
}

In [33]:
sept1 = pa.array([sept_data])

In [34]:
sept2 = pa.array([sept_data])

In [35]:
# sept1

In [36]:
# sept2

In [37]:
sept_data_1 = make_sept_data()

In [38]:
sept_data_2 = make_sept_data()

In [39]:
# rb = pa.RecordBatch.from_pylist([make_sept_dict(*sept_data_1)])

In [40]:
sept_1 = make_sept_dict(*sept_data_1)
sept_2 = make_sept_dict(*sept_data_2)

In [41]:
apache_arrow = pa.Tensor.from_numpy(obj=child)

In [42]:
# Write the Arrow tensor created in the previous cell to an in-memory
# output stream and keep the result as plain Python bytes.
sink = pa.BufferOutputStream()

pa.ipc.write_tensor(apache_arrow, sink)
buffer = sink.getvalue()
# Compressed variant, currently disabled:
# numpy_bytes = pa.compress(buffer, asbytes=True, codec="zstd")
numpy_bytes = buffer.to_pybytes()

In [44]:
entities = [entity, entity, entity]

In [52]:
entity_np = [entity.name for entity in entities]

In [55]:
rept = {"entities": entity_np, "tensor_bytes": numpy_bytes}

In [56]:
rb = pa.RecordBatch.from_pylist([rept])

In [57]:
rb

pyarrow.RecordBatch
entities: list<item: string>
  child 0, item: string
tensor_bytes: binary

In [None]:
# Row count for the serialization timing below. A Python list with a billion
# entries does not fit in memory on a typical machine, so keep this at a size
# that lets the notebook run end to end; raise it deliberately if needed.
N_BENCHMARK_ROWS = 1_000_000
rb = pa.RecordBatch.from_pylist([sept_1] * N_BENCHMARK_ROWS)

In [None]:
len(rb)

In [None]:
def ser(batch):
    """Write ``batch`` to an in-memory Arrow IPC stream.

    Args:
        batch: Object with a ``schema`` attribute, accepted by the stream
            writer's ``write_batch``.

    Returns:
        The value returned by the output stream's ``getvalue()`` after the
        writer has been closed.
    """
    out_stream = pa.BufferOutputStream()
    with pa.ipc.new_stream(out_stream, batch.schema) as stream_writer:
        stream_writer.write_batch(batch)
    # The writer is closed on leaving the `with` block, before the buffer
    # is read.
    return out_stream.getvalue()

In [None]:
def de(buf):
    """Read every record batch from an Arrow IPC stream.

    Args:
        buf: Source passed to ``pa.ipc.open_stream``.

    Returns:
        A list with the batches yielded by the stream reader, in order.
    """
    with pa.ipc.open_stream(buf) as reader:
        # Materialize inside the `with` block, while the reader is still open.
        return list(reader)

In [None]:
import timeit

In [None]:
%%time
data = ser(rb)

In [None]:
%%time
rb2 = de(data)

In [None]:
rb2

In [None]:
rb2_table = pa.Table.from_pylist([sept_1, sept_1, sept_2])

In [None]:
rb2_table

In [None]:
len(rb2_table[1])

In [None]:
rb2_table = pa.Table.from_batches(rb2)

In [None]:
import pyarrow.compute as pc

In [None]:
len(rb2_table[0])

In [None]:
rb2_table[0].flatten()

In [None]:
# NOTE(review): the text below is not valid Python and made this cell raise a
# SyntaxError. It looks like an argument-type signature pasted from a pyarrow
# message (presumably a "no kernel matching input types" error from
# aggregating a list-typed column — confirm). Kept here as a comment only.
# (
#     array[
#         list<item: int32>
#     ],
#     array[uint32]
# )

In [None]:
a = rb2_table.group_by("entity").aggregate([("child", "sum")])

In [None]:
a = rb2_table.group_by("entity").aggregate([("entity", "unique")])

In [None]:
dir(a)

In [None]:
nested_arr = pa.array([[], None, [1, 2], [None, 1]])

In [None]:
nested_arr

In [None]:
nested_arr.type

In [None]:
# NOTE(review): pyarrow arrays do not appear to implement the `+` operator, so
# this line presumably raises TypeError — confirm, and replace it with the
# intended operation (e.g. a pyarrow.compute function) or remove the cell.
nested_arr + nested_arr

In [None]:
# t = pa.table([
#       pa.array(["a", "a", "b", "b", "c"]),
#       pa.array([1, 2, 3, 4, 5]),
# ], names=["keys", "values"])

In [None]:
t = pa.table([
      pa.array(["a", "a", "b", "b", "c"]),
      pa.array([[1], [2], [3], [4], [5]]),
], names=["keys", "values"])

In [None]:
t.group_by("keys").aggregate([("values", "sum")])

In [None]:
t.group_by("keys")