In [1]:
import names

In [2]:
import syft as sy

In [3]:
import pyarrow as pa
import numpy as np
from typing import Optional

In [4]:
from syft.core.adp.entity import DataSubject

In [5]:
from syft.core.adp.vm_private_scalar_manager import VirtualMachinePrivateScalarManager

In [6]:
# Arrow type aliases used to describe the fields of a single-entity tensor record.
# NOTE(review): these are 64-bit, but make_sept_data() builds int32 arrays and the
# tables printed later in this notebook report list<item: int32> -- confirm which
# integer width is intended before enforcing these types.
np_array_type = pa.list_(pa.int64())
np_dtype = pa.int64()
entity_type = pa.string()

In [7]:
sept_field_tuples = [
    ('child', np_array_type),
    ('min_vals', np_array_type),
    ('max_vals', np_array_type),
    ('entity', entity_type)
]

In [8]:
SEPTA_struct = pa.struct(sept_field_tuples)

In [9]:
SEPTA_struct_list = pa.list_(SEPTA_struct)

In [10]:
SEPTA_struct_list

ListType(list<item: struct<child: list<item: int64>, min_vals: list<item: int64>, max_vals: list<item: int64>, entity: string>>)

In [11]:
sept_schema_field_tuples = [
    ('child', np_dtype),
    ('min_vals', np_dtype),
    ('max_vals', np_dtype),
    ('entity', entity_type)
]

In [12]:
# Schema built from the list-typed field tuples (one list column per array field).
# NOTE(review): `sept_schema_field_tuples` (scalar int64 fields) is defined in the
# previous cell but is not used anywhere in the visible notebook -- confirm whether
# it, rather than `sept_field_tuples`, was meant to be passed here.
SEPTA_schema = pa.schema(sept_field_tuples)

In [13]:
class SingleEntityPhiArrowTensor:
    """Plain container for one data subject's array and its bound arrays.

    The constructor only stores its arguments; no validation or conversion
    is performed.

    Attributes:
        child: the array passed as ``child``.
        _min_vals: the array passed as ``min_vals`` (presumably per-element
            lower bounds -- confirm against callers).
        _max_vals: the array passed as ``max_vals`` (presumably per-element
            upper bounds -- confirm against callers).
        entity: the ``DataSubject`` the data belongs to.
        scalar_manager: the supplied manager, or a freshly constructed
            ``VirtualMachinePrivateScalarManager`` when none was given.
    """

    # Number of entities in a SEPT is by definition 1
    n_entities = 1

    def __init__(
        self,
        child: np.ndarray,
        entity: DataSubject,
        min_vals: np.ndarray,
        max_vals: np.ndarray,
        scalar_manager: Optional[VirtualMachinePrivateScalarManager] = None,
    ) -> None:
        # A `pa_struct` attribute was sketched here by the author but never assigned.
        self.child = child
        self._min_vals = min_vals
        self._max_vals = max_vals
        self.entity = entity

        # Only build a default manager when the caller did not supply one.
        self.scalar_manager = (
            VirtualMachinePrivateScalarManager()
            if scalar_manager is None
            else scalar_manager
        )

In [14]:
def make_sept_data():
    """Generate random sample inputs for a single-entity tensor.

    Returns:
        A 4-tuple ``(child, max_values, min_values, entity_name)``:
        ``child`` is a 3x3 ``int32`` array drawn with ``np.random.randint``
        from ``[-50, 50)``; ``max_values`` and ``min_values`` have the same
        shape and dtype and are filled with 50 and -50 respectively;
        ``entity_name`` is whatever ``names.get_first_name()`` returns.
        Note the order: the max array comes before the min array.
    """
    bound = 50
    side = 3
    child = np.random.randint(-bound, bound, size=(side, side), dtype=np.int32)
    # full_like keeps child's shape and dtype, matching `ones_like(child) * k`.
    upper = np.full_like(child, bound)
    lower = np.full_like(child, -bound)
    entity_name = names.get_first_name()
    return child, upper, lower, entity_name

In [15]:
child, max_values, min_values, entity_name = make_sept_data()

In [16]:
def make_sept_dict(child, max_values, min_values, entity):
    """Pack the tensor pieces into a single record dict of flattened arrays.

    Args:
        child: array of values; stored flattened under ``'child'``.
        max_values: array stored flattened under ``'max_vals'``.
        min_values: array stored flattened under ``'min_vals'``.
        entity: stored unchanged under ``'entity'``.

    Returns:
        A dict with keys ``'child'``, ``'min_vals'``, ``'max_vals'``,
        ``'entity'`` (in that order). The array values are the results of
        ``.flatten()``, i.e. 1-D copies of the inputs.
    """
    record = {}
    record['child'] = child.flatten()
    record['min_vals'] = min_values.flatten()
    record['max_vals'] = max_values.flatten()
    record['entity'] = entity
    return record

In [17]:
def make_sept_dict_py(child, max_values, min_values, entity):
    """Pack the tensor pieces into a record dict whose array fields are lists.

    Each array is flattened and passed through ``list()``, so the values are
    Python lists whose elements are still numpy scalars (not Python ints).

    Args:
        child: array of values; stored under ``'child'``.
        max_values: array stored under ``'max_vals'``.
        min_values: array stored under ``'min_vals'``.
        entity: stored unchanged under ``'entity'``.

    Returns:
        A dict with keys ``'child'``, ``'min_vals'``, ``'max_vals'``,
        ``'entity'`` (in that order).
    """
    def _flat_list(arr):
        # list() over a 1-D ndarray yields numpy scalar elements.
        return list(arr.flatten())

    return {
        'child': _flat_list(child),
        'min_vals': _flat_list(min_values),
        'max_vals': _flat_list(max_values),
        'entity': entity,
    }

In [70]:
child

array([[  1,   1,   1],
       [-47, -36, -48],
       [  9,  -5,  43]], dtype=int32)

In [71]:
# b = child.reshape(-1,)

In [72]:
# a = child.flatten()

In [73]:
# a.base is child

In [74]:
# b.base is child

In [75]:
x = pa.Tensor.from_numpy(child)

In [76]:
x

<pyarrow.Tensor>
type: int32
shape: (3, 3)
strides: (12, 4)

In [77]:
z = x.to_numpy()

In [78]:
child[1] = 0

In [80]:
child

array([[ 1,  1,  1],
       [ 0,  0,  0],
       [ 9, -5, 43]], dtype=int32)

In [65]:
z

array([[  1,   1,   1],
       [-47, -36, -48],
       [  9,  -5,  43]], dtype=int32)

In [69]:
dir(x)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'dim_name',
 'dim_names',
 'equals',
 'from_numpy',
 'is_contiguous',
 'is_mutable',
 'ndim',
 'shape',
 'size',
 'strides',
 'to_numpy',
 'type']

In [19]:
# NOTE(review): `shape_arr` was displayed here without being defined anywhere in
# the visible notebook (NameError on a fresh kernel). Reconstructed from the
# recorded output (an Int64Array holding child's dimensions) -- confirm this
# matches the cell that was lost.
shape_arr = pa.array(child.shape)
shape_arr

<pyarrow.lib.Int64Array object at 0x151c0e400>
[
  3,
  3
]

In [20]:
child_arr = pa.array(child.flatten())

In [21]:
child_arr

<pyarrow.lib.Int32Array object at 0x14f8217c0>
[
  -34,
  -33,
  -15,
  21,
  -9,
  -30,
  -38,
  -38,
  -1
]

In [126]:
max_values[0] = 1

In [127]:
max_values

array([[ 1,  1,  1],
       [50, 50, 50],
       [50, 50, 50]], dtype=int32)

In [123]:
max_values_arr = pa.array(max_values.reshape(-1,))

In [128]:
max_values_arr

<pyarrow.lib.Int32Array object at 0x151c8ebe0>
[
  1,
  1,
  1,
  50,
  50,
  50,
  50,
  50,
  50
]

In [22]:
max_values_arr = pa.array(max_values.flatten())

In [23]:
min_values_arr = pa.array(min_values.flatten())

In [24]:
entity = DataSubject("Hawkeye")

In [98]:
# NOTE(review): by cell order `x` is the pa.Tensor built from `child`, yet the
# recorded value of `entity_np` is array(['a', 'b', 'c']). `x` was presumably
# rebound to a list of strings in a cell that is no longer in the notebook --
# confirm and define that input explicitly so this cell re-runs from a fresh kernel.
entity_np = np.array(x, dtype=str)

In [101]:
entity_np

array(['a', 'b', 'c'], dtype='<U1')

In [100]:
entity_arr = pa.Tensor.from_numpy(entity_np)

ArrowNotImplementedError: Unsupported numpy type 19

In [104]:
entity_arr = pa.array(entity_np)

In [105]:
entity_arr

<pyarrow.lib.StringArray object at 0x151a2ad00>
[
  "a",
  "b",
  "c"
]

In [28]:
SEPTA = SingleEntityPhiArrowTensor

In [29]:
first = SEPTA(child, entity, min_values, max_values)

In [30]:
first

<__main__.SingleEntityPhiArrowTensor at 0x14f854760>

In [31]:
sept_data = {'child': child.flatten(), 'min_vals':min_values.flatten(), 'max_vals':max_values.flatten(), 'entity':entity.name}

In [32]:
sept_arrow_data = {
    'child': child_arr,
    'min_vals':min_values_arr,
    'max_vals':max_values_arr,
    'entity':entity_arr
}

In [33]:
sept1 = pa.array([sept_data])

In [34]:
sept2 = pa.array([sept_data])

In [126]:
# sept1

In [127]:
# sept2

{'child': array([-34, -33, -15,  21,  -9, -30, -38, -38,  -1], dtype=int32),
 'min_vals': array([-50, -50, -50, -50, -50, -50, -50, -50, -50], dtype=int32),
 'max_vals': array([50, 50, 50, 50, 50, 50, 50, 50, 50], dtype=int32),
 'entity': 'Hawkeye'}

In [61]:
sept_data_1 = make_sept_data()

In [62]:
sept_data_2 = make_sept_data()

In [68]:
# rb = pa.RecordBatch.from_pylist([make_sept_dict(*sept_data_1)])

In [132]:
sept_1 = make_sept_dict(*sept_data_1)
sept_2 = make_sept_dict(*sept_data_2)

In [None]:
len(rb)

In [108]:
def ser(batch):
    """Serialize ``batch`` into an in-memory Arrow IPC stream.

    Args:
        batch: an object exposing ``.schema`` that the stream writer's
            ``write_batch`` accepts (a ``pa.RecordBatch`` in this notebook).
            Objects without a ``schema`` attribute, such as a plain
            ``pa.StringArray``, raise ``AttributeError``.

    Returns:
        The value of ``getvalue()`` on the ``pa.BufferOutputStream`` the
        stream was written to.
    """
    batch_schema = batch.schema
    out_stream = pa.BufferOutputStream()
    # The writer is closed on leaving the `with`, before the buffer is read.
    with pa.ipc.new_stream(out_stream, batch_schema) as stream_writer:
        stream_writer.write_batch(batch)
    serialized = out_stream.getvalue()
    return serialized

In [109]:
def de(buf):
    """Deserialize an Arrow IPC stream into a list of its record batches.

    Args:
        buf: the serialized stream, in any form ``pa.ipc.open_stream``
            accepts (e.g. the buffer returned by ``ser``).

    Returns:
        A list containing every batch yielded by the stream reader, in order;
        an empty list when the stream holds no batches.
    """
    with pa.ipc.open_stream(buf) as reader:
        # Materialize inside the `with` so all batches are read before the
        # reader is closed.
        return list(reader)

In [None]:
# from_pylist needs the list of row dicts; calling it with no argument raises
# TypeError. The SEPT dicts built above are used here, matching the schema
# (child / min_vals / max_vals / entity) shown for the round-tripped batch.
# NOTE(review): confirm these are the rows that were intended.
rb = pa.RecordBatch.from_pylist([sept_1, sept_2])

In [117]:
entities = [{"name":"a"}, {"name":"b"}, {"name":"c"}]
rb = pa.RecordBatch.from_pylist(entities)

In [122]:
rb

pyarrow.RecordBatch
name: string

In [110]:
entity_arr = pa.array(entity_np)

In [112]:
ser(entity_arr)

AttributeError: 'pyarrow.lib.StringArray' object has no attribute 'schema'

In [None]:
import timeit

In [None]:
%%time
data = ser(rb)

In [None]:
%%time
rb2 = de(data)

In [106]:
rb2

[pyarrow.RecordBatch
 child: list<item: int32>
   child 0, item: int32
 min_vals: list<item: int32>
   child 0, item: int32
 max_vals: list<item: int32>
   child 0, item: int32
 entity: string]

In [118]:
rb2_table = pa.Table.from_pylist([sept_1, sept_1, sept_2])

In [119]:
rb2_table

pyarrow.Table
child: list<item: int32>
  child 0, item: int32
min_vals: list<item: int32>
  child 0, item: int32
max_vals: list<item: int32>
  child 0, item: int32
entity: string
----
child: [[[35,-34,30,34,38,36,-18,-46,26],[35,-34,30,34,38,36,-18,-46,26],[22,-26,28,-6,40,9,46,39,-11]]]
min_vals: [[[-50,-50,-50,-50,-50,-50,-50,-50,-50],[-50,-50,-50,-50,-50,-50,-50,-50,-50],[-50,-50,-50,-50,-50,-50,-50,-50,-50]]]
max_vals: [[[50,50,50,50,50,50,50,50,50],[50,50,50,50,50,50,50,50,50],[50,50,50,50,50,50,50,50,50]]]
entity: [["James","James","Robert"]]

In [139]:
len(rb2_table[1])

3

In [108]:
rb2_table = pa.Table.from_batches(rb2)

In [147]:
import pyarrow.compute as pc

In [154]:
len(rb2_table[0])

3

In [158]:
rb2_table[0].flatten()

[<pyarrow.lib.ChunkedArray object at 0x14fae0590>
 [
   [
     [
       35,
       -34,
       30,
       34,
       38,
       36,
       -18,
       -46,
       26
     ],
     [
       35,
       -34,
       30,
       34,
       38,
       36,
       -18,
       -46,
       26
     ],
     [
       22,
       -26,
       28,
       -6,
       40,
       9,
       46,
       39,
       -11
     ]
   ]
 ]]

In [None]:
(
    array[
        list<item: int32>
    ],
    array[uint32]
)

In [121]:
a = rb2_table.group_by("entity").aggregate([("child", "sum")])

ArrowNotImplementedError: Function 'hash_sum' has no kernel matching input types (array[list<item: int32>], array[uint32])

In [163]:
a = rb2_table.group_by("entity").aggregate([("entity", "unique")])

ArrowKeyError: No function registered with name: hash_unique

In [113]:
dir(a)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_table',
 'aggregate',
 'keys']

In [122]:
nested_arr = pa.array([[], None, [1, 2], [None, 1]])

In [123]:
nested_arr

<pyarrow.lib.ListArray object at 0x14fa7ba00>
[
  [],
  null,
  [
    1,
    2
  ],
  [
    null,
    1
  ]
]

In [124]:
nested_arr.type

ListType(list<item: int64>)

In [125]:
nested_arr + nested_arr

TypeError: unsupported operand type(s) for +: 'pyarrow.lib.ListArray' and 'pyarrow.lib.ListArray'

In [171]:
# t = pa.table([
#       pa.array(["a", "a", "b", "b", "c"]),
#       pa.array([1, 2, 3, 4, 5]),
# ], names=["keys", "values"])

In [174]:
t = pa.table([
      pa.array(["a", "a", "b", "b", "c"]),
      pa.array([[1], [2], [3], [4], [5]]),
], names=["keys", "values"])

In [175]:
t.group_by("keys").aggregate([("values", "sum")])

ArrowNotImplementedError: Function 'hash_sum' has no kernel matching input types (array[list<item: int64>], array[uint32])

In [176]:
t.group_by("keys")

<pyarrow.lib.TableGroupBy at 0x14fa7f4c0>