In [1]:
import syft as sy

In [30]:
from syft.core.adp.entity_list import DataSubjectList

In [2]:
!lscpu

Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   46 bits physical, 57 bits virtual
CPU(s):                          104
On-line CPU(s) list:             0-103
Thread(s) per core:              2
Core(s) per socket:              26
Socket(s):                       2
NUMA node(s):                    2
Vendor ID:                       GenuineIntel
CPU family:                      6
Model:                           106
Model name:                      Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
Stepping:                        6
CPU MHz:                         2800.000
CPU max MHz:                     2800.0000
CPU min MHz:                     800.0000
BogoMIPS:                        5586.87
Virtualization:                  VT-x
Hypervisor vendor:               Microsoft
Virtualization type:             full
L1d cache:                       2.4 MiB
L1i cache:           

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet("1M_rows_dataset_sample.parquet")

In [3]:
df.head()

Unnamed: 0,tweet_id,impressions,tweet_date_time,date,time,user_id,url,publication_title,ad_fontes_bias,ad_fontes_reliability,domain
0,1,9029,2021-03-21 06:29:10,2021-03-21,06:29:10,92256,https://www.aljazeera.com/news/2019/04/trump-v...,Al Jazeera,-0.75,48.0,.aljazeera.com
1,1,9029,2021-03-21 06:29:10,2021-03-21,06:29:10,418264,https://www.aljazeera.com/news/2019/04/trump-v...,Al Jazeera,-0.75,48.0,.aljazeera.com
2,1,9029,2021-03-21 06:29:10,2021-03-21,06:29:10,195053,https://www.aljazeera.com/news/2019/04/trump-v...,Al Jazeera,-0.75,48.0,.aljazeera.com
3,1,9029,2021-03-21 06:29:10,2021-03-21,06:29:10,590420,https://www.aljazeera.com/news/2019/04/trump-v...,Al Jazeera,-0.75,48.0,.aljazeera.com
4,1,9029,2021-03-21 06:29:10,2021-03-21,06:29:10,602475,https://www.aljazeera.com/news/2019/04/trump-v...,Al Jazeera,-0.75,48.0,.aljazeera.com


In [4]:
%%timeit
data_subject_array = df['user_id'].to_numpy()

2.41 µs ± 14 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [34]:
from syft.core.adp.entity import DataSubject

In [36]:
df.shape[0]/1e6

1.0

In [38]:
%%time
old_method_entities = [DataSubject(str(i)) for i in bil_row['user_id']]

CPU times: user 11min 11s, sys: 1min 56s, total: 13min 8s
Wall time: 13min 8s


### Ran the cell below for 4 hours and it still didn't complete- this shows the problems with the current implementation

In [41]:
%%time
old_entity_list = DataSubjectList.from_objs(old_method_entities)

KeyboardInterrupt: 

### Try with bigger DF

In [7]:
from time import time

t0 = time()
bil_row = pd.concat([df] * 1000)
tf = time() - t0

In [8]:
print(tf)

131.64065289497375


In [9]:
t0 = time()
billion_entities = bil_row['user_id'].to_numpy()
tf = time() - t0
print(tf)

0.0003299713134765625


In [10]:
billion_entities.shape[0]/1e9

1.0

This line of code will take a huge amount of time and is what we're trying to avoid

In [12]:
%%time
one_hot_lookup, entities_indexed = np.unique(billion_entities, return_inverse=True)

CPU times: user 3min 57s, sys: 7.52 s, total: 4min 5s
Wall time: 4min 5s


In [13]:
billion_entities

array([ 92256, 418264, 195053, ..., 394451, 457104, 654218])

In [14]:
one_hot_lookup

array([     1,      5,      9, ..., 699986, 699992, 699994])

In [15]:
len(one_hot_lookup)

173730

## Comparison
- np.unique: which returns a sorted np array of unique values

vs

- pandas.df.unique() followed by np.sort(): which together returns a sorted np array of unique values

In [20]:
%%time
unique_data_subjects = bil_row['user_id'].unique()

CPU times: user 6.15 s, sys: 3.5 ms, total: 6.15 s
Wall time: 6.15 s


In [21]:
unique_data_subjects

array([ 92256, 418264, 195053, ..., 580305, 341981, 167206])

In [22]:
len(unique_data_subjects)

173730

In [23]:
%%time
sorted_unique_data_subjects = np.sort(unique_data_subjects)

CPU times: user 11.3 ms, sys: 0 ns, total: 11.3 ms
Wall time: 10.6 ms


In [24]:
sorted_unique_data_subjects

array([     1,      5,      9, ..., 699986, 699992, 699994])

In [25]:
%%time
unique_entities = np.unique(billion_entities)

CPU times: user 1min 7s, sys: 1.48 s, total: 1min 8s
Wall time: 1min 8s


In [26]:
unique_entities

array([     1,      5,      9, ..., 699986, 699992, 699994])

In [28]:
def get_entities(input_dataframe: pd.DataFrame):
    
    # This will be the equivalent of the DataSubjectList.entities_indexed
    data_subjects = input_dataframe['user_id'].to_numpy()
    
    # This will be the equivalent of the DataSubjectList.one_hot_indexed- a sorted array of all unique entities
    unique_data_subjects = np.sort(input_dataframe['user_id'].unique())
    return unique_data_subjects, data_subjects

In [33]:
%%time
unique_data_subjects, data_subject_array = get_entities(bil_row)
entity_list = DataSubjectList(one_hot_lookup=unique_data_subjects, entities_indexed=data_subject_array)

CPU times: user 6.22 s, sys: 0 ns, total: 6.22 s
Wall time: 6.22 s


## Refactor DataSubjectList to accept integers

In [44]:
# future
from __future__ import annotations

# stdlib
from typing import Any
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

# third party
import numpy as np
import pandas as pd

# relative
from syft.core.common.serde.serializable import serializable
# from ..common.serde.serializable import serializable
from syft.core.adp.entity import DataSubject
# from .entity import DataSubject


# allow us to serialize and deserialize np.arrays with strings inside as two np.arrays
# one containing the uint8 bytes and the other the offsets between strings
def numpyutf8tolist(string_index: Tuple[np.ndarray, np.ndarray]) -> np.ndarray:
    string_array, index_array = string_index
    output_bytes: bytes = string_array.astype(np.uint8).tobytes()
    output_list = []
    last_offset = 0
    for offset in index_array:
        chars = output_bytes[last_offset:offset]
        final_string = chars.decode("utf-8")
        last_offset = offset
        output_list.append(final_string)
    return np.array(output_list)


def liststrtonumpyutf8(string_list: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    bytes_list = []
    indexes = []
    offset = 0
    for item in string_list:
        if not isinstance(item, (DataSubject, str)):
            raise Exception(
                f"DataSubjectList entities must be List[Union[str, DataSubject]]. {type(item)}"
            )
        name = item if isinstance(item, str) else item.name
        name_bytes = name.encode("utf-8")
        offset += len(name_bytes)
        indexes.append(offset)
        bytes_list.append(name_bytes)

    np_bytes = np.frombuffer(b"".join(bytes_list), dtype=np.uint8)
    np_indexes = np.array(indexes)
    return (np_bytes, np_indexes)


@serializable(recursive_serde=True)
class DataSubjectList:
    __attr_allowlist__ = ("one_hot_lookup", "entities_indexed")
    __slots__ = ("one_hot_lookup", "entities_indexed")

    # one_hot_lookup is a numpy array of unicode strings which can't be serialized
    __serde_overrides__ = {
        "one_hot_lookup": [liststrtonumpyutf8, numpyutf8tolist],
    }

    def __init__(
        self, one_hot_lookup: List[Union[DataSubject, str]], entities_indexed: np.ndaray
    ) -> None:
        self.one_hot_lookup = one_hot_lookup
        self.entities_indexed = entities_indexed
    
    @staticmethod
    def from_series(entities_dataframe_slice: pd.Series) -> DataSubjectList:
        """ Given a Pandas Series object (such as from 
        getting a column from a pandas DataFrame, return an DataSubjectList """
        
        # This will be the equivalent of the DataSubjectList.entities_indexed
        data_subjects = entities_dataframe_slice.to_numpy()
    
        # This will be the equivalent of the DataSubjectList.one_hot_indexed- a sorted array of all unique entities
        unique_data_subjects = np.sort(entities_dataframe_slice.unique())
        return DataSubjectList(one_hot_lookup=unique_data_subjects, entities_indexed=data_subjects)

    @staticmethod
    def from_objs(entities: Union[np.ndarray, list]) -> DataSubjectList:
        if isinstance(entities, list):
            entities = np.array(entities)
        one_hot_lookup, entities_indexed = np.unique(entities, return_inverse=True)

        return DataSubjectList(one_hot_lookup, entities_indexed)
    

    def __getitem__(self, key: Union[int, slice, str]) -> Union[DataSubject, str]:
        return self.one_hot_lookup[self.entities_indexed[key]]

    def copy(self, order: Optional[str] = "K") -> DataSubjectList:
        return DataSubjectList(
            self.one_hot_lookup.copy(), self.entities_indexed.copy(order=order)
        )

    def __len__(self) -> int:
        return len(self.entities_indexed)

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, DataSubjectList):
            if (self.entities_indexed == other.entities_indexed).all() and (  # type: ignore
                self.one_hot_lookup == other.one_hot_lookup
            ).all():
                return True
            return False
        return self == other

    def sum(self):
        # If sum is used without any arguments then the result is always a singular value
        return DataSubjectList(
            self.one_hot_lookup.copy(), self.entities_indexed.reshape(1, len(self.entities_indexed))
        )

In [46]:
%%time
new_entity_list = DataSubjectList.from_series(bil_row['user_id'])

CPU times: user 6.19 s, sys: 284 µs, total: 6.19 s
Wall time: 6.19 s
