In [1]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler
import numpy as np
import pandas as pd
import torch
import gc
import syft as sy
from syft.core.adp.entity import Entity
from pympler.asizeof import asizeof #pip install pympler
from syft import serialize
from syft import deserialize
import timeit
from syft.core.tensor.autodp.dp_tensor_converter import convert_to_gamma_tensor

from functools import reduce
import time
import pyarrow as pa

In [17]:
# Load the benchmark sample from a Parquet file in the working directory.
df = pd.read_parquet("10M_rows_dataset_sample.parquet")
print("Number of Rows: ",df.shape[0])
# NOTE(review): the head() result below is discarded -- only the last expression of
# a cell is displayed -- and the final print repeats the row count shown above.
df.head()
print(df.shape[0])

Number of Rows:  10000000
10000000


In [3]:
# Log in to the domain node.
# The e-mail, password and port fall back to the stock local-development values,
# but can be overridden through environment variables so that real credentials
# never have to be typed into (and saved with) the notebook.
import os

domain_node = sy.login(
    email=os.environ.get("SYFT_EMAIL", "info@openmined.org"),
    password=os.environ.get("SYFT_PASSWORD", "changethis"),
    port=int(os.environ.get("SYFT_PORT", "8081")),
)


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to None... done! 	 Logging into canada... done!


In [18]:
%%time
name = "Tweets- 100000 rows dataset "
impressions = ((np.array(list(df['impressions'])))).astype(np.int32)
publication_title = ((list(df['publication_title'])))

entities = list()
for i in range(len(publication_title)):
    entities.append(Entity(name=publication_title[i]))

tweets_data = sy.Tensor(impressions).private(min_val=0, max_val=30, entities = entities)

In [19]:
# Line-by-line profile of simple_assets_for_serde() on the entity attached to the
# first element of tweets_data.child.child (uses the line_profiler extension).
%lprun -f tweets_data.child.child[0].entity.simple_assets_for_serde tweets_data.child.child[0].entity.simple_assets_for_serde()

Timer unit: 1e-06 s

Total time: 5e-06 s
File: /home/azureuser/PySyft/packages/syft/src/syft/core/adp/entity.py
Function: simple_assets_for_serde at line 111

Line #      Hits         Time  Per Hit   % Time  Line Contents
   111                                               def simple_assets_for_serde(self) -> list:
   112                                                   # assets = list()
   113                                                   # assets.append(self.name)
   114                                                   # bytes_value = self.id.get_bytes
   115                                                   # assets.append(bytes_value)  # type: ignore
   116                                                   # return assets
   117         1          5.0      5.0    100.0          return [self.name, self.id.get_bytes]

In [None]:
# Line-by-line profile of arrow_serialize() on the tensor's child object
# (uses the line_profiler extension).
%lprun -f tweets_data.child.arrow_serialize tweets_data.child.arrow_serialize()

In [None]:
# Time a single arrow_serialize() call.
# timeit.default_timer is a monotonic, high-resolution clock (unlike time.time,
# which can jump when the system clock is adjusted) and is the timer the other
# timing cells in this notebook already use.
start = timeit.default_timer()
tweets_data.child.arrow_serialize()
end = timeit.default_timer()
print(end - start)

In [None]:
# Display the prime2symbol attribute of the first element's scalar_manager.
tweets_data.child.child[0].scalar_manager.prime2symbol

































































In [None]:
%%time
# Baseline: a single pa.serialize call on one array of 1,000,000 random floats.
# NOTE(review): pa.serialize is deprecated in recent pyarrow releases -- confirm
# that the installed version still provides it.
pa.serialize(np.random.rand(1_000_000))

In [None]:
%%time
# Same number of values as the single-array cell, but serialized as 1,000,000
# separate one-element arrays -- presumably to expose the per-call overhead.
for i in range(1_000_000):
    pa.serialize(np.random.rand(1))

In [None]:
%%time
# Time the serialization of only the first element of the tensor to bytes.
serialize(tweets_data.child.child[0],to_bytes=True)

In [None]:
%%time
# Time a sum over the locally held tensor (before it is uploaded to the node).
result = tweets_data.sum()

In [None]:
# Set serde_concurrency on the tensor's child to 1.
# NOTE(review): presumably limits (de)serialization to a single worker -- confirm.
tweets_data.child.serde_concurrency= 1

In [None]:
# Measuring the size of a Python object is not straightforward, because a
# user-defined class may hold references to many other objects.
# We use the pympler library (asizeof) to measure the size of the object.
# pympler does not report the exact size on disk, but it gives a closer estimate
# than sys.getsizeof(), which only accounts for the object itself.

print("Size of Twitter Tensor Data(MB) ====> " , asizeof(tweets_data)/(1024*1024)) 

In [None]:
# Serialize the whole Twitter tensor to bytes and report how long it took and
# how large the resulting payload is.
start = timeit.default_timer()
serialized_data = serialize(tweets_data, to_bytes=True)
end = timeit.default_timer()

elapsed_seconds = end - start
serialized_size = len(serialized_data) / (1024 * 1024)  # bytes -> MiB
print("Serialization Time =====>", elapsed_seconds, " seconds")
print("Twitter Serialized Data Size ", serialized_size)


In [None]:
# Line-by-line profile of _object2proto() on the tensor's child object
# (uses the line_profiler extension).
%lprun -f tweets_data.child._object2proto tweets_data.child._object2proto()

In [None]:
# Deserialize the bytes produced by the serialization cell and report how long
# it took and the in-memory size of the reconstructed object.
start = timeit.default_timer()
deserialized_data = deserialize(serialized_data, from_bytes=True)
end = timeit.default_timer()

elapsed_seconds = end - start
deserialized_size = asizeof(deserialized_data) / (1024 * 1024)  # bytes -> MiB
print("Deserialization Time =====>", elapsed_seconds, " seconds")
print("Twitter Deserialized Data Size ", deserialized_size)


In [None]:
%%time
#Uploading Dataset....
domain_node.load_dataset(
assets={"tweets": tweets_data},
name=name,
description="""Tweets- 1M rows """
)

#The main bottleneck is the serialization as uploading dataset is a blocking call(immediate_msg_with_reply), 
#bottleneck = serialization at client + DeSerialization at node


In [None]:
# Display the datasets currently available on the domain node.
domain_node.datasets

In [None]:
# Take the "tweets" asset of the first dataset listed on the node.
# NOTE(review): index 0 assumes the dataset of interest is listed first -- confirm.
dataset = domain_node.datasets[0]["tweets"]

In [None]:
%%time
# Sum the "tweets" asset through the handle obtained from the domain node.
res = dataset.sum()
# NOTE(review): presumably waits until the result is available -- confirm the
# semantics of .block.
res.block

In [None]:
# Display the `exists` attribute of the sum result.
res.exists

In [None]:
# Publish the sum with sigma=0.1.
# NOTE(review): `res` is rebound to the published object, so re-running this cell
# would call publish() on the already-published result rather than on the sum.
res = res.publish(sigma=0.1)

In [None]:
# Retrieve and display the published result.
res.get()

In [None]:
# Short alias for the tensor's underlying sequence of per-row elements; the
# grouping cells below iterate over and index into it.
l = tweets_data.child.child

In [None]:
# Print three sample elements picked by hard-coded position.
# NOTE(review): the indices 50611 and 83931 are unexplained -- confirm why these
# rows were chosen. `c` is also reused as a counter by the grouping cell.
a = l[0]
b= l[50611]
c = l[83931]
print(a)
print(b)
print(c)

In [None]:
# Group the elements of `l` by entity, keeping first-seen order.
#   d         : entity -> position of that entity's group inside split_lst
#   split_lst : one list of elements per distinct entity
#   c         : number of distinct entities found
split_lst = []
d = {}
for i in l:
    # setdefault registers a new entity at the next free position, or returns
    # the position that entity was given earlier.
    group_index = d.setdefault(i.entity, len(split_lst))
    if group_index == len(split_lst):
        split_lst.append([])
    split_lst[group_index].append(i)
c = len(split_lst)
        

In [None]:
# Take the first four per-entity groups.
# Raises IndexError if fewer than four distinct entities were found.
first= split_lst[0]
second = split_lst[1]
third = split_lst[2]
fourth = split_lst[3]


In [None]:
def list_sum(lst):
    """Add up the items of ``lst`` from left to right with ``+``.

    Args:
        lst: A non-empty, indexable sequence whose items support ``+``.

    Returns:
        ``lst[0] + lst[1] + ... + lst[-1]``; for a single-item sequence, that
        item itself.

    Raises:
        IndexError: If ``lst`` is empty.
    """
    s = lst[0]
    for i in range(1, len(lst)):
        # Use `s = s + x`, not `s += x`: the augmented form would modify lst[0]
        # in place when the items are mutable (lists, NumPy arrays, ...).
        s = s + lst[i]
    return s

In [None]:
# Fold the first entity group into one value by repeated addition, then show it.
s = first[0]
for position in range(1, len(first)):
    s = s + first[position]
print("final: ", s)
    

In [None]:
def lst_sum(a, b):
    """Return ``a + b``; the binary step handed to ``functools.reduce``."""
    combined = a + b
    return combined

In [None]:
# Collapse each of the four entity groups into a single value with lst_sum.
t1, t2, t3, t4 = (
    reduce(lst_sum, group) for group in (first, second, third, fourth)
)

In [None]:
%%time
# Time adding the per-group totals pairwise (values reduced from different groups).
r1 = t1+t2
r2 = t3+t4

In [None]:
# Display the dtype of the second element of the tensor.
tweets_data.child.child[1].dtype