In [9]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import syft as sy
import numpy as np
import pandas as pd

In [5]:
domain = sy.login(
    email="info@openmined.org",
    password="changethis",
    port=8081,

)

Connecting to http://localhost:8081... done! 	 Logging into mednode... done!


In [7]:
# Let's create an empty dataset

domain.create_dataset(
    name="Empty Dataset",
    description="Test delete",
)

Creating an empty dataset... Creating... SUCCESS!

In [8]:
# We can see that there are not existing assets attached to the dataset
domain.datasets

Idx,Name,Description,Assets,Id
[0],Empty Dataset,Test delete,,7fafab3b-4797-419b-9bae-a72888c96992


In [9]:
# Let's grab the dataset pointer
dataset_pointer = domain.datasets[0]

In [10]:
dataset_pointer

Dataset: Empty Dataset
Description: Test delete



Asset Key,Type,Shape


In [11]:
# Let's add a random asset to the dataset

dataset_pointer.add(name="random-asset", value=np.random.rand(100, 8))




This means you'll need to manually approve any requests which leverage this data. If this is ok with you, proceed. If you'd like to use automatic differential privacy budgeting, please pass in a DP-compatible tensor type such as by calling annotate_with_dp_metadata() on a sy.Tensor with a np.int32 or np.float32 inside.
Are you sure you want to proceed? (y/n)y
Loading dataset... uploading... 
SUCCESS!

In [12]:
# Let's check the dataset pointer
dataset_pointer

Dataset: Empty Dataset
Description: Test delete



Asset Key,Type,Shape
"[""random-asset""]",ndarray,"(100, 8)"


In [13]:
# Great !!! we have an asset attached to the dataset
# Let's try an asset with the same name again

dataset_pointer.add(name="random-asset", value=np.random.rand(50, 4))




This means you'll need to manually approve any requests which leverage this data. If this is ok with you, proceed. If you'd like to use automatic differential privacy budgeting, please pass in a DP-compatible tensor type such as by calling annotate_with_dp_metadata() on a sy.Tensor with a np.int32 or np.float32 inside.
Are you sure you want to proceed? (y/n)y


KeyError: 'Asset with name: `random-asset` already exists. Please use a different name.'

In [14]:
# Great!!! So, we cannot add asset with the same name.
# Let's try and delete the asset

dataset_pointer.delete(name="random-asset")

You are about to permanantely delete the asset `random-asset` ? 🚨 
Please enter y/n to proceed: y


True

In [15]:
# The dataset is deleted. 
# let's check the dataset_pointer again

dataset_pointer

Dataset: Empty Dataset
Description: Test delete



Asset Key,Type,Shape


In [16]:
# We don't have any assets attached, which is what is expected.

# If we delete an asset that does not exists, we will get an error
dataset_pointer.delete("key-not-exists")

KeyError: 'The asset with name `key-not-exists` does not exists.'

In [17]:
# Great, now that we can add/delete assets, we can upload the dataset
# in batches, let's try to do that.

Let's load some data... into memory

In [19]:
# Load the first 100 rows
ca_data = pd.read_csv("../../trade_demo/datasets/ca - feb 2021.csv")[:100]

In [20]:
# We will transfer data in batches of 10

In [21]:
start, end = 0, len(ca_data)
batch_size = 10
idx = 0
while(start < end):
    idx += 1
    
    batch = ca_data[start:start+batch_size] # Select a batch of images
    
    name = f"ca_batch_{idx+1}"  # Asset key name
    dataset_pointer.add(name=name, value=batch)  # Add asset to the dataset pointer
    start += batch_size
    print(f"Batches successfully uploaded: {idx}/{end//batch_size}")




This means you'll need to manually approve any requests which leverage this data. If this is ok with you, proceed. If you'd like to use automatic differential privacy budgeting, please pass in a DP-compatible tensor type such as by calling annotate_with_dp_metadata() on a sy.Tensor with a np.int32 or np.float32 inside.
Are you sure you want to proceed? (y/n)y
Loading dataset... uploading... 
SUCCESS!Batches successfully uploaded: 1/10



This means you'll need to manually approve any requests which leverage this data. If this is ok with you, proceed. If you'd like to use automatic differential privacy budgeting, please pass in a DP-compatible tensor type such as by calling annotate_with_dp_metadata() on a sy.Tensor with a np.int32 or np.float32 inside.
Are you sure you want to proceed? (y/n)y
Loading dataset... uploading... 
SUCCESS!Batches successfully uploaded: 2/10



This means you'll need to manually approve any requests which leverage this data. If this is ok with you, proceed

In [22]:
# Great !!! we uploaded the whole data into batches of 10.
# Let's check the dataset pointer
dataset_pointer

Dataset: Empty Dataset
Description: Test delete



Asset Key,Type,Shape
"[""ca_batch_2""]",DataFrame,"(10, 22)"
"[""ca_batch_3""]",DataFrame,"(10, 22)"
"[""ca_batch_4""]",DataFrame,"(10, 22)"
"[""ca_batch_5""]",DataFrame,"(10, 22)"
"[""ca_batch_6""]",DataFrame,"(10, 22)"
"[""ca_batch_7""]",DataFrame,"(10, 22)"
"[""ca_batch_8""]",DataFrame,"(10, 22)"
"[""ca_batch_9""]",DataFrame,"(10, 22)"
"[""ca_batch_10""]",DataFrame,"(10, 22)"
"[""ca_batch_11""]",DataFrame,"(10, 22)"


In [23]:
# Let's check the dataset in the domain
domain.datasets

Idx,Name,Description,Assets,Id
[0],Empty Dataset,Test delete,"[""ca_batch_2""] -> DataFrame [""ca_batch_3""] -> DataFrame [""ca_batch_4""] -> DataFrame [""ca_batch_5""] -> DataFrame [""ca_batch_6""] -> DataFrame [""ca_batch_7""] -> DataFrame [""ca_batch_8""] -> DataFrame [""ca_batch_9""] -> DataFrame [""ca_batch_10""] -> DataFrame [""ca_batch_11""] -> DataFrame",7fafab3b-4797-419b-9bae-a72888c96992


In [24]:
# Now, lastly, accessing the assets as an iterator
# We want to access the assets of this dataset as an iterator

dataset_ptr = domain.datasets[0] # Selected the MedNIST dataset

# We want an iterator and exclude certain assets.
data_iterator = dataset_ptr.iter(exclude=["ca_batch_2", "ca_batch_3", "ca_batch_4"])

In [25]:
cnt = 0
for d in data_iterator:
    cnt += 1
    print(d)
print(f"total assets in the iterator: {cnt}")

<DataFramePointer -> mednode:e4260c35d9ea46ba8bacb1fdac713d15>
<DataFramePointer -> mednode:a145658cc60e4bacbf0d5111f21dc770>
<DataFramePointer -> mednode:5f73960d949f498aa5cd441a4118b22a>
<DataFramePointer -> mednode:7ff1ecbc2a6547068df2b2e388fc2c6c>
<DataFramePointer -> mednode:2bbac7ae86b04538800f92277d5518cb>
<DataFramePointer -> mednode:ef06b73c8db145ef8e2b8bec24120d3c>
<DataFramePointer -> mednode:fe3ac199966d449aa2f532ce54113ece>
total assets in the iterator: 7


Since we excluded three assets, so we had pointers to the remaining seven assets.

In [26]:
# Finally, the delete dataset functionality is also fixed,
# So, let's delete the whole dataset
domain.datasets

Idx,Name,Description,Assets,Id
[0],Empty Dataset,Test delete,"[""ca_batch_2""] -> DataFrame [""ca_batch_3""] -> DataFrame [""ca_batch_4""] -> DataFrame [""ca_batch_5""] -> DataFrame [""ca_batch_6""] -> DataFrame [""ca_batch_7""] -> DataFrame [""ca_batch_8""] -> DataFrame [""ca_batch_9""] -> DataFrame [""ca_batch_10""] -> DataFrame [""ca_batch_11""] -> DataFrame",7fafab3b-4797-419b-9bae-a72888c96992


In [27]:
# Deleting the dataset
del domain.datasets[0]

You are about to delete the `Empty Dataset` ? 🚨 
All information related to this dataset will be permanantely deleted.
Please enter y/n to proceed: y
Dataset: `Empty Dataset` is successfully deleted.

In [28]:
# There are no datasets in the domain
domain.datasets