#### Upload dataset to domain

- Currently, a user needs to load the whole dataset into memory and upload it to the domain all at once. This approach does not scale when the user has a large amount of data to upload.
- A solution to the problem is to allow a user to add/upload assets to an already uploaded dataset. This way, the user can add multiple assets to the same dataset in batches.

In [1]:
import syft as sy
import os
from PIL import Image
import numpy as np
import gc

In [55]:
# Log in to the domain node running on localhost; the returned client
# (`domain`) is what the dataset calls below are issued against.
# NOTE(review): credentials are hard-coded here — presumably demo defaults;
# confirm before sharing this notebook.
domain = sy.login(
    url="http://localhost",
    port=8081,
    email="info@openmined.org",
    password="changethis",
)

Connecting to http://localhost:8081... done! 	 Logging into mednode... done!


In [21]:
from tqdm import tqdm

#### Uploading the whole dataset

In [3]:
#### Load dataset to the domain
data_dir = 'data/MedNIST/'

# Every sub-directory of `data_dir` is treated as one class; sorting fixes the
# integer label assigned to each class name.
class_names = sorted(
    x for x in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, x))
)
num_classes = len(class_names)

# `os.listdir` returns entries in arbitrary order, so the file names are sorted
# to make the image order (and the index-based entity ids built from it later)
# reproducible between runs.
image_files = [
    [os.path.join(data_dir, class_name, x)
     for x in sorted(os.listdir(os.path.join(data_dir, class_name)))]
    for class_name in class_names
]

# Flatten the per-class lists while building a parallel list of integer labels.
image_file_list = []
image_label_list = []
for label, class_files in enumerate(image_files):
    image_file_list.extend(class_files)
    image_label_list.extend([label] * len(class_files))
num_total = len(image_label_list)

# Fail with a clear message instead of an IndexError on an empty directory.
if not image_file_list:
    raise FileNotFoundError(f"No images found under {data_dir!r}")

# Read the image size from the first file; the context manager closes the
# underlying file handle once the size has been read.
with Image.open(image_file_list[0]) as first_image:
    image_width, image_height = first_image.size

In [36]:
def load_as_numpy_array(image_file_list):
    """Load every image in `image_file_list` as an int32 NumPy array.

    Args:
        image_file_list: Sequence of image file paths that PIL can open.

    Returns:
        A list with one `np.ndarray` (dtype int32) per path, in input order.
    """
    img_list = []
    for image_path in tqdm(image_file_list):
        # Close each file as soon as its pixels have been copied out, rather
        # than leaving thousands of handles to the garbage collector.
        with Image.open(image_path) as img:
            img_list.append(np.asarray(img).astype(np.int32))
    return img_list

In [37]:
# Read every image into memory at once (a list of int32 arrays, one per file).
data = load_as_numpy_array(image_file_list)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58954/58954 [00:09<00:00, 6138.40it/s]


In [39]:
# Wrap the in-memory list of image arrays in a syft Tensor.
data_tensors = sy.Tensor(data)

In [54]:
# Attach DP metadata to the images: values bounded to [0, 255] and one entity
# per image, identified by the image's index rendered as a string.
private_image_tensors = data_tensors.annotate_with_dp_metadata(
    lower_bound=0,
    upper_bound=255,
    entities=list(map(str, range(data_tensors.shape[0]))),
)

In [57]:
# Labels are the class indices produced while listing the files, stored as int32.
label_tensors = sy.Tensor(np.array(image_label_list).astype(np.int32))

# Derive the bounds and entity ids from the labels themselves:
# - labels are indices into `class_names`, so the largest one is
#   `num_classes - 1` (no hard-coded class count);
# - one entity per label, so this cell does not depend on `data_tensors`,
#   which the batch-upload cell rebinds to a single batch.
private_label_tensors = label_tensors.annotate_with_dp_metadata(
    lower_bound=0,
    upper_bound=num_classes - 1,
    entities=[str(s) for s in range(len(image_label_list))],
)

In [None]:
# There are two ways in which a user can upload their datasets to the domain

In [None]:
# Extra information stored alongside the dataset
metadata = dict(
    class_names=class_names,  # class name for each integer label
)

In [None]:
# 1st way (the current one):
# build every asset in memory, then send the complete dataset in a single call
domain.load_dataset(
    name="MedNIST",
    description="MedNIST Description",
    assets={
        "imageData": private_image_tensors,
        "labels": private_label_tensors,
    },
    metadata=metadata,
)

In [8]:
# Inspect the first dataset listed on the domain.
domain.datasets[0]


Name: MedNIST
Description: MedNIST Description



Unnamed: 0,Asset Key,Type,Shape
0,"[""imageData""]",Tensor,"(40000, 256, 256, 3)"
1,"[""labels""]",Tensor,"(40000,)"


#### Uploading individual assets to a dataset

In [None]:
# 2nd way: create the dataset first, then attach assets to it one at a time

# Step I: create an empty dataset
dataset_pointer = domain.create_dataset(
    name="MedNIST",
    description="MedNIST Description",
    metadata=metadata,
    tags=["Medical", "X-Ray Images"],
)

# Step II: add an asset to the existing dataset
dataset_pointer.add(val=private_image_tensors, key="imageData")

```bash
Loading dataset ......  Checking assets......
Uploading ..... [100%]
Done.

```

In [6]:
# Display the dataset after the first `.add` call.
dataset_pointer


Name: MedNIST
Description: MedNIST Description



Unnamed: 0,Asset Key,Type,Shape
0,"[""imageData""]",Tensor,"(40000, 256, 256, 3)"


In [None]:
# Attach the labels as a second asset of the same dataset.
dataset_pointer.add(key="labels", val=private_label_tensors)

In [7]:
# Display the dataset again after adding the labels.
dataset_pointer


Name: MedNIST
Description: MedNIST Description



Unnamed: 0,Asset Key,Type,Shape
0,"[""imageData""]",Tensor,"(40000, 256, 256, 3)"
1,"[""labels""]",Tensor,"(40000,)"


In [None]:
# We can perform other CRUD operations on the assets

# update an asset (the key must already exist in the dataset)
dataset_pointer.update(key="imageData", val=private_image_tensors)

# delete an asset
dataset_pointer.delete(key="imageData")

# After a delete the key no longer exists, so `.update` on it would raise
# AssetDoesNotExists; `.add` is the call that puts the asset back.
dataset_pointer.add(key="imageData", val=private_image_tensors)

In [4]:
# Deleting a key that is not present in the dataset raises an error
# (`AssetDoesNotExists`, as shown in the output of this cell).
dataset_pointer.delete(key="assetKey")


    [91mAssetDoesNotExists:[0m
        The asset with key `assetKey` does not exists.



In [6]:
# Similarly, throw an error on update if the asset key doesn't exist.
# The payload keyword is `val`, matching every other `.add` / `.update` call.
dataset_pointer.update(key="assetKey", val=list(range(100)))


    [91mAssetDoesNotExists:[0m
        The asset with key `assetKey` does not exists.
        Please use `.add` to add a new asset to the dataset.



In [15]:
# Adding an asset whose `key` already exists raises an IntegrityError.
# `np.random` has no `randrange`; `randint` produces the small random integer
# array used as a throw-away payload here.

dataset_pointer.add(key="imageData", val=np.random.randint(0, 100, size=(2, 3)))


    [91mIntegrityError:[0m
        The asset with key `imageData` already exists.
        Please use a different key name.



In [None]:
#### Uploading asset in batches

total = len(image_file_list)
batch_size = 1000
num_batches = (total + batch_size - 1) // batch_size  # ceiling division

for idx, start in enumerate(range(0, total, batch_size), start=1):
    # Upper bound of this batch only; the last batch may be shorter.
    end = min(start + batch_size, total)

    # Load just this batch of images into memory and convert it to a Tensor
    batch = image_file_list[start:end]
    data_tensors = sy.Tensor(load_as_numpy_array(batch))

    # Add ADP metadata to the image tensors: one entity per image in the
    # batch, identified by the image's index in `image_file_list`.
    private_image_tensors = data_tensors.annotate_with_dp_metadata(
        lower_bound=0, upper_bound=255, entities=[str(s) for s in range(start, end)]
    )

    key = f"imageData_{idx}"  # Asset key, one per batch
    dataset_pointer.add(key=key, val=private_image_tensors)  # Add asset to the dataset pointer
    print(f"Batches successfully uploaded: {idx}/{num_batches}")

```
    Batches successfully uploaded: 2/60
```

In [13]:
#### A Data Scientist (or any user without upload permissions on the domain)
# tries to upload or modify assets: the domain should answer with an error.

ds_client = sy.login(
    url="http://localhost",
    port=8081,
    email="sheldon@caltech.edu",
    password="bazinga",
)

# Tries to create a dataset ...
dataset_ptr = ds_client.create_dataset(
    name="Hello",
    description="My Dataset",
    tags=["Tag1", "Tag2"],
)

# ... or to add an asset to the dataset ...
dataset_ptr.add(key="hello", val=[1, 2, 3, 4, 5])

# ... or to delete/update an asset
dataset_ptr.delete(key="hello")
dataset_ptr.update(key="hello", val=[1, 2, 3, 4, 5])


    [91mPermissionDenied:[0m
        Sorry, looks like you don't have permissions to perform the operation.



In [None]:
#### Accessing the assets as an iterator
# We want to access the assets of this dataset as an iterator

dataset_ptr = ds_client.datasets[0]  # Select the MedNIST dataset

# We want an iterator around the image assets only:
# this returns all the assets except the labels.
image_data_iterator = dataset_ptr.iter(exclude=["labels"])

# Running sum of the image assets; stays None until the first asset arrives.
mean_image = None
cnt = 0
for image_data in image_data_iterator:
    cnt += 1
    if mean_image is None:
        mean_image = image_data
    else:
        # Use `+` rather than `+=`: `mean_image` starts as an alias of the
        # first asset, and an in-place add could modify that asset.
        mean_image = mean_image + image_data

# Avoid dividing None by zero when the iterator yields nothing.
if mean_image is None:
    raise ValueError("The dataset has no image assets to average")

mean_image = mean_image / cnt

result = mean_image.request(reason="Mean Image of the whole dataset")
result.get()

#### Dummy Dataset

In [1]:
import pandas as pd
from enum import Enum
import uuid
import datetime


class bcolors(Enum):
    """ANSI escape sequences used to style the mocked terminal messages below.

    Use `.value` to obtain the raw escape string, and end a styled span
    with `ENDC`.
    """

    # Bright foreground colours (SGR codes 91-96)
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Reset all attributes
    ENDC = "\033[0m"
    # Text attributes
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

In [6]:
# Mock of the error message for an operation on an asset key that does not
# exist; the error name is wrapped in the FAIL colour and reset with ENDC.
asset_deleted = f"""
    {bcolors.FAIL.value}AssetDoesNotExists:{bcolors.ENDC.value}
        The asset with key `assetKey` does not exists.
"""
print(asset_deleted)


    [91mAssetDoesNotExists:[0m
        The asset with key `assetKey` does not exists.



In [7]:
# Mock of the error message for updating a missing asset key; same as the
# plain "does not exist" message plus a hint to use `.add` instead.
update_asset_not_exists = f"""
    {bcolors.FAIL.value}AssetDoesNotExists:{bcolors.ENDC.value}
        The asset with key `assetKey` does not exists.
        Please use `.add` to add a new asset to the dataset.
"""
print(update_asset_not_exists)


    [91mAssetDoesNotExists:[0m
        The asset with key `assetKey` does not exists.
        Please use `.add` to add a new asset to the dataset.



In [12]:
# Mock of the error message shown to a user who lacks permission for the
# requested operation.
authorization_error = f"""
    {bcolors.FAIL.value}PermissionDenied:{bcolors.ENDC.value}
        Sorry, looks like you don't have permissions to perform the operation.
"""
print(authorization_error)


    [91mPermissionDenied:[0m
        Sorry, looks like you don't have permissions to perform the operation.



In [14]:
# Mock of the error message for adding an asset under a key that is already
# used in the dataset.
key_already_exists = f"""
    {bcolors.FAIL.value}IntegrityError:{bcolors.ENDC.value}
        The asset with key `imageData` already exists.
        Please use a different key name.
"""

print(key_already_exists)


    [91mIntegrityError:[0m
        The asset with key `imageData` already exists.
        Please use a different key name.



In [3]:
# Rows of the mocked asset table for a dataset holding both assets:
# one record per asset key, every asset being a Tensor.
dataset_detail = [
    {"Asset Key": asset_key, "Type": "Tensor", "Shape": asset_shape}
    for asset_key, asset_shape in (
        ('["imageData"]', "(40000, 256, 256, 3)"),
        ('["labels"]', "(40000,)"),
    )
]
print("""
Name: MedNIST
Description: MedNIST Description
""")
# The frame is the cell's last expression so it is rendered as a table.
dataset_detail_df = pd.DataFrame(data=dataset_detail)
dataset_detail_df


Name: MedNIST
Description: MedNIST Description



Unnamed: 0,Asset Key,Type,Shape
0,"[""imageData""]",Tensor,"(40000, 256, 256, 3)"
1,"[""labels""]",Tensor,"(40000,)"


In [4]:
# Mock of a freshly created dataset: header only, followed by an empty
# asset table.
print("""
Name: MedNIST
Description: MedNIST Description
""")
empty_df = pd.DataFrame([])
empty_df


Name: MedNIST
Description: MedNIST Description



In [5]:
# Mocked asset table for the dataset after only the image asset was added.
one_added_detail = [
    dict(
        zip(
            ("Asset Key", "Type", "Shape"),
            ('["imageData"]', "Tensor", "(40000, 256, 256, 3)"),
        )
    ),
]
print("""
Name: MedNIST
Description: MedNIST Description
""")
# The frame is the cell's last expression so it is rendered as a table.
one_added_detail_df = pd.DataFrame(data=one_added_detail)
one_added_detail_df


Name: MedNIST
Description: MedNIST Description



Unnamed: 0,Asset Key,Type,Shape
0,"[""imageData""]",Tensor,"(40000, 256, 256, 3)"
