# Step 4: Upload a Dataset

## Step 4a: Log into our Domain

In [1]:
import syft as sy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Log into the domain node as its administrator.
#
# The fallback values below are the stock PySyft admin credentials (the login
# output warns about exactly that). To avoid committing real credentials to the
# notebook, set ADMIN_EMAIL / ADMIN_PASSWORD / DOMAIN1_PORT in the environment;
# when they are unset the original defaults are used unchanged.
import os

ADMIN_EMAIL = os.environ.get("ADMIN_EMAIL", "info@openmined.org")
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "changethis")
DOMAIN1_PORT = int(os.environ.get("DOMAIN1_PORT", "8081"))  # port the domain is reached on

domain_client = sy.login(
    email=ADMIN_EMAIL,
    password=ADMIN_PASSWORD,
    port=DOMAIN1_PORT,
)


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to localhost... done! 	 Logging into canada... done!


## Step 4b: Creating a Dataset

In [3]:
# Fetch the MedNIST data with the project helper from the local `utils` module.
# The transfer is large (the progress output below reports 232M), so expect
# this cell to take a few seconds.
# NOTE(review): presumably the helper writes ./MedNIST.pkl, the path read by
# the next cell — confirm in utils.
from utils import download_mednist_dataset

download_mednist_dataset()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0

MedNIST is successfully downloaded.


100  232M  100  232M    0     0  47.6M      0  0:00:04  0:00:04 --:--:-- 47.7M


In [4]:
# Path of the downloaded MedNIST pickle file; passed to load_data_as_df below.
FILE_PATH = "./MedNIST.pkl"

In [5]:
# Replace these with your own values from the session details.
# Both are passed to load_data_as_df and are embedded in the uploaded
# dataset's name ("MedNIST Data <number>/<total>").
MY_PARTICIPANT_NUMBER = 1
TOTAL_PARTICIPANTS = 10

### Load the Dataset

In [6]:
# Import helper methods
from syft.core.adp.data_subject_list import DataSubjectList
from utils import (
    get_data_description,
    get_label_mapping,
    split_into_train_test_val_sets,
    load_data_as_df,
)

In [7]:
# Load the dataset as a dataframe.
# NOTE(review): presumably this selects participant MY_PARTICIPANT_NUMBER's
# share out of TOTAL_PARTICIPANTS — confirm in utils.load_data_as_df.
dataset_df = load_data_as_df(MY_PARTICIPANT_NUMBER, TOTAL_PARTICIPANTS, FILE_PATH)
# Split the dataset into `train`, `validation` and `test` sets.
# The result is iterated with `.items()` in the asset-building cell, i.e. it is
# used as a mapping from split name to that split's data.
data_dict = split_into_train_test_val_sets(dataset_df)
# Human-readable description of the dataset; reused as the `description`
# of the uploaded dataset.
dataset_description = get_data_description(dataset_df)
print(dataset_description)

Columns: Index(['patient_id', 'image', 'label'], dtype='object')
Total Images: 5895
Label Mapping {'AbdomenCT': 0, 'BreastMRI': 1, 'CXR': 2, 'ChestCT': 3, 'Hand': 4, 'HeadCT': 5}
The MedNIST dataset was gathered from several sets from TCIA, the RSNA Bone Age Challenge, and the NIH Chest X-ray dataset. The dataset is kindly made available by Dr. Bradley J. Erickson M.D., Ph.D. (Department of Radiology, Mayo Clinic) under the Creative Commons CC BY-SA 4.0 license.
Label Count: 6
Label Mapping: {"AbdomenCT": 0, "BreastMRI": 1, "CXR": 2, "ChestCT": 3, "Hand": 4, "HeadCT": 5}
Image Dimensions: (64, 64)
Total Images: 5895



In [8]:
import numpy as np

# Maps "<split>_images" / "<split>_labels" to the private tensors to upload.
assets = dict()

# The loop variables are deliberately NOT called `name` / `data`: loop variables
# leak into the notebook namespace, and `name` is the dataset name used by the
# upload cell. Re-running this cell must not overwrite it with a split key.
for split_name, split_df in data_dict.items():

    # Let's create the data subjects list.
    # Data subjects are the individuals whose privacy we're trying to protect;
    # they are taken from the `patient_id` column of each split.
    data_subjects = DataSubjectList.from_series(split_df["patient_id"])

    # Stack the per-row 2D images into one int64 array of shape
    # (num_images, height, width), then flatten every image into a single row:
    # (num_images, height * width). For 2D images this yields the same values
    # and shape as dstack + reshape + rollaxis, without the legacy
    # `np.rollaxis` call.
    images = np.stack(list(split_df["image"])).astype(np.int64)
    images = images.reshape(images.shape[0], -1)

    # Convert labels to a numpy int64 array.
    labels = split_df["label"].to_numpy().astype("int64")

    # Next we make the data private with min, max and data subjects.
    # min_val / max_val are the minimum and maximum values in the given data.
    # NOTE(review): 0-255 assumes 8-bit pixel intensities — confirm against the
    # data; 0-5 matches the six classes in the printed label mapping.

    # Convert images to private data.
    image_data = sy.Tensor(images).private(
        min_val=0, max_val=255, data_subjects=data_subjects
    )

    # Convert labels to private data.
    label_data = sy.Tensor(labels).private(
        min_val=0, max_val=5, data_subjects=data_subjects
    )

    assets[f"{split_name}_images"] = image_data
    assets[f"{split_name}_labels"] = label_data

## Step 4c: Upload the Dataset

In [9]:
# Name under which the dataset is uploaded to the domain.
# It encodes this participant's share of the data, e.g. "MedNIST Data 1/10".

name = f"MedNIST Data {MY_PARTICIPANT_NUMBER}/{TOTAL_PARTICIPANTS}"

In [10]:
# Upload the MedNIST assets to the domain under `name`, with the description
# generated when the data was loaded.
# NOTE(review): re-running this cell appears to upload another copy rather than
# replace the existing one — the dataset listing below contains two
# "MedNIST Data 1/10" entries. Run it once per dataset.
domain_client.load_dataset(
    assets=assets, name=name, description=dataset_description
)

Loading dataset... uploading...🚀                        



Dataset is uploaded successfully !!! 🎉

Run `<your client variable>.datasets` to see your new dataset loaded into your machine!


Now let's check whether the dataset was uploaded successfully.

In [11]:
# List every dataset currently on the domain; the new upload should appear here.
domain_client.datasets

Idx,Name,Description,Assets,Id
[0],Canada Trade Data - First few rows,A collection of reports from Canada's statistics bureau about how much it thinks it imports and exports from other countries.,"[""Canada Trade""] -> int64",2880b9bc-f795-4e0a-8554-b048629fd37f
[1],Canada Trade Data - First few rows,A collection of reports from Canada's statistics bureau about how much it thinks it imports and exports from other countries.,"[""Canada Trade""] -> int64",0ab77b74-6229-4016-9b2c-23001d15f3f2
[2],Canada Trade Data - First few rows,A collection of reports from Canada's statistics bureau about how much it thinks it imports and exports from other countries.,"[""Canada Trade""] -> int64",981eba78-c7b4-436a-96e4-2ca7f0d3a587
[3],Canada Trade Data - First few rows,A collection of reports from Canada's statistics bureau about how much it thinks it imports and exports from other countries.,"[""Canada Trade""] -> int64",0a84109b-966c-4f1d-a259-358cfb873d64
[4],MedNIST Data 1/10,"The MedNIST dataset was gathered from several sets from TCIA, the RSNA Bone Age Challenge, and the NIH Chest X-ray dataset. The dataset is kindly made available by Dr. Bradley J. Erickson M.D., Ph.D. (Department of Radiology, Mayo Clinic) under the Creative Commons CC BY-SA 4.0 license. Label Count: 6 Label Mapping: {""AbdomenCT"": 0, ""BreastMRI"": 1, ""CXR"": 2, ""ChestCT"": 3, ""Hand"": 4, ""HeadCT"": 5} Image Dimensions: (64, 64) Total Images: 5895","[""train_images""] -> [""train_labels""] -> [""val_images""] -> ...",760b0571-3b09-4fdd-bfca-3b091c5cdd19
[5],test,test dataset,"[""test""] -> int64",d2f09e24-ce96-40fe-b949-8d3aa6a56198
[6],MedNIST Data 1/10,"The MedNIST dataset was gathered from several sets from TCIA, the RSNA Bone Age Challenge, and the NIH Chest X-ray dataset. The dataset is kindly made available by Dr. Bradley J. Erickson M.D., Ph.D. (Department of Radiology, Mayo Clinic) under the Creative Commons CC BY-SA 4.0 license. Label Count: 6 Label Mapping: {""AbdomenCT"": 0, ""BreastMRI"": 1, ""CXR"": 2, ""ChestCT"": 3, ""Hand"": 4, ""HeadCT"": 5} Image Dimensions: (64, 64) Total Images: 5895","[""train_images""] -> int64 [""train_labels""] -> int64 [""val_images""] -> int64 ...",ddcef916-ddb8-48af-8cf2-22aa93710735


In [12]:
# Inspect the last dataset in the listing: its description and the key, type
# and shape of each asset.
domain_client.datasets[-1]

Dataset: MedNIST Data 1/10
Description: The MedNIST dataset was gathered from several sets from TCIA, the RSNA Bone Age Challenge, and the NIH Chest X-ray dataset. The dataset is kindly made available by Dr. Bradley J. Erickson M.D., Ph.D. (Department of Radiology, Mayo Clinic) under the Creative Commons CC BY-SA 4.0 license.
Label Count: 6
Label Mapping: {"AbdomenCT": 0, "BreastMRI": 1, "CXR": 2, "ChestCT": 3, "Hand": 4, "HeadCT": 5}
Image Dimensions: (64, 64)
Total Images: 5895




Asset Key,Type,Shape
"[""train_images""]",int64,"(4701, 4096)"
"[""train_labels""]",int64,"(4701,)"
"[""val_images""]",int64,"(556, 4096)"
"[""val_labels""]",int64,"(556,)"
"[""test_images""]",int64,"(638, 4096)"
"[""test_labels""]",int64,"(638,)"


## Step 4d: Create a Data Scientist Account

In [None]:
# Account fields for the data scientist who will work with the dataset.
# They are unpacked as keyword arguments into `domain_client.users.create`
# and echoed back in the login details printed at the end of the notebook.
data_scientist_details = dict(
    name="Samantha Carter",
    email="sam@sg1.net",
    password="stargate",
    budget=9999,
)

In [None]:
# Create the data scientist's account on the domain; the dict's keys
# (name, email, password, budget) are passed as keyword arguments.
# NOTE(review): `budget` is presumably the account's privacy budget — confirm
# against the PySyft user API.
domain_client.users.create(**data_scientist_details)

In [None]:
# Collect everything the data scientist needs to log in and find the dataset.
login_details = {
    # Use the port the admin client actually logged in on instead of repeating
    # the literal, so the shared details cannot drift from the login cell.
    "url": DOMAIN1_PORT,
    "name": data_scientist_details["name"],
    "email": data_scientist_details["email"],
    "password": data_scientist_details["password"],
    "dataset_name": name,  # name the dataset was uploaded under
}

print("Please give these details to the data scientist 👇🏽")
print()
print(login_details)