In [40]:
from syft.core.node.new.data_subject import DataSubject, DataSubjectRegistry
from syft.core.node.new.dataset import CreateAsset, CreateDataset
import syft as sy
from syft.core.node.worker import Worker
import numpy as np
import pandas as pd

#### Creating Data Subjects

In [2]:
# Top-level data subject named "Country"; the individual countries are attached
# to it as members a few cells further down.
country = DataSubject(name="Country", aliases=["country_code"])

In [3]:
# One DataSubject per country. Each alias follows the "country_code:<code>"
# pattern, extending the "country_code" alias of the parent subject.
canada = DataSubject(name="Canada", aliases=["country_code:ca"])
germany = DataSubject(name="Germany", aliases=["country_code:de"])
spain = DataSubject(name="Spain", aliases=["country_code:es"])
france = DataSubject(name="France", aliases=["country_code:fr"])
japan = DataSubject(name="Japan", aliases=["country_code:jp"])
uk = DataSubject(name="United Kingdom", aliases=["country_code:uk"])
usa = DataSubject(name="United States of America", aliases=["country_code:us"])

In [4]:
# Attach every individual country to the top-level "Country" subject,
# in the same order as they were created.
for member_subject in (canada, germany, spain, france, japan, uk, usa):
    country.add_member(member_subject)


In [5]:
# Display the summary of the "Country" subject: name, description, aliases and member count.
country

```python
DataSubject: Country
Description: None
Aliases: ['country_code']
Members: 7

```

In [6]:
# List the member subjects attached to "Country" (keyed by their name).
country.members

Unnamed: 0,key,type,id
0,Canada,<DataSubject: Canada>,b8bb7c9e07f845aabe7557c38c453a30
1,Germany,<DataSubject: Germany>,d41b09667c994557aa8f3bbfb97b82b5
2,Spain,<DataSubject: Spain>,d3c2d6463ad247c680be2410afa70350
3,France,<DataSubject: France>,bb806ada99214171aa733a07dba90af3
4,Japan,<DataSubject: Japan>,a929972df00840998e3371b715d48ce9


In [7]:
# Members can be looked up by name; this shows the "Canada" DataSubject.
country.members['Canada']

```python
DataSubject: Canada
Description: None
Aliases: ['country_code:ca']
Members: 0

```

#### Creating Asset

In [8]:
# Build the asset and give it a free-text description. The description is shown
# verbatim in the asset and dataset summaries further down, so it is kept typo-free.
# NOTE(review): the STATISTICS section (5 cols, 7073 rows) does not match the
# 2x2 toy frame that is attached to this asset later in the notebook.
gppwa = CreateAsset(name="grouped_political_parties_with_amplification")

asset_desc = """
BACKGROUND:

This dataset refers to the calculated amplification ratios in the paper grouped by
political party. This allows one to create a graph indicating the degree of amplification 
that each party received over the time of the study.

STATISTICS:

    - num cols: 5
    - num rows: 7073

ASSETS:

    - countries:
        REAL: this is the country code of the political party being referred to.
        MOCK: this draws from the same distribution of countries but is generated randomly

    - grouping_id:
        REAL: each country is divided into a small number of groups, specified by the grouping id
        MOCK: these groups are pulled from the same labels as the underlying dataset and is randomly
        sorted but in a way that matches the corresponding country code in "countries".

    - group_label:
        REAL: the political party of this row. 
        MOCK: uses the same political party labels as the real dataset, and puts them in the correct
        country but within a country's rows the parties are drawn at random

    - bootstrap_fold_id:
        REAL: the int id of a corresponding group that came out of the bootstrap algorithm. I honestly
        don't really know what this is but I encourage you to study the paper and figure it out.
        MOCK: randomly generated IDs between 1 and 1000. Since ids were randomly generated we did
        ensure that all IDs from the original dataset are represented at least once.

    - amplification:
        REAL: the relative measure of amplification. Please see the paper for details. They're floats.
        MOCK: randomly generated amplification floats between -100 and 1000.

DATA SUBJECTS / PRIVACY NOTES:

    Data subjects are tracked at country and party level as there may be some sensitivities when
    releasing the dataset based on these issues.
"""

gppwa.set_description(asset_desc)

In [9]:
# Associate the asset with the top-level "Country" data subject.
gppwa.add_data_subject(country)

In [10]:
# Credit the person who prepared and uploaded the asset.
# NOTE(review): the role is passed as a plain string; the original cell hinted at
# a `sy.roles.UPLOADER` constant -- switch to it if it exists in this syft version.
gppwa.add_contributor(
    role="UPLOADER",
    name="Andrew Trask",
    email="andrew@openmined.org",
    note="Andrew runs this domain and prepared the asset.",
)

In [11]:
# Tiny stand-in tables: the "real" frame and a mock frame with the same columns.
# Each name is bound once, directly to its DataFrame.
original_data = pd.DataFrame({"id": [1, 2], "age": [10, 12]})
mock_data = pd.DataFrame({"id": [5, 6], "age": [2, 7]})

In [12]:
# Convert both frames to NumPy arrays; these arrays are what gets attached to the asset below.
orig_data_as_numpy = original_data.to_numpy()
mock_data_as_numpy = mock_data.to_numpy()

In [13]:
# Attach the real data (the NumPy form of original_data) to the asset.
gppwa.set_obj(data=orig_data_as_numpy)

In [14]:
# Attach the mock counterpart of the data.
# NOTE(review): mock_is_real=False presumably declares that the mock is synthetic
# rather than a copy of the real data -- confirm against the CreateAsset API.
gppwa.set_mock(mock_data=mock_data_as_numpy, mock_is_real=False)

In [15]:
# Record the shape of the original DataFrame, (2, 2), on the asset.
gppwa.set_shape(shape=original_data.shape)

In [16]:
# Display the asset summary: name, pointer id, description, data-subject count, shape and contributors.
gppwa

```python
Asset: grouped_political_parties_with_amplification
Pointer Id: None
Description: 
BACKGROUND:

This dataset refers to the calculated amplification ratios in the paper grouped by
political party. This allows one to create a graph indicating the degree of ammplification 
that each party received over the time of the study.

STATISTICS:

    - num cols: 5
    - num rows: 7073

ASSETS:

    - countries:
        REAL: this is the country code of the political party being referred to.
        MOCK: this draws from the same distribution of countries but is generated randomly

    - grouping_id:
        REAL: each country is divided into a small number of groups, specified by the grouping id
        MOCK: these groups are pulled from the same labels as the underlying dataset and is randomly
        sorted but in a way that matches the corresponding country code in "countries".

    - group_label:
        REAL: the political party of this row. 
        MOCK: uses the same political party labels as the real dataset, and puts them in the correct
        country but within a country's rows the parties are drawn at random

    - bootstrap_fold_id:
        REAL: the int id of a corresponding group that came out of the bootstrap algorithm. I honestly
        don't really know what this is but I encourage you to study the paper and figure it out.
        MOCK: randomly generated IDs between 1 and 1000. Since ids were randomly generated we did
        ensure that all IDs from the original dataset are represented at least once.

    - amplification:
        REAL: the relative measure of amplification. Please see the paper for details. They're floats.
        MOCK: randomly generated amplification floats between -100 and 1000.

DATA SUBJECTS / PRIVACY NOTES:

    Data subjects are tracked at country and party level as there may be some sensitives when
    releasing the dataset based on these issues.

Total Data Subjects: 1
Shape: (2, 2)
Contributors:
	Andrew Trask: andrew@openmined.org

```

In [17]:
# Inspect the mock array stored on the asset.
gppwa.mock

array([[5, 2],
       [6, 7]])

#### Creating a Dataset

In [18]:
# Dataset-level metadata: description, citation, contributor and reference URL.
dataset = CreateDataset(name="My Dataset")
dataset.set_description("My Short Description")
dataset.add_citation("My Citations.......")
dataset.add_contributor(
    name="Shubham",
    # The address previously ended in "@openmined" (no TLD); the other
    # addresses in this notebook use the openmined.org domain.
    email="shubham@openmined.org",
    note="My Note... Do not touch :)",
    role="Developer",
)
dataset.add_url("https://www.openmined.org")

In [19]:
# Add the asset built above (gppwa) to the dataset.
dataset.add_asset(gppwa)

In [20]:
# Display the dataset summary: its assets with their descriptions, plus citation, URL and description.
dataset

```python
Syft Dataset: My Dataset
Assets:
	grouped_political_parties_with_amplification: 
BACKGROUND:

This dataset refers to the calculated amplification ratios in the paper grouped by
political party. This allows one to create a graph indicating the degree of ammplification 
that each party received over the time of the study.

STATISTICS:

    - num cols: 5
    - num rows: 7073

ASSETS:

    - countries:
        REAL: this is the country code of the political party being referred to.
        MOCK: this draws from the same distribution of countries but is generated randomly

    - grouping_id:
        REAL: each country is divided into a small number of groups, specified by the grouping id
        MOCK: these groups are pulled from the same labels as the underlying dataset and is randomly
        sorted but in a way that matches the corresponding country code in "countries".

    - group_label:
        REAL: the political party of this row. 
        MOCK: uses the same political party labels as the real dataset, and puts them in the correct
        country but within a country's rows the parties are drawn at random

    - bootstrap_fold_id:
        REAL: the int id of a corresponding group that came out of the bootstrap algorithm. I honestly
        don't really know what this is but I encourage you to study the paper and figure it out.
        MOCK: randomly generated IDs between 1 and 1000. Since ids were randomly generated we did
        ensure that all IDs from the original dataset are represented at least once.

    - amplification:
        REAL: the relative measure of amplification. Please see the paper for details. They're floats.
        MOCK: randomly generated amplification floats between -100 and 1000.

DATA SUBJECTS / PRIVACY NOTES:

    Data subjects are tracked at country and party level as there may be some sensitives when
    releasing the dataset based on these issues.

Citation: My Citations.......
URL: https://www.openmined.org
Description: My Short Description

```

#### Uploading Dataset

In [21]:
# Create a Worker node; the log line it prints names the node and lists its services.
worker = Worker()

Starting Worker: Zealous Wolf - cd41db9cf1d4429b90237c0f0ac99330 [<class 'syft.core.node.new.user_service.UserService'>, <class 'syft.core.node.new.action_service.ActionService'>, <class 'syft.core.node.new.test_service.TestService'>, <class 'syft.core.node.new.dataset_service.DatasetService'>, <class 'syft.core.node.new.user_code_service.UserCodeService'>]


In [22]:
# Log in to the worker created above and keep the resulting client.
# NOTE(review): "changethis" looks like a default/demo password for a fresh
# worker -- confirm it is never reused against a real deployment.
client = sy.new_login(node=worker, email="info@openmined.org", password="changethis")

Logged into Zealous Wolf as <info@openmined.org>


In [23]:
# Upload the dataset through the logged-in client; progress is reported per asset.
client.upload_dataset(dataset=dataset)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 268.32it/s]

Uploading: grouped_political_parties_with_amplification





In [27]:
# Fetch all datasets from the node via the client's dataset service.
datasets = client.api.services.dataset.get_all()

In [28]:
# Display the returned collection; a single Dataset entry is listed.
datasets

Unnamed: 0,type,id
0,syft.core.node.new.dataset.Dataset,ac81fa7651774529b550500d35e8b527


In [29]:
# Take the first (and only) dataset returned by the node.
# NOTE: this rebinds `dataset`, which until now held the local CreateDataset
# object, to the Dataset object fetched from the node.
dataset = datasets[0]

In [30]:
# Display the dataset as fetched from the node.
dataset

```python
Syft Dataset: My Dataset
Assets:
	grouped_political_parties_with_amplification: 
BACKGROUND:

This dataset refers to the calculated amplification ratios in the paper grouped by
political party. This allows one to create a graph indicating the degree of ammplification 
that each party received over the time of the study.

STATISTICS:

    - num cols: 5
    - num rows: 7073

ASSETS:

    - countries:
        REAL: this is the country code of the political party being referred to.
        MOCK: this draws from the same distribution of countries but is generated randomly

    - grouping_id:
        REAL: each country is divided into a small number of groups, specified by the grouping id
        MOCK: these groups are pulled from the same labels as the underlying dataset and is randomly
        sorted but in a way that matches the corresponding country code in "countries".

    - group_label:
        REAL: the political party of this row. 
        MOCK: uses the same political party labels as the real dataset, and puts them in the correct
        country but within a country's rows the parties are drawn at random

    - bootstrap_fold_id:
        REAL: the int id of a corresponding group that came out of the bootstrap algorithm. I honestly
        don't really know what this is but I encourage you to study the paper and figure it out.
        MOCK: randomly generated IDs between 1 and 1000. Since ids were randomly generated we did
        ensure that all IDs from the original dataset are represented at least once.

    - amplification:
        REAL: the relative measure of amplification. Please see the paper for details. They're floats.
        MOCK: randomly generated amplification floats between -100 and 1000.

DATA SUBJECTS / PRIVACY NOTES:

    Data subjects are tracked at country and party level as there may be some sensitives when
    releasing the dataset based on these issues.

Citation: My Citations.......
URL: https://www.openmined.org
Description: My Short Description

```

In [31]:
# List the assets of the fetched dataset, keyed by asset name.
dataset.assets

Unnamed: 0,key,type,id
0,grouped_political_parties_with_amplification,syft.core.node.new.dataset.Asset,7380f668f2c0425e8f6a62e10dc8fd27


In [32]:
# Look up the uploaded asset by its name.
data_asset = dataset.assets["grouped_political_parties_with_amplification"]

In [33]:
# Display the fetched asset; unlike the local asset shown earlier, its "Pointer Id" is now populated.
data_asset

```python
Asset: grouped_political_parties_with_amplification
Pointer Id: 3051e4065b284dfda60652c08aad4076
Description: 
BACKGROUND:

This dataset refers to the calculated amplification ratios in the paper grouped by
political party. This allows one to create a graph indicating the degree of ammplification 
that each party received over the time of the study.

STATISTICS:

    - num cols: 5
    - num rows: 7073

ASSETS:

    - countries:
        REAL: this is the country code of the political party being referred to.
        MOCK: this draws from the same distribution of countries but is generated randomly

    - grouping_id:
        REAL: each country is divided into a small number of groups, specified by the grouping id
        MOCK: these groups are pulled from the same labels as the underlying dataset and is randomly
        sorted but in a way that matches the corresponding country code in "countries".

    - group_label:
        REAL: the political party of this row. 
        MOCK: uses the same political party labels as the real dataset, and puts them in the correct
        country but within a country's rows the parties are drawn at random

    - bootstrap_fold_id:
        REAL: the int id of a corresponding group that came out of the bootstrap algorithm. I honestly
        don't really know what this is but I encourage you to study the paper and figure it out.
        MOCK: randomly generated IDs between 1 and 1000. Since ids were randomly generated we did
        ensure that all IDs from the original dataset are represented at least once.

    - amplification:
        REAL: the relative measure of amplification. Please see the paper for details. They're floats.
        MOCK: randomly generated amplification floats between -100 and 1000.

DATA SUBJECTS / PRIVACY NOTES:

    Data subjects are tracked at country and party level as there may be some sensitives when
    releasing the dataset based on these issues.

Total Data Subjects: 1
Shape: (2, 2)
Contributors:
	Andrew Trask: andrew@openmined.org

```

In [34]:
# The mock array is available on the fetched asset.
data_asset.mock

array([[5, 2],
       [6, 7]])

In [35]:
# Check whether the fetched asset exposes a `data` attribute (it does not: the result is False).
hasattr(data_asset, "data")

False

In [36]:
# The asset's action_id; the UID shown matches the "Pointer Id" in the asset summary.
data_asset.action_id

<UID: 3051e4065b284dfda60652c08aad4076>

In [37]:
# Obtain the pointer object from the fetched asset.
data_ptr = data_asset.pointer

In [38]:
# Display the pointer: it carries the object id and the uid of the node.
data_ptr

```python
class NumpyArrayObjectPointer:
  id: str = 3051e4065b284dfda60652c08aad4076
  node_uid: str = cd41db9cf1d4429b90237c0f0ac99330
  parent_id: str = None

```

In [39]:
# Resolve the pointer through the client; the array returned holds the values of original_data.
data_ptr.get_from(client)

array([[ 1, 10],
       [ 2, 12]])

#### Exploring Data Subject Registry

This is currently not integrated on the server, but it would behave in the following fashion: adding a data subject to the registry recursively adds all of its members as well.

In [None]:
# Create a DataSubjectRegistry to hold data subjects locally.
data_subject_registry = DataSubjectRegistry()

In [43]:
# Register "Country"; the registry afterwards reports 8 subjects (Country plus its 7 members).
data_subject_registry.add_data_subject(data_subject=country)

In [44]:
# Display the registry summary (the number of data subjects it holds).
data_subject_registry

```python
DataSubjects: 8

```

In [45]:
# List the data subjects held by the registry.
data_subject_registry.data_subjects

Unnamed: 0,type,id
0,<DataSubject: Country>,af209471e47042ab922a2db9c2fa2acf
1,<DataSubject: Canada>,b8bb7c9e07f845aabe7557c38c453a30
2,<DataSubject: Germany>,d41b09667c994557aa8f3bbfb97b82b5
3,<DataSubject: Spain>,d3c2d6463ad247c680be2410afa70350
4,<DataSubject: France>,bb806ada99214171aa733a07dba90af3
