In [None]:
# syft absolute
import syft as sy

# Version specifier for the syft releases this notebook targets
SYFT_VERSION = ">=0.9.0,<0.9.1"
# NOTE(review): presumably validates the installed syft against the specifier above — confirm in syft docs
sy.requires(SYFT_VERSION)

In [None]:
# Start a local test node listening on port 8080
# NOTE(review): reset=True presumably clears any previous node state — confirm in syft docs
node = sy.orchestra.launch(
    name="test_domain",
    port=8080,
    dev_mode=False,
    reset=True,
)

# Log in with the default credentials (demo only; never keep these in a real deployment)
domain = sy.login(
    email="info@openmined.org",
    password="changethis",
    port=8080,
)

In [None]:
# stdlib
import os

if not os.path.exists("ages_dataset.csv"):
    !curl -O https://openminedblob.blob.core.windows.net/csvs/ages_dataset.csv

In [None]:
# third party
import pandas as pd

# syft absolute
import syft as sy

# Load the private dataset and discard every row containing a missing value
age_df = pd.read_csv("ages_dataset.csv").dropna(how="any")
age_df.head()

In [None]:
# stdlib
# TODO: also move to dataset repo
import os

if not os.path.exists("ages_mock_dataset.csv"):
    !curl -O https://openminedblob.blob.core.windows.net/csvs/ages_mock_dataset.csv

In [None]:
# Load the mock dataset and discard every row containing a missing value
age_mock_df = pd.read_csv("ages_mock_dataset.csv").dropna(how="any")
age_mock_df.head()

### Creating a sy.Asset

To create an asset (sy.Asset), the available parameters are:
- `name (type: string)`: name of the asset; it acts as a key among the assets in the same dataset, so it must be unique
- `data (type: Pandas DataFrame)`: contains the private data; if you are preparing the assets for the low-side domain (as in Level 0), you can pass the mock data here instead
- `mock (type: Pandas DataFrame)`: contains the fake data; it should have the same schema as the private data but must not contain any sensitive information
<div class="admonition info">
    <p class="admonition-title" style="font-weight:bold">Info</p>
    If you are uploading data to the high-side and there is no mock available or it is not considered necessary, you can pass <code>mock=sy.ActionObject.empty()</code> to signal this.
</div>    

In [None]:
# Defining one asset for the low-side and one for the high-side
main_contributor = sy.Contributor(
    name="Jeffrey Salazar",
    role="Dataset Creator",
    email="jsala@ailab.com",
)

# Low-side asset: the real dataframe is passed as data, the mock dataframe as mock
low_side_asset = sy.Asset(
    name="asset_name",
    data=age_df,
    mock=age_mock_df,
    contributors=[main_contributor],
)

# High-side asset: no mock is supplied, which is signalled with an empty ActionObject
high_side_asset = sy.Asset(
    name="Age Data 2023",
    data=age_df,
    mock=sy.ActionObject.empty(),
    contributors=[main_contributor],
)

In [None]:
# Example creating the dataset for a low-side domain and for a high-side domain.
# Both datasets share the same name, description and contributors; they differ
# only in the asset they wrap.

# Dataset wrapping the low-side asset
low_side_dataset = sy.Dataset(
    name="Dataset name",
    description="**Dataset description**",
    asset_list=[low_side_asset],
    contributors=[main_contributor],
)


# Dataset wrapping the high-side asset
high_side_dataset = sy.Dataset(
    name="Dataset name",
    description="**Dataset description**",
    asset_list=[high_side_asset],
    contributors=[main_contributor],
)

In [None]:
# Markdown description attached to the dataset; it is passed as `description`
# when the dataset is created and is what users see when they inspect it.
description_template = """### About the dataset
This extensive dataset provides a rich collection of demographic and life events records for individuals across multiple countries. It covers a wide range of indicators and attributes related to personal information, birth and death events, gender, occupation, and associated countries. The dataset offers valuable insights into population dynamics and various aspects of human life, enabling comprehensive analyses and cross-country comparisons. The dataset is the largest one on notable deceased people and includes individuals from a variety of social groups, including but not limited to 107k females, 90k researchers, and 124 non-binary individuals, spread across more than 300 contemporary or historical regions.

### Dataset usage policy
This dataset is subject to compliance with internal data use and mis-use policies at our organisation. The following rules apply:
- only aggregate statistics can be released from data computation
- data subjects should never be identifiable through the data computation outcomes
- a fixed privacy budget of eps=5 must be preserved by each researcher

### Data collection and pre-processing
The dataset is based on open data hosted by Wikimedia Foundation.

**Age**
Whenever possible, age was calculated based on the birth and death year mentioned in the description of the individual.

**Gender**
Gender was available in the original dataset for 50% of participants. For the remaining, it was added from predictions based on name, country and century in which they lived. (97.51% accuracy and 98.89% F1-score)

**Occupation**
The occupation was available in the original dataset for 66% of the individuals. For the remaining, it was added from predictions from a multiclass text classificator model. (93.4% accuracy for 84% of the dataset)

More details about the features can be found by reading the paper.

### Key features
1. **Id**: Unique identifier for each individual.
2. **Name**: Name of the person.
3. **Short description**: Brief description or summary of the individual.
4. **Gender**: Gender/s of the individual.
5. **Country**: Countries/Kingdoms of residence and/or origin.
6. **Occupation**: Occupation or profession of the individual.
7. **Birth year**: Year of birth for the individual.
8. **Death year**: Year of death for the individual.
9. **Manner of death**: Details about the circumstances or manner of death.
10. **Age of death**: Age at the time of death for the individual.
11. **Associated Countries**: Modern Day Countries associated with the individual.
12. **Associated Country Coordinates (Lat/Lon)**: Modern Day Latitude and longitude coordinates of the associated countries.
13. **Associated Country Life Expectancy**: Life expectancy of the associated countries.

### Use cases
- Analyze demographic trends and birth rates in different countries.
- Investigate factors affecting life expectancy and mortality rates.
- Study the relationship between gender and occupation across regions.
- Explore correlations between age of death and associated country attributes.
- Examine patterns of migration and associated countries' life expectancy.


### Getting started

```
!curl -O https://openminedblob.blob.core.windows.net/csvs/ages_dataset.csv

age_df = pd.read_csv("ages_dataset.csv")
```

### Execution environment
The data is hosted in a remote compute environment with the following specifications:
- X CPU cores
- 1 GPU of type Y
- Z RAM
- A additional available storage

### Citation
Annamoradnejad, Issa; Annamoradnejad, Rahimberdi (2022), “Age dataset: A structured general-purpose dataset on life, work, and death of 1.22 million distinguished people”, In Workshop Proceedings of the 16th International AAAI Conference on Web and Social Media (ICWSM), doi: 10.36190/2022.82
"""

In [None]:
# Full example: a dataset holding a single asset, described by the markdown template
low_side_dataset = sy.Dataset(
    name="Dataset name",
    description=description_template,
    asset_list=[
        sy.Asset(
            name="Age Data 2023",
            data=age_df,
            mock=age_mock_df,
        )
    ],
    contributors=[main_contributor],
)

In [None]:
# Upload the dataset through the logged-in domain client
domain.upload_dataset(low_side_dataset)

In [None]:
# Returns a list of all the datasets available on that domain (or an empty list if there are none)
domain.datasets

In [None]:
# access a particular dataset by its index, or by its unique key (name)
domain.datasets[0]  # or domain.datasets["Dataset name"]

In [None]:
# Access a particular asset of a dataset by its index, or by its unique key (name)
asset = domain.datasets[0].assets[0]  # or domain.datasets[0].assets["Age Data 2023"]

In [None]:
# Access the mock or the real data within an asset
# Take the first dataset available on the domain
dataset = domain.datasets[0]

# `.mock` gives the mock data of the first asset; `.data` is the counterpart for the real data
mock_data = dataset.assets[0].mock  # or dataset.assets[0].data, for the real data

In [None]:
# Access the markdown description of the first dataset on the domain
domain.datasets[0].description