In [None]:
# TODO: also move to dataset repo
# stdlib
import os

if not os.path.exists("ages_dataset.csv"):
    !curl -O https://openminedblob.blob.core.windows.net/csvs/ages_dataset.csv

if not os.path.exists("ages_mock_dataset.csv"):
    !curl -O https://openminedblob.blob.core.windows.net/csvs/ages_mock_dataset.csv

In [None]:
# syft absolute
import syft as sy


def helper_function():
    """Set up the local test domain used by the rest of the notebook.

    Steps, in order: launch a node named "test_domain" on port 8085, log in
    with the default credentials, build and upload the "Age Dataset" (real and
    mock CSV data, rows containing any missing value dropped), register the
    account "Jane Doe", and finally enable guest signup.

    Expects ``ages_dataset.csv`` and ``ages_mock_dataset.csv`` in the current
    working directory. Returns nothing.
    """
    # third party
    import pandas as pd

    # Markdown description attached to the uploaded dataset.
    # NOTE(review): the text is kept verbatim, including hyphenation artifacts
    # such as "individ- uals" / "histor- ical" — clean up at the source if wanted.
    dataset_description = """### About the dataset
This extensive dataset provides a rich collection of demographic and life events records for individuals across multiple countries. It covers a wide range of indicators and attributes related to personal information, birth and death events, gender, occupation, and associated countries. The dataset offers valuable insights into population dynamics and various aspects of human life, enabling comprehensive analyses and cross-country comparisons. The dataset is the largest one on notable deceased people and includes individ- uals from a variety of social groups, including but not limited to 107k females, 90k researchers, and 124 non-binary indi- viduals, spread across more than 300 contemporary or histor- ical regions.

### Dataset usage policy
This dataset is subject to compliance with internal data use and mis-use policies at our organisation. The following rules apply:
- only aggregate statistics can be released from data computation
- data subjects should never be identifiable through the data computation outcomes
- a fixed privacy budget of eps=5 must be preserved by each researcher

### Data collection and pre-processing
The dataset is based on open data hosted by Wikimedia Foundation.

**Age**
Whenever possible, age was calculated based on the birth and death year mentioned in the description of the individual.

**Gender**
Gender was available in the original dataset for 50% of participants. For the remaining, it was added from predictions based on name, country and century in which they lived. (97.51% accuracy and 98.89% F1-score)

**Occupation**
The occupation was available in the original dataset for 66% of the individuals. For the remaining, it was added from predictions from a multiclass text classificator model. (93.4% accuracy for 84% of the dataset)

More details about the features can be found by reading the paper.

### Key features
1. **Id**: Unique identifier for each individual.
2. **Name**: Name of the person.
3. **Short description**: Brief description or summary of the individual.
4. **Gender**: Gender/s of the individual.
5. **Country**: Countries/Kingdoms of residence and/or origin.
6. **Occupation**: Occupation or profession of the individual.
7. **Birth year**: Year of birth for the individual.
8. **Death year**: Year of death for the individual.
9. **Manner of death**: Details about the circumstances or manner of death.
10. **Age of death**: Age at the time of death for the individual.
11. **Associated Countries**: Modern Day Countries associated with the individual.
12. **Associated Country Coordinates (Lat/Lon)**: Modern Day Latitude and longitude coordinates of the associated countries.
13. **Associated Country Life Expectancy**: Life expectancy of the associated countries.

### Use cases
- Analyze demographic trends and birth rates in different countries.
- Investigate factors affecting life expectancy and mortality rates.
- Study the relationship between gender and occupation across regions.
- Explore correlations between age of death and associated country attributes.
- Examine patterns of migration and associated countries' life expectancy.


### Getting started

```
!curl -O https://openminedblob.blob.core.windows.net/csvs/ages_dataset.csv
!curl -O https://openminedblob.blob.core.windows.net/csvs/ages_mock_dataset.csv

age_df = pd.read_csv("ages_dataset.csv")
```

### Execution environment
The data is hosted in a remote compute environment with the following specifications:
- X CPU cores
- 1 GPU of type Y
- Z RAM
- A additional available storage

### Citation
Annamoradnejad, Issa; Annamoradnejad, Rahimberdi (2022), “Age dataset: A structured general-purpose dataset on life, work, and death of 1.22 million distinguished people”, In Workshop Proceedings of the 16th International AAAI Conference on Web and Social Media (ICWSM), doi: 10.36190/2022.82
"""

    # start the test node
    test_node = sy.orchestra.launch(
        name="test_domain", port=8085, dev_mode=False, reset=True
    )

    # default credentials, acceptable only because this is an example
    admin_client = test_node.login(
        email="info@openmined.org", password="changethis", port=8085
    )

    # real and mock data: keep only the rows without any missing value
    real_ages = pd.read_csv("ages_dataset.csv").dropna(how="any")
    mock_ages = pd.read_csv("ages_mock_dataset.csv").dropna(how="any")

    age_asset = sy.Asset(name="Age Data 2023", data=real_ages, mock=mock_ages)
    age_dataset = sy.Dataset(
        name="Age Dataset",
        description=dataset_description,
        asset_list=[age_asset],
    )

    # the same uploader is credited on the dataset and on its single asset
    uploader = {"name": "Markus", "role": "Uploader", "email": "markus@gmail.com"}
    age_dataset.add_contributor(**uploader)
    age_dataset.assets[0].add_contributor(**uploader)

    # publish the dataset on the domain
    admin_client.upload_dataset(age_dataset)

    # create an account for a new user
    jane_password = "abc123"
    admin_client.register(
        name="Jane Doe",
        email="jane@caltech.edu",
        password=jane_password,
        password_verify=jane_password,
        institution="California Institute of Technology",
    )

    # from here on, guests are allowed to sign up by themselves
    admin_client.settings.allow_guest_signup(enable=True)


def helper_function_approval():
    """Log in with the default credentials and approve the first request.

    Connects on port 8085 and calls ``approve()`` on the entry at index 0 of
    the client's ``requests``. Returns nothing.
    """
    admin_client = sy.login(
        email="info@openmined.org", password="changethis", port=8085
    )
    first_request = admin_client.requests[0]
    first_request.approve()


# Run the setup: launch the test domain, upload the dataset and register the example user
helper_function()

In [None]:
# syft absolute

# Connect to the domain as a GUEST
domain = sy.login_as_guest(url="localhost", port=8085)

# Create your own account through the guest connection
holmes_email = "holmes@bakerstreet.com"
holmes_password = "SKY5cC2zQPRP"
domain.register(
    name="Holmes",
    email=holmes_email,
    password=holmes_password,
    password_verify=holmes_password,
)

# Log into the domain with the credentials that were just registered
client = sy.login(
    url="localhost", port=8085, email=holmes_email, password=holmes_password
)

client

In [None]:
# Access a list of all available datasets in the domain.
# As the last expression of the cell, the result is rendered as the cell output.
client.datasets

In [None]:
# Pick the first dataset (index 0) from the list and keep it in `dataset`,
# which the following cells use

dataset = client.datasets[0]
dataset

In [None]:
# Access specific properties of a dataset, here its `description`
dataset.description

In [None]:
# Access the list of assets attached to the dataset. This dataset was uploaded
# with a single asset ("Age Data 2023")
dataset.assets

In [None]:
# Keep a handle on the first (and only) asset of the dataset
asset = dataset.assets[0]

In [None]:
# Access data (mock data) from an asset (Approach 1):
# read the `mock` attribute of the dataset's first asset

mock_df = dataset.assets[0].mock
mock_df

In [None]:
# Print the shape of the mock data, then preview it with `head()`
print(mock_df.shape)
mock_df.head()

In [None]:
# Summarise the mock data with `describe()`
mock_df.describe()

In [None]:
# Try the same access path for the real (non-mock) data of the first asset.
# NOTE(review): what `.data` yields for this account is not visible here — confirm
real_df = dataset.assets[0].data
real_df

In [None]:
# Show which type the `.data` access above actually returned
print(type(real_df))