# Dataset Examples

This Jupyter Notebook demonstrates various use cases for the Dataset class, including:

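1. Initializing an Empty Dataset and Constructing Samples
2. Performing Operations on the Dataset (adding Samples and information, accessing data)
3. Various Operations on the Dataset (merging Datasets, adding tabular scalars, setting information)
4. Saving and Loading Datasets from directories or files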

This notebook provides detailed examples of using the Dataset class to manage data, Samples, and information within a PLAID Dataset. It is intended for documentation purposes and familiarization with the PLAID library.

**Each section is documented and explained.**

In [2]:
# Import required libraries
import json
import os
import tempfile

import numpy as np

In [3]:
# Import necessary libraries and functions
import Muscat.Containers.ElementsDescription as ElementsDescription
from Muscat.Bridges.CGNSBridge import MeshToCGNS
from Muscat.Containers import MeshCreationTools as MCT

from plaid.containers.dataset import Dataset
from plaid.containers.sample import Sample

In [4]:
# Print dict util
def dprint(name: str, dictio: dict, end: str = "\n") -> None:
    """Pretty-print a dictionary, one ``key : value`` pair per line.

    The output starts with ``name`` followed by an opening brace, then one
    indented line per item (in the dictionary's iteration order), then a
    closing brace.

    Args:
        name: Label printed before the opening brace.
        dictio: Dictionary whose items are printed.
        end: Passed as ``end`` to the final ``print`` that emits the closing brace.
    """
    print(name, '{')
    for key, value in dictio.items():
        # Indented with spaces only, consistent with the rest of the function
        print("    ", key, ':', value)

    print('}', end=end)

## Section 1: Initializing an Empty Dataset and Samples construction

This section demonstrates how to initialize an empty Dataset and handle Samples.

### Initialize an empty Dataset

In [5]:
# Create a Dataset without arguments and print its repr
print("#---# Empty Dataset")
dataset = Dataset()
print(f"{dataset=}")

#---# Empty Dataset
dataset=Dataset(0 samples, 0 scalars, 0 fields)


### Create Sample

In [6]:
# Mesh geometry: five 2D points and three triangles indexing into them
points = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.5, 1.5]])
triangles = np.array([[0, 1, 2], [0, 2, 3], [2, 4, 3]])

# Build the triangle mesh
Mesh = MCT.CreateMeshOfTriangles(points, triangles)

# Append two bar elements to the mesh
# NOTE(review): the second argument [1, 2] is presumably the ids given to the new bars - confirm
bars = np.array([[0, 1], [0, 2]])
elbars = Mesh.GetElementsOfType(ElementsDescription.Bar_2)
elbars.AddNewElements(bars, [1, 2])

# Convert the mesh to a CGNS tree, reused by the Samples created below
cgns_mesh = MeshToCGNS(Mesh)

# Start from a Sample created without arguments and show its repr
print("#---# Empty Sample")
sample_01 = Sample()
print(f"{sample_01 = }")

#---# Empty Sample
sample_01 = Sample(0 scalars, 0 timestamps, 0 fields, no tree)


In [7]:
# Add a CGNS tree structure to the Sample
# (cgns_mesh is the tree converted from the mesh above; the repr is printed again to show the change)
sample_01.add_tree(cgns_mesh)
print(f"{sample_01 = }")

sample_01 = Sample(0 scalars, 1 timestamp, 1 field)


In [8]:
# Add a scalar to the Sample
# (named 'rotation'; its value is a single draw from np.random.randn())
sample_01.add_scalar('rotation', np.random.randn())
print(f"{sample_01 = }")

sample_01 = Sample(1 scalar, 1 timestamp, 1 field)


### Print Sample general data

In [9]:
# Initialize another empty Sample (no tree is added to this one in this notebook)
print("#---# Empty Sample")
sample_02 = Sample()
print(f"{sample_02 = }")

#---# Empty Sample
sample_02 = Sample(0 scalars, 0 timestamps, 0 fields, no tree)


In [10]:
# Add a scalar to the second Sample
# (same name 'rotation' as in sample_01, but an independent random draw)
sample_02.add_scalar('rotation', np.random.randn())
print(f"{sample_02 = }")

sample_02 = Sample(1 scalar, 0 timestamps, 0 fields, no tree)


### Display Sample CGNS tree

In [11]:
# Initialize a third Sample and populate it right away:
# a new random 'speed' scalar, the 'rotation' value copied from sample_01, and the CGNS mesh tree.
# Note: the banner below still reads "Empty Sample" although the Sample is filled in this cell.
print("#---# Empty Sample")
sample_03 = Sample()
sample_03.add_scalar('speed', np.random.randn())
sample_03.add_scalar('rotation', sample_01.get_scalar('rotation'))
sample_03.add_tree(cgns_mesh)

# Show Sample CGNS content
sample_03.show_tree()

#---# Empty Sample
- "CGNSTree"(CGNSTree_t), 2 children, data(<class 'NoneType'>): None
    - "CGNSLibraryVersion"(CGNSLibraryVersion_t), 0 children, data(<class 'numpy.ndarray'>): [3.4]
    - "Base_2_2"(CGNSBase_t), 3 children, data(<class 'numpy.ndarray'>): [2 2]
        - "Bulk"(FamilyName_t), 0 children, data(<class 'numpy.ndarray'>): b'Bulk'
        - "Zone"(Zone_t), 8 children, data(<class 'numpy.ndarray'>): [[5 5 1]]
            - "ZoneType"(ZoneType_t), 0 children, data(<class 'numpy.ndarray'>): b'Unstructured'
            - "FamilyName"(FamilyName_t), 0 children, data(<class 'numpy.ndarray'>): b'Bulk'
            - "GridCoordinates"(GridCoordinates_t), 2 children, data(<class 'NoneType'>): None
                - "CoordinateX"(DataArray_t), 0 children, data(<class 'numpy.ndarray'>): [0.  1.  1.  0.  0.5]
                - "CoordinateY"(DataArray_t), 0 children, data(<class 'numpy.ndarray'>): [0.  0.  1.  1.  1.5]
            - "Elements_BAR_2"(Elements_t), 2 children, data(<cla

In [12]:
# Add a 'temperature' field to the third Sample (5 random values, one per mesh point)
# "Zone" and "Base_2_2" match node names shown in the CGNS tree above
# NOTE(review): presumably the zone name followed by the base name - confirm against add_field's signature
sample_03.add_field('temperature', np.random.rand(5), "Zone", "Base_2_2")
sample_03.show_tree()

- "CGNSTree"(CGNSTree_t), 2 children, data(<class 'NoneType'>): None
    - "CGNSLibraryVersion"(CGNSLibraryVersion_t), 0 children, data(<class 'numpy.ndarray'>): [3.4]
    - "Base_2_2"(CGNSBase_t), 3 children, data(<class 'numpy.ndarray'>): [2 2]
        - "Bulk"(FamilyName_t), 0 children, data(<class 'numpy.ndarray'>): b'Bulk'
        - "Zone"(Zone_t), 8 children, data(<class 'numpy.ndarray'>): [[5 5 1]]
            - "ZoneType"(ZoneType_t), 0 children, data(<class 'numpy.ndarray'>): b'Unstructured'
            - "FamilyName"(FamilyName_t), 0 children, data(<class 'numpy.ndarray'>): b'Bulk'
            - "GridCoordinates"(GridCoordinates_t), 2 children, data(<class 'NoneType'>): None
                - "CoordinateX"(DataArray_t), 0 children, data(<class 'numpy.ndarray'>): [0.  1.  1.  0.  0.5]
                - "CoordinateY"(DataArray_t), 0 children, data(<class 'numpy.ndarray'>): [0.  0.  1.  1.  1.5]
            - "Elements_BAR_2"(Elements_t), 2 children, data(<class 'numpy.ndarray'>

### Get Sample data

In [13]:
# Print sample general data
print(f"{sample_03 = }", end="\n\n")

# Print sample scalar data: the list of scalar names, then each scalar value by name
print(f"{sample_03.get_scalar_names() = }")
print(f"{sample_03.get_scalar('speed') = }")
print(f"{sample_03.get_scalar('rotation') = }", end="\n\n")

# Print sample field data: the list of field names, then the 'temperature' field values
print(f"{sample_03.get_field_names() = }")
print(f"{sample_03.get_field('temperature') = }")

sample_03 = Sample(2 scalars, 1 timestamp, 2 fields)

sample_03.get_scalar_names() = ['rotation', 'speed']
sample_03.get_scalar('speed') = 0.7505030544770248
sample_03.get_scalar('rotation') = -0.9898042104558067

sample_03.get_field_names() = ['OriginalIds', 'temperature']
sample_03.get_field('temperature') = array([0.11265573, 0.81479523, 0.34774428, 0.05150114, 0.65366643])


## Section 2: Performing Operations on the Dataset

This section demonstrates how to add Samples to the Dataset, add information, and access data.

### Add Samples in the Dataset

In [14]:
# Add Samples by id in the Dataset
# (set_sample is called once with keyword arguments and once positionally: id first, then the Sample)
dataset.set_sample(id=0, sample=sample_01)
dataset.set_sample(1, sample_02)

# Add unique Sample and automatically create its id
# (add_sample returns the id it assigned, which is printed below)
added_sample_id = dataset.add_sample(sample_03)
print(f"{added_sample_id = }")

added_sample_id = 2


### Add information to the Dataset and display it

In [15]:
# Add node information to the Dataset: the value "Safran" under "legal" -> "owner"
dataset.add_info("legal", "owner", "Safran")

# Retrieve dataset information and print it as indented JSON
# (`json` is imported in the notebook's import cell)
dataset_info = dataset.get_infos()
print("dataset info =", json.dumps(dataset_info, sort_keys=False, indent=4), end="\n\n")

# Overwrite information (logger will display warnings)
infos = {"legal": {"owner": "Safran", "license": "CC0"}}
dataset.set_infos(infos)

# Retrieve dataset information again to show the overwritten content
dataset_info = dataset.get_infos()
print("dataset info =", json.dumps(dataset_info, sort_keys=False, indent=4), end="\n\n")

# Add tree information to the Dataset (logger will display warnings)
dataset.add_infos("data_description", {"number_of_samples" : 0, "number_of_splits": 0})

# Pretty print dataset information
dataset.print_infos()



dataset info = {
    "legal": {
        "owner": "Safran"
    }
}

dataset info = {
    "legal": {
        "owner": "Safran",
        "license": "CC0"
    }
}

*********************** [34;1mdataset infos[0m **********************
[33;1mlegal[0m
  [32;1mowner[0m:Safran
  [32;1mlicense[0m:CC0
[33;1mdata_description[0m
  [32;1mnumber_of_samples[0m:0
  [32;1mnumber_of_splits[0m:0
************************************************************



### Get a list of specific Samples in a Dataset

In [16]:
# Retrieve the Samples stored under ids 0 and 1; the result is printed with dprint as id : Sample pairs
get_samples_from_ids = dataset.get_samples(ids=[0, 1])
dprint("get samples from ids =", get_samples_from_ids)

get samples from ids = {
     0 : Sample(1 scalar, 1 timestamp, 2 fields)
     1 : Sample(1 scalar, 0 timestamps, 0 fields, no tree)
}


### Get the list of Sample ids in a Dataset

In [17]:
# Print the ids of all Samples currently stored in the Dataset
print("get_sample_ids =", dataset.get_sample_ids())

get_sample_ids = [0, 1, 2]


### Print Dataset general data

In [18]:
# Print the Dataset repr, then its length as reported by len()
print(f"{dataset = }")
print("length of dataset =", len(dataset))

dataset = Dataset(3 samples, 2 scalars, 2 fields)
length of dataset = 3


### Add a list of Samples to a Dataset

In [19]:
# Create a new Dataset and add multiple samples
# Note: `dataset` is rebound to a fresh Dataset here, so the previous object is no longer referenced by this name
dataset = Dataset()
samples = [sample_01, sample_02, sample_03]
# add_samples returns the ids assigned to the added Samples
added_ids = dataset.add_samples(samples)
print(f"{added_ids = }")
print(f"{dataset = }")

added_ids = array([0, 1, 2])
dataset = Dataset(3 samples, 2 scalars, 2 fields)


### Access Sample data through the Dataset

In [20]:
# Access Sample data with indexes through the Dataset
# Both the call syntax dataset(i) and the subscript syntax dataset[i] are demonstrated
print(f"{dataset(0) = }") # call strategy
print(f"{dataset[1] = }") # getitem strategy
print(f"{dataset[2] = }", end="\n\n")

# List the scalar names carried by each Sample
print("scalar of the first sample = ", dataset[0].get_scalar_names())
print("scalar of the second sample = ", dataset[1].get_scalar_names())
print("scalar of the third sample = ", dataset[2].get_scalar_names())

dataset(0) = Sample(1 scalar, 1 timestamp, 2 fields)
dataset[1] = Sample(1 scalar, 0 timestamps, 0 fields, no tree)
dataset[2] = Sample(2 scalars, 1 timestamp, 2 fields)

scalar of the first sample =  ['rotation']
scalar of the second sample =  ['rotation']
scalar of the third sample =  ['rotation', 'speed']


In [21]:
# Access the 'rotation' scalar value of each Sample through the Dataset
print(f"{dataset[0].get_scalar('rotation') = }")
print(f"{dataset[1].get_scalar('rotation') = }")
print(f"{dataset[2].get_scalar('rotation') = }")

dataset[0].get_scalar('rotation') = -0.9898042104558067
dataset[1].get_scalar('rotation') = -0.20006775330325757
dataset[2].get_scalar('rotation') = -0.9898042104558067


### Get Dataset scalars in tabular format

In [22]:
# Print scalars in tabular format
print(f"{dataset.get_scalar_names() = }", end="\n\n")

# Request a single scalar by passing a one-element list of names
dprint("get rotation scalar = ", dataset.get_scalars_to_tabular(['rotation']))
dprint("get speed scalar = ", dataset.get_scalars_to_tabular(['speed']), end="\n\n")

# Get specific scalars in tabular format, then all scalars (no argument)
dprint("get specific scalars =", dataset.get_scalars_to_tabular(['speed', 'rotation']))
dprint("get all scalars =", dataset.get_scalars_to_tabular())

dataset.get_scalar_names() = ['rotation', 'speed']

dataset.get_scalars_to_tabular(['rotation']) = {'rotation': array([-0.98980421, -0.20006775, -0.98980421])}
dataset.get_scalars_to_tabular(['speed']) = {'speed': array([       nan,        nan, 0.75050305])}

get specific scalars = {
     speed : [       nan        nan 0.75050305]
     rotation : [-0.98980421 -0.20006775 -0.98980421]
}
get all scalars = {
     rotation : [-0.98980421 -0.20006775 -0.98980421]
     speed : [       nan        nan 0.75050305]
}


In [None]:
# Get all scalars with as_nparray=True (no scalar names are passed, so nothing is filtered)
# NOTE(review): this cell has no execution count or output in the saved notebook - re-run to confirm the result
print("get all scalar arrays = ", dataset.get_scalars_to_tabular(as_nparray=True))

### Get Dataset fields

In [23]:
# Print the names of the fields present in the Dataset
print("fields in the dataset = ", dataset.get_field_names())

fields in the dataset =  ['OriginalIds', 'temperature']


## Section 3: Various operations on the Dataset

This section demonstrates operations like merging datasets, adding tabular scalars, and setting information.

### Initialize a Dataset with a list of Samples

In [24]:
# Create another Dataset, to be filled with freshly generated Samples
other_dataset = Dataset()
nb_samples = 3

# Each Sample gets two random scalars: np.random.rand() shifted by +1.0 for
# 'rotation' and by -1.0 for 'random_name', drawn in that order
samples = []
for _ in range(nb_samples):
    sample = Sample()
    for scalar_name, shift in (('rotation', 1.0), ('random_name', -1.0)):
        sample.add_scalar(scalar_name, np.random.rand() + shift)
    samples.append(sample)

# Add the whole list of Samples in a single call
other_dataset.add_samples(samples)
print(f"{other_dataset = }")

other_dataset = Dataset(3 samples, 2 scalars, 0 fields)


### Merge two Datasets

In [25]:
# Merge the other dataset with the main dataset
# (merge_dataset is called on `dataset`; its repr is printed before and after to show the effect)
print(f"before merge: {dataset = }")
dataset.merge_dataset(other_dataset)
print(f"after merge: {dataset = }", end="\n\n")

# Show all scalars of the merged Dataset in tabular form
dprint("dataset scalars = ", dataset.get_scalars_to_tabular())

before merge: dataset = Dataset(3 samples, 2 scalars, 2 fields)
after merge: dataset = Dataset(6 samples, 3 scalars, 2 fields)

dataset scalars =  {
     random_name : [        nan         nan         nan -0.74587743 -0.81683182 -0.38500395]
     rotation : [-0.98980421 -0.20006775 -0.98980421  1.76098243  1.27352035  1.72448285]
     speed : [       nan        nan 0.75050305        nan        nan        nan]
}


### Add tabular scalars to a Dataset

In [26]:
# Adding tabular scalars to the dataset
# new_scalars has shape (3, 2): two columns, matching the two entries of `names`
# NOTE(review): the output below suggests each row becomes a new Sample (6 -> 9 samples) - confirm in the API docs
new_scalars = np.random.rand(3, 2)
dataset.add_tabular_scalars(new_scalars, names=['Tu', 'random_name'])

print(f"{dataset = }")
dprint("dataset scalars =", dataset.get_scalars_to_tabular())

dataset = Dataset(9 samples, 4 scalars, 2 fields)
dataset scalars = {
     Tu : [       nan        nan        nan        nan        nan        nan
 0.8211725  0.5322467  0.99084022]
     random_name : [        nan         nan         nan -0.74587743 -0.81683182 -0.38500395
  0.92351857  0.65170413  0.75017786]
     rotation : [-0.98980421 -0.20006775 -0.98980421  1.76098243  1.27352035  1.72448285
         nan         nan         nan]
     speed : [       nan        nan 0.75050305        nan        nan        nan
        nan        nan        nan]
}


### Set additional information on a Dataset

In [27]:
# Set the Dataset infos from a two-category dictionary, then pretty-print them
infos = {
    "legal": {"owner": "Safran", "license": "CC0"},
    "data_production": {"type": "simulation", "simulator": "dummy"},
}
dataset.set_infos(infos)
dataset.print_infos()

*********************** [34;1mdataset infos[0m **********************
[33;1mlegal[0m
  [32;1mowner[0m:Safran
  [32;1mlicense[0m:CC0
[33;1mdata_production[0m
  [32;1mtype[0m:simulation
  [32;1msimulator[0m:dummy
************************************************************



## Section 4: Saving and Loading Dataset

This section demonstrates how to save and load a Dataset from a directory or file.

### Save a Dataset as a file tree

In [28]:
# Build a randomly named directory path under the platform's temporary directory
# (portable, unlike a hard-coded '/tmp'). Integer bounds with an explicit int64
# dtype keep randint valid where NumPy's default integer is 32-bit.
# (`tempfile` is imported in the notebook's import cell)
tmpdir = os.path.join(
    tempfile.gettempdir(),
    f"test_safe_to_delete_{np.random.randint(10**10, 10**12, dtype=np.int64)}",
)
print(f"Save dataset in: {tmpdir}")

# Save the Dataset under `tmpdir` as a file tree
dataset._save_to_dir_(tmpdir)

Save dataset in: /tmp/test_safe_to_delete_100939411122


100%|██████████| 9/9 [00:00<00:00, 348.34it/s]


### Get the number of Samples that can be loaded from a directory

In [29]:
# Query how many Samples are stored under `tmpdir`, using a fresh Dataset instance to make the call
empty_ds = Dataset()
nb_samples = empty_ds._load_number_of_samples_(tmpdir)

print(f"{nb_samples = }")

nb_samples = 9


### Load a Dataset from a directory via initialization

In [30]:
# Load by passing the saved directory path directly to the Dataset constructor
loaded_dataset_from_init = Dataset(tmpdir)

print(f"{loaded_dataset_from_init = }")

loaded_dataset_from_init = Dataset(9 samples, 4 scalars, 2 fields)


### Load a Dataset from a directory via the Dataset class

In [31]:
# Load by calling load_from_dir on the Dataset class itself (no instance is created beforehand)
loaded_dataset_from_class = Dataset.load_from_dir(tmpdir)

print(f"{loaded_dataset_from_class = }")

loaded_dataset_from_class = Dataset(9 samples, 4 scalars, 2 fields)


### Load the dataset from a directory via a Dataset instance

In [32]:
# Create an empty Dataset first, then fill it from the saved directory
loaded_dataset_from_instance = Dataset()
loaded_dataset_from_instance._load_from_dir_(tmpdir)

print(f"{loaded_dataset_from_instance = }")

loaded_dataset_from_instance = Dataset(9 samples, 4 scalars, 2 fields)


### Save the dataset to a TAR (Tape Archive) file

In [33]:
# Build a new randomly named directory path under the platform's temporary
# directory (portable, unlike a hard-coded '/tmp'). Integer bounds with an explicit
# int64 dtype keep randint valid where NumPy's default integer is 32-bit.
# (`tempfile` is imported in the notebook's import cell)
tmpdir = os.path.join(
    tempfile.gettempdir(),
    f"test_safe_to_delete_{np.random.randint(10**10, 10**12, dtype=np.int64)}",
)
# Target archive file inside that directory
tmpfile = os.path.join(tmpdir, 'test_file.plaid')

print(f"Save dataset in: {tmpfile}")
dataset.save(tmpfile)

Save dataset in: /tmp/test_safe_to_delete_198618260988/test_file.plaid


100%|██████████| 9/9 [00:00<00:00, 575.59it/s]


### Load the dataset from a TAR (Tape Archive) file via Dataset instance

In [34]:
# Create an empty Dataset, then load the content of the saved file into it
new_dataset = Dataset()
new_dataset.load(tmpfile)

# Print the original and the reloaded Dataset side by side for comparison
print(f"{dataset = }")
print(f"{new_dataset = }")

dataset = Dataset(9 samples, 4 scalars, 2 fields)
new_dataset = Dataset(9 samples, 4 scalars, 2 fields)


### Load the dataset from a TAR (Tape Archive) file via initialization

In [35]:
# Load by passing the saved file path directly to the Dataset constructor
new_dataset = Dataset(tmpfile)

# Print the original and the reloaded Dataset side by side for comparison
print(f"{dataset = }")
print(f"{new_dataset = }")

dataset = Dataset(9 samples, 4 scalars, 2 fields)
new_dataset = Dataset(9 samples, 4 scalars, 2 fields)
