Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce fields in schema to describe synthetic data #64

Merged
merged 10 commits into from
May 10, 2024
8 changes: 7 additions & 1 deletion batdata/schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from pydantic import BaseModel, Field, AnyUrl, Extra

from batdata.schemas.modeling import ModelMetadata
from batdata.schemas.battery import BatteryDescription
from batdata.version import __version__

Expand All @@ -20,7 +21,9 @@ class BatteryMetadata(BaseModel, extra=Extra.allow):
name: Optional[str] = Field(None, description="Name of the cell. Any format for the name is acceptable,"
" as it is intended to be used by the battery data provider.")
comments: Optional[str] = Field(None, description="Long form comments describing the test")
version: str = Field(__version__, description="Version of this metadata")
version: str = Field(__version__, description="Version of this metadata. Set by the battery-data-toolkit")
is_measurement: bool = Field(True, description="Whether the data was created observationally as opposed to a computer simulation",
iri="https://w3id.org/emmo#EMMO_463bcfda_867b_41d9_a967_211d4d437cfb")

# Fields that describe the test protocol
cycler: Optional[str] = Field(None, description='Name of the cycling machine')
Expand All @@ -31,6 +34,9 @@ class BatteryMetadata(BaseModel, extra=Extra.allow):
# Field that describe the battery assembly
battery: Optional[BatteryDescription] = Field(None, description="Description of the battery being tested")

# Fields that describe source of synthetic data
modeling: ModelMetadata = Field(None, description="Description of simulation approach")

# Fields that describe the source of data
source: Optional[str] = Field(None, description="Organization who created this data")
dataset_name: Optional[str] = Field(None, description="Name of a larger dataset this data is associated with")
Expand Down
44 changes: 44 additions & 0 deletions batdata/schemas/modeling.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I understand what this is used for...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I get it now, oops...

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Metadata which describes how data produced by models were generated"""
from typing import Optional, List
from enum import Enum

from pydantic import BaseModel, Field, AnyUrl


class ModelTypes(str, Enum):
    """Type of computational method

    The value of each member is the string stored in the metadata.
    The string literal following each member documents it and lists
    the IRI of the corresponding ontology term.
    """

    physics = 'physics'
    """A computational application that uses a physical model to predict the behaviour of a system,
    providing an identifiable analogy with the original object.

    IRI: https://w3id.org/emmo#EMMO_8d4962d7_9608_44f7_a2f1_82a4bb173f4a"""

    # Members must be assigned with ``=``; a bare annotation (``data: 'data'``) does not create an enum member
    data = 'data'
    """A computational application that uses existing data to predict the behaviour of a system
    without providing an identifiable analogy with the original object.

    IRI: https://w3id.org/emmo#EMMO_a4b14b83_9392_4a5f_a2e8_b2b58793f59b"""

    empirical = 'empirical'
    """A computational application that uses an empiric equation to predict the behaviour of a system
    without relying on the knowledge of the actual physical phenomena occurring in the object.

    IRI: https://w3id.org/emmo#EMMO_67c70dcd_2adf_4e6c_b3f8_f33dd1512487"""


class ModelMetadata(BaseModel):
    """Describe the type and version of a computational tool used to generate battery data

    Only ``name`` is required. Every other field defaults to ``None``
    so that it can be omitted when the information is not known.
    """

    # High-level information about the code
    name: str = Field(..., description='Name of the software')
    # Defaults to None: the version is documented as "if known", so it must not be mandatory
    version: Optional[str] = Field(None, description='Version of the software if known')
    type: Optional[ModelTypes] = Field(None, description='Type of the computational method it implements.')
    reference: Optional[List[AnyUrl]] = Field(None, description='List of references associated with the software')

    # Details for physics based simulation
    # ``root_iri`` marks the ontology term whose descendants are the suggested values for the field
    model_type: Optional[List[str]] = Field(None, description='Type of mathematical model(s) being used in physics simulation. '
                                                              'Use terms defined in BattINFO, such as "BatteryEquivalentCircuitModel".',
                                            root_iri='https://w3id.org/emmo#EMMO_f7ed665b_c2e1_42bc_889b_6b42ed3a36f0')
    simulation_type: Optional[str] = Field(None, description='Type of simulation being performed. '
                                                             'Use terms defined in BattINFO, such as "TightlyCoupledModelsSimulation"',
                                           root_iri='https://w3id.org/emmo#EMMO_e97af6ec_4371_4bbc_8936_34b76e33302f')
102 changes: 102 additions & 0 deletions batdata/schemas/ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Tools used for linking terms in our data format to the BattINFO ontology"""
from dataclasses import dataclass, field
from functools import cache
from typing import Type, List, Optional, Union

from ontopy import World
from owlready2 import Thing
from pydantic import BaseModel

_battinfo_url = 'https://raw.githubusercontent.com/emmo-repo/domain-battery/master/battery-inferred.ttl'


@cache
def load_battinfo():
    """Load the BattINFO ontology from the location stored in ``_battinfo_url``

    The result is memoized with ``functools.cache``, so the ontology
    is only loaded on the first call.

    Returns:
        The loaded ontology
    """
    world = World()
    ontology = world.get_ontology(_battinfo_url)
    return ontology.load()


@dataclass
class TermInfo:
    """Information about a term as referenced from the BattINFO ontology"""

    name: str
    """Name of the matching term"""
    iri: str = field(repr=False)
    """IRI of the term"""
    elucidation: Optional[str] = field(repr=False)
    """Explanation of the term"""

    @classmethod
    def from_thing(cls, thing: 'Thing'):
        """Build a description from a term of the ontology

        Args:
            thing: Term from the ontology. Must provide an ``iri`` attribute and
                a ``get_annotations`` method, and its string form is used as the name
        Returns:
            Description of the term, as an instance of the class on which this method was called
        """
        # Retrieve the description, as provided by EMMO
        eluc = thing.get_annotations().get('elucidation')
        if eluc is not None:
            eluc = str(eluc)
        # Build through ``cls`` rather than ``TermInfo`` so that subclasses produce instances of themselves
        return cls(name=str(thing), iri=thing.iri, elucidation=eluc)


def cross_reference_terms(model: Type[BaseModel]) -> dict[str, TermInfo]:
    """Gather the descriptions of fields from our schema which
    are cross-referenced to a term within the BattINFO/EMMO ontologies

    Args:
        model: Schema object to be cross-referenced
    Returns:
        Mapping between the name of each cross-referenced metadata field
        and the description of the matching term from the ontology.
        Fields without an ``iri`` are omitted.
    Raises:
        ValueError: If the IRI listed for a field is not found in the ontology
    """

    # Load the BattINFO ontology
    battinfo = load_battinfo()

    # Loop over each field in the schema
    terms = {}
    for name, attr in model.model_fields.items():
        # ``json_schema_extra`` may be unset or a callable rather than a mapping;
        # only a mapping can carry the ``iri`` key
        extra = attr.json_schema_extra
        if not isinstance(extra, dict):
            continue
        iri = extra.get('iri')
        if iri is None:
            continue

        # Map to the term in the ontology
        term = battinfo.search_one(iri=iri)
        if term is None:
            raise ValueError(f'Could not find matching term for {name} with iri={iri}')
        terms[name] = TermInfo.from_thing(term)

    return terms


def resolve_term(name_or_iri: str) -> Thing:
    """Resolve the Term object associated with a string

    Args:
        name_or_iri: The preferred label or the IRI of a term in the ontology.
            Strings starting with ``http://`` or ``https://`` are treated as IRIs
    Returns:
        Thing matching the term
    Raises:
        ValueError: If no term matches the name or IRI
    """

    # Attempt to find it
    bi = load_battinfo()
    # Accept both URL schemes: a string beginning with either cannot be a preferred label
    if name_or_iri.startswith(('http://', 'https://')):
        term = bi.search_one(iri=name_or_iri)
        t = 'IRI'
    else:
        term = bi.search_one(prefLabel=name_or_iri)
        t = 'name'

    if term is None:
        raise ValueError(f'Could not find the {t}={name_or_iri}')
    return term


def gather_descendants(term: Union[Type[Thing], str]) -> List[TermInfo]:
    """Describe every descendant of a base term

    Args:
        term: Base term whose descendants are gathered. Either the class object itself,
            or a string (preferred label or IRI) which is resolved with ``resolve_term``
    Returns:
        Descriptions of the descendants, excluding the base term itself
    """

    # Strings are looked up in the ontology; class objects are used directly
    root = resolve_term(term) if isinstance(term, str) else term

    descriptions = []
    for child in root.descendants(include_self=False):
        descriptions.append(TermInfo.from_thing(child))
    return descriptions
87 changes: 87 additions & 0 deletions docs/schemas.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Battery Data Schemas

The metadata schemas used by `batdata` standardize how we describe the source of battery datasets.
Metadata are held as part of the `BatteryDataset` object and saved within the file formats
produced by `batdata` to ensure that the provenance of a dataset is kept alongside the actual data.


## Understanding the Metadata

The metadata we employ in `batdata` follows the style of the JSON or XML data structures which are ubiquitous
in scientific computation and data infrastructure.

We recommend creating the metadata for a battery through the Python interface.
Start by creating a `BatteryMetadata` object. There are no required fields, but you should always give your data a name.

```python
from batdata.schemas import BatteryMetadata

metadata = BatteryMetadata(
name='test-cell',
)
```

The metadata is a nested document where different types of information are grouped together into sub objects.
For example, the details about the battery being tested are in `BatteryDescription`:

```python
from batdata.schemas.battery import BatteryDescription
from batdata.schemas import BatteryMetadata

metadata = BatteryMetadata(
name='test-cell',
battery=BatteryDescription(
manufacturer='famous',
nominal_capacity=1.,
)
)
```

See the [schemas module](https://github.com/ROVI-org/battery-data-toolkit/tree/main/batdata/schemas)
for a full accounting of the available fields in our schema.

> TODO: Render the schemas into an easier-to-read format

Feel free to add your own fields to any part of the schema.
The schema is a continual work in progress and the battery-data-toolkit will
store your new fields.
Consider adding [an Issue](https://github.com/ROVI-org/battery-data-toolkit/issues) to our GitHub
if you find you use a term enough that we should add it to the schema.

### Source of Terminology

We use terms from the [BattINFO ontology](https://big-map.github.io/BattINFO/index.html) wherever possible.

Fields in the schema whose names correspond to a BattINFO term are marked
with the "IRI" of the field, which points to a website containing the description.

Fields whose values should be terms from the BattINFO ontology are marked with the root of the terms.
For example, the `model_type` field of `ModelMetadata` can be any type of
[MathematicalModel](https://emmo-repo.github.io/emmo.html#EMMO_f7ed665b_c2e1_42bc_889b_6b42ed3a36f0).
Look them up using some utilities in `batdata`.

```python
from batdata.schemas.ontology import gather_descendants

print(gather_descendants('MathematicalModel'))
```

> TODO: Render the options in web-hosted documentation as well

## Column Datasets

The columns of datasets are described in the [cycling module](https://github.com/ROVI-org/battery-data-toolkit/blob/main/batdata/schemas/cycling.py).

Use the descriptions here when formatting your dataset, paying attention to the sign conventions and units for each column.

Record columns that are not defined in our schema in the `*_columns` fields
of the `BatteryMetadata`.

```python
from batdata.schemas import BatteryMetadata

metadata = BatteryMetadata(
name='test_cell',
raw_data_columns={'new_signal': 'A column not yet defined in our schemas.'}
)
```
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"h5py == 3.*",
"scythe-extractors >= 0.1",
"pyarrow >= 15",
"EMMOntoPy",
"xlrd"
]

Expand Down
26 changes: 26 additions & 0 deletions tests/schemas/test_ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Test the ability to resolve cross-references from the ontology"""

from batdata.schemas import BatteryMetadata
from batdata.schemas.ontology import cross_reference_terms, gather_descendants, load_battinfo, resolve_term


def test_crossref():
    """A field of ``BatteryMetadata`` marked with an IRI is matched to its ontology term"""
    found = cross_reference_terms(BatteryMetadata)
    assert 'is_measurement' in found

    # Check each part of the description for the matched term
    info = found['is_measurement']
    assert info.name == 'emmo.Measurement'
    assert 'EMMO' in info.iri
    assert 'well defined mesurement procedure.' in info.elucidation


def test_resolve():
    """Terms are found both by preferred label and by IRI"""
    lookups = (
        'PhysicsBasedSimulation',
        'https://w3id.org/emmo#EMMO_f7ed665b_c2e1_42bc_889b_6b42ed3a36f0',
    )
    for key in lookups:
        assert resolve_term(key) is not None


def test_descendants():
    """Descendants are gathered from either the class object or its preferred label"""
    ontology = load_battinfo()
    expected = 'emmo.StandaloneModelSimulation'

    # Starting from the class object
    from_class = [info.name for info in gather_descendants(ontology.PhysicsBasedSimulation)]
    assert expected in from_class

    # Starting from the preferred label
    from_label = [info.name for info in gather_descendants('PhysicsBasedSimulation')]
    assert expected in from_label
Loading