# Convert XCEL Data from BatteryData.Energy.Gov
The XCEL dataset is housed on BatteryData.Energy.Gov, which is backed by [CKAN's REST API](https://docs.ckan.org/en/2.9/api/#example-importing-datasets-with-the-ckan-api). This notebook uses that API to download the data and then converts it to HDF5 using `batdata`.

In [1]:
from batdata.extractors.batterydata import BDExtractor
from batdata.schemas import BatteryMetadata, BatteryDescription
from tempfile import TemporaryDirectory
from shutil import copyfileobj, rmtree
from pathlib import Path
import requests

Configuration

In [2]:
# CKAN dataset identifiers to download and convert
dataset_ids = ['xcel-round-2-slpc_reupload_2', 'xcel-round-1-slpc_reupload_2']
# Extractor used later in this notebook to group the files by cell and to parse them
# NOTE(review): presumably store_all=True keeps every column of the source files -- confirm against batdata
extractor = BDExtractor(store_all=True)

## Part 1: Functions to download metadata
Get the metadata from the collection description

In [3]:
# Fetch the CKAN package description of the first dataset
response = requests.get(
    'https://batterydata.energy.gov/api/3/action/package_show',
    params={'id': dataset_ids[0]},  # Let requests build and escape the query string
    timeout=15,
)
response.raise_for_status()  # Fail here rather than later on a missing 'result' key
description = response.json()

The response from CKAN gives a bunch of metadata fields. Our first step is to map them to fields in the batdata schema

In [4]:
# Display only the populated metadata fields, leaving out the long resource list
{
    key: value
    for key, value in description['result'].items()
    if value not in (None, '', []) and key != "resources"
}

{'cell_type': ['Pouch cell'],
 'creator_user_id': 'a853d711-0e37-44c9-80c9-a41d450c2da4',
 'date_dataset_created': '2018-08-16',
 'electrolyte_class_dataset': ['Organic liquid'],
 'id': 'ef9dec93-17a2-445a-b58e-dc3eadb1f79d',
 'isopen': False,
 'manufacturer_supplier': 'CAMP',
 'maximum_voltage': '4.1',
 'metadata_created': '2024-04-19T21:18:38.938069',
 'metadata_modified': '2024-04-20T00:45:59.866451',
 'minimum_voltage': '3',
 'name': 'xcel-round-2-slpc_reupload_2',
 'negative_electrode': ['Graphite'],
 'nominal_cell_capacity': '0.037',
 'notes': 'Single layer pouch cell from CAMP (2.5mAh/cm2) at various charge protocols (CCCV and Multi-step) Corresponds to paper by Tanvir Tanim et al, 2020 (https://doi.org/10.1016/j.xcrp.2020.100114)\r\n\r\nTest Condition\r\n\r\nCell 4, 5, 6: 6C - CCCV\r\n\r\nCell 7, 8, 9: 6C - MS1\r\n\r\nCell 11, 12 - MS5-1\r\n\r\nCell 13, 15 - MS5-2\r\n\r\nCell 16, 17, 18: 4C - CCCV\r\n\r\nCell 19, 20, 21 - 4C - MS1',
 'num_resources': 35,
 'num_tags': 9,
 'onec_

We need a function to convert them to our metadata format

In [5]:
def generate_metadata(desc: dict) -> BatteryMetadata:
    """Build the metadata shared by every cell in a dataset

    Args:
        desc: Dataset record taken from the CKAN metadata response
    Returns:
        Metadata holding the cell description, the data source,
        the dataset title, and the associated publications
    """

    def _joined(key: str) -> str:
        # These fields hold lists of strings; flatten each into one comma-separated name
        return ", ".join(desc[key])

    # Publications associated with the data
    papers = [
        'https://doi.org/10.1016/j.xcrp.2020.100114',
        'https://doi.org/10.1016/j.ensm.2021.07.001'
    ]

    # Construction details of the cell
    cell = BatteryDescription(
        manufacturer=desc['manufacturer_supplier'],
        design=_joined('cell_type'),
        anode={'name': _joined('negative_electrode')},
        cathode={'name': _joined('positive_electrode')},
        electrolyte={'name': _joined('electrolyte_class_dataset')},
        nominal_capacity=desc['nominal_cell_capacity'],
    )

    # Provenance of the dataset, wrapped around the cell description
    return BatteryMetadata(
        source=desc['organization']['title'],
        dataset_name=desc['title'],
        associated_ids=papers,
        battery=cell,
    )
# Preview the metadata built from the description fetched above
generate_metadata(description['result'])

BatteryMetadata(name=None, comments=None, version='0.3.1', is_measurement=True, cycler=None, start_date=None, set_temperature=None, schedule=None, battery=BatteryDescription(manufacturer='CAMP', design='Pouch cell', layer_count=None, anode=ElectrodeDescription(name='Graphite', supplier=None, product=None, thickness=None, area=None, loading=None, porosity=None), cathode=ElectrodeDescription(name='NMC532', supplier=None, product=None, thickness=None, area=None, loading=None, porosity=None), electrolyte=ElectrolyteDescription(name='Organic liquid', additives=[]), nominal_capacity=0.037), modeling=None, source='XCEL', dataset_name='XCEL Round 2 SLPC', authors=None, associated_ids=[Url('https://doi.org/10.1016/j.xcrp.2020.100114'), Url('https://doi.org/10.1016/j.ensm.2021.07.001')], raw_data_columns={}, cycle_stats_columns={}, eis_data_columns={})

## Grouping Data by Cell
The data for each cell is split across two files: a raw file and a summary file. We must group them together before parsing.

Each file is described under the `resources` key of the results

In [6]:
# Look at a single resource record (index 4) to see which fields describe a file
description['result']['resources'][4]

{'cache_last_updated': None,
 'cache_url': None,
 'ckan_url': 'https://batterydata.energy.gov',
 'created': '2024-04-19T21:19:19.797582',
 'datastore_active': True,
 'datastore_contains_all_records_of_source_file': True,
 'description': None,
 'format': 'CSV',
 'hash': 'e4bad0269b2ebe91e7800385f8457755',
 'id': '92ba973b-0cf2-4f43-a600-9b248b41976f',
 'ignore_hash': False,
 'last_modified': '2024-04-19T21:19:19.657813',
 'metadata_modified': '2024-04-20T00:02:45.740998',
 'mimetype': 'text/csv',
 'mimetype_inner': None,
 'name': 'P462 19 raw.csv',
 'original_url': 'https://batterydata.energy.gov/dataset/ef9dec93-17a2-445a-b58e-dc3eadb1f79d/resource/92ba973b-0cf2-4f43-a600-9b248b41976f/download/p462-19-raw.csv',
 'package_id': 'ef9dec93-17a2-445a-b58e-dc3eadb1f79d',
 'position': 4,
 'resource_id': '92ba973b-0cf2-4f43-a600-9b248b41976f',
 'resource_type': None,
 'set_url_type': False,
 'size': 13038502,
 'state': 'active',
 'task_created': '2024-04-19 21:19:20.326576',
 'url': 'https://b

We need a function that gathers the download URL of each CSV file, keyed by file name, so that the files can then be grouped by cell.

In [7]:
def get_file_paths(resc: list[dict]) -> dict[str, str]:
    """Get the download URLs of all CSVs that are part of the dataset

    Args:
        resc: Resource list. Each entry must provide ``format`` and ``url`` keys
    Returns:
        Map of file name to URL
    """
    # Imported locally so that the function is self-contained
    from pathlib import PurePosixPath
    from urllib.parse import urlparse

    # Take the name from the path component of the URL only, rather than
    #  slicing off a fixed-length scheme prefix, so that the scheme, host,
    #  query string, and fragment can never end up in the file name
    return {
        PurePosixPath(urlparse(x['url']).path).name: x['url']
        for x in resc
        if x['format'] == 'CSV'
    }
# Map each CSV file name to its download URL, then peek at the first entry
files = get_file_paths(description['result']['resources'])
next(iter(files.items()))

('p462-18-raw.csv',
 'https://batterydata.energy.gov/dataset/ef9dec93-17a2-445a-b58e-dc3eadb1f79d/resource/21d92861-34bd-4d89-b523-d4337f868400/download/p462-18-raw.csv')

The battery extractor from `batdata` holds logic to perform the grouping

In [8]:
# Show the first group of file names produced by the extractor
next(extractor.group(files.keys()))

['p462-18-raw.csv', 'p462-18-summary.csv']

## Perform the Download and Parse Loop
Collect metadata and files to parse together, then download and parse them, then save to HDF5

In [9]:
# Directory which will receive one HDF5 file per cell
hdf5_path = Path('processed')
# Delete the output of any previous run so that stale files do not linger
if hdf5_path.exists():
    rmtree(hdf5_path)
hdf5_path.mkdir(exist_ok=True)

In [10]:
for source in dataset_ids:
    # Get the metadata and file list
    response = requests.get(
        'https://batterydata.energy.gov/api/3/action/package_show',
        params={'id': source},  # Let requests build and escape the query string
        timeout=15,
    )
    response.raise_for_status()  # Fail here rather than later on a missing 'result' key
    description = response.json()
    base_metadata = generate_metadata(description['result'])
    files = get_file_paths(description['result']['resources'])

    # Parse each group of files
    for names in extractor.group(files.keys()):
        with TemporaryDirectory(dir='.') as tmp:
            # Start by downloading into a temporary directory
            tmp = Path(tmp)
            paths = []
            for name in names:
                url = files[name]
                out_path = tmp / name
                # The context manager releases the connection once the file is written
                with requests.get(url, stream=True, timeout=15) as download:
                    # Never save an HTTP error page as if it were data
                    download.raise_for_status()
                    # The raw stream is not decoded by default; undo any gzip/deflate
                    #  content encoding so that plain CSV bytes reach the disk
                    download.raw.decode_content = True
                    with out_path.open('wb') as fp:
                        copyfileobj(download.raw, fp)
                paths.append(out_path)

            # Parse them into a battery dataset
            #  The "-raw.csv" file sorts first; drop that 8-character suffix to get the cell name
            name = sorted(paths)[0].name[:-8]
            my_metadata = base_metadata.copy()
            my_metadata.name = name
            dataset = extractor.parse_to_dataframe(paths, metadata=my_metadata)

            # Save it to HDF5
            path = hdf5_path / f'{name}.h5'
            dataset.to_batdata_hdf(path, complevel=9)

  nrel_data = pd.read_csv(path)
