# Convert Kokam Data from BatteryData.Energy.Gov
The Kokam dataset is housed on BatteryData.Energy.Gov, which is backed by [CKAN's REST API](https://docs.ckan.org/en/2.9/api/#example-importing-datasets-with-the-ckan-api). This notebook uses that API to download the data, then converts it to HDF5 using `battdat`.

In [2]:
from battdat.io.batterydata import BDReader, generate_metadata
from battdat.schemas import BatteryMetadata, BatteryDescription
from tempfile import TemporaryDirectory
from shutil import copyfileobj, rmtree
from pathlib import Path
import requests

Configuration

In [4]:
# IDs of the datasets to convert; each one is used as the `id` of a `package_show` query below
dataset_ids = ['kokam-nmc-gr-75ah-accelerated-aging']
# Reader from `battdat` that is used below both to group the files by cell and to parse them
# NOTE(review): `store_all=True` presumably keeps columns that are not part of the battdat schema -- confirm against the BDReader docs
reader = BDReader(store_all=True)

## Grouping Data by Cell
The data for each cell are split across two files, which we must group together before parsing.

Each file is described by an entry under the `resources` key of the `result` field in the API response.

In [5]:
# Ask the CKAN API for the record of the first dataset, which includes its `resources` list
package_url = f'https://batterydata.energy.gov/api/3/action/package_show?id={dataset_ids[0]}'
response = requests.get(package_url, timeout=15)
description = response.json()

In [6]:
# Display the record of a single resource (file) as an example of the available fields
description['result']['resources'][4]

{'cache_last_updated': None,
 'cache_url': None,
 'ckan_url': 'https://batterydata.energy.gov',
 'created': '2024-03-05T16:54:49.292804',
 'datastore_active': True,
 'datastore_contains_all_records_of_source_file': True,
 'description': None,
 'format': 'CSV',
 'hash': '00b74580d8cbe9e39957b1282f11e2e0',
 'id': '43549194-c213-4ef6-ba43-87c2e01d7665',
 'ignore_hash': False,
 'last_modified': '2024-03-05T16:54:49.153749',
 'metadata_modified': '2024-03-05T17:23:49.938489',
 'mimetype': 'text/csv',
 'mimetype_inner': None,
 'name': 'Cell04 summary.csv',
 'original_url': 'https://batterydata.energy.gov/dataset/71c72c7b-39e3-4748-90f0-2d631c0f1df8/resource/43549194-c213-4ef6-ba43-87c2e01d7665/download/cell04-summary.csv',
 'package_id': '71c72c7b-39e3-4748-90f0-2d631c0f1df8',
 'position': 4,
 'resource_id': '43549194-c213-4ef6-ba43-87c2e01d7665',
 'resource_type': None,
 'set_url_type': False,
 'size': 963875,
 'state': 'active',
 'task_created': '2024-03-05 16:54:49.823804',
 'url': 'https

We need a function that gathers the download URL of each CSV file and indexes it by file name, so that the files can then be grouped by name.

In [7]:
def get_file_paths(resc: list[dict]) -> dict[str, str]:
    """Get the download URLs of all CSVs that are part of the dataset

    Args:
        resc: Resource list. Each entry must hold a `url` and a `format` key
    Returns:
        Map of file name to URL
    """
    from urllib.parse import urlsplit

    # Take the file name from the path component of the URL so that neither the
    #  scheme nor a query string or fragment can end up in the name
    return {
        Path(urlsplit(x['url']).path).name: x['url']
        for x in resc
        if x['format'] == 'CSV'
    }
# Map the name of each CSV file in the dataset to its URL, then display the first entry
files = get_file_paths(description['result']['resources'])
next(iter(files.items()))

('cell01-summary.csv',
 'https://batterydata.energy.gov/dataset/71c72c7b-39e3-4748-90f0-2d631c0f1df8/resource/8332d60a-422a-497a-b59e-5cb85a91c16a/download/cell01-summary.csv')

The battery data reader from `battdat` holds logic to perform the grouping

In [9]:
# Display the first group of file names produced by the reader
next(reader.group(files.keys()))

['cell01-summary.csv', 'cell01-raw.csv']

## Perform the Download and Parse Loop
For each dataset, collect the metadata and the groups of files to parse together, download and parse each group, then save the result to HDF5.

In [10]:
# Directory that receives the HDF5 output
hdf5_path = Path('processed')
# Remove whatever a previous run left behind so that the directory starts out empty
if hdf5_path.exists():
    rmtree(hdf5_path)
hdf5_path.mkdir(exist_ok=True)

In [11]:
# Text file that collects the URLs of failed downloads; delete the copy left by any previous run
bad_url_path = Path('bad-urls.txt')
bad_url_path.unlink(missing_ok=True)

In [20]:
# Download, parse and convert every group of files of every dataset
for source in dataset_ids:
    # Get the metadata and file list
    description = requests.get(f'https://batterydata.energy.gov/api/3/action/package_show?id={source}', timeout=15).json()
    base_metadata = generate_metadata(description['result'], ['https://doi.org/10.23919/ACC.2017.7963578'])
    files = get_file_paths(description['result']['resources'])

    # Parse each group of files
    for names in reader.group(files.keys()):
        with TemporaryDirectory(dir='.') as tmp:
            # Start by downloading into a temporary directory
            tmp = Path(tmp)
            paths = []
            is_bad = False
            for file_name in names:
                url = files[file_name]
                out_path = tmp / file_name
                # Use the response as a context manager so the streamed connection is always released
                with requests.get(url, stream=True, timeout=15) as response, out_path.open('wb') as fp:
                    # `.raw` yields the bytes as sent over the wire; have urllib3 undo any
                    #  gzip/deflate content encoding so that plain CSV lands on disk
                    response.raw.decode_content = True
                    copyfileobj(response.raw, fp)
                # Check if "HTML error" is in the file contents
                with out_path.open() as fp:
                    if '<html>' in fp.readline():
                        is_bad = True
                        # Record the URL so that failed downloads can be inspected later
                        with bad_url_path.open('a') as fo:
                            print(url, file=fo)
                    else:
                        paths.append(out_path)

            # Skip the whole group if any file failed to download: parsing an incomplete
            #  group would either fail on the empty list or store a partial dataset
            if is_bad:
                continue

            # Parse them into a battery metadata object
            # The "-raw.csv" name sorts before "-summary.csv"; dropping its last 8 characters leaves the cell name
            name = sorted(paths)[0].name[:-8]
            my_metadata = base_metadata.copy()
            my_metadata.name = name
            dataset = reader.read_dataset(paths, metadata=my_metadata)

            # Save it to HDF5
            path = hdf5_path / f'{name}.h5'
            dataset.to_hdf(path, complevel=9)

  nrel_data = pd.read_csv(path)
