# Convert Data into HDF5 format
Get the metadata from the cell files

In [1]:
from batdata.extractors.arbin import ArbinExtractor
from batdata.schemas import BatteryMetadata
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import shutil
import os
import re

Configuration

In [2]:
# Directory the raw CSV files are read from
source_dir = Path('raw')

# Directory the converted files are written to; create it (and any missing
# parents) up front, and do nothing if it is already there
output_dir = Path('processed/')
output_dir.mkdir(parents=True, exist_ok=True)

Cells to combine based on [2017-05-12 metadata](https://data.matr.io/1/projects/5c48dd2bc625d700019f3204/batches/5c86c0b5fa2ede00015ddf67)

In [3]:
# Map of (date_1, date_2) -> (list of channels to combine)
# The files recorded on these channels on the two dates are treated as a single cell
channels_to_combine = {
    ('2017-05-12', '2017-06-30'): [1, 2, 3, 5, 6],
}

Cells to exclude. Pairs of (date, channels)

In [4]:
# Test date -> channels from that date which are dropped from the dataset
to_exclude = dict([
    ('2017-06-30', [10]),  # Died early for unknown reasons
    ('2018-04-12', [46]),  # Electrical problems
    ('2018-08-02', [17, 27]),  # Failed to start
    ('2019-01-24', [4, 5, 45, 46, 48]),  # Cycling issues
])

## Gather Metadata of Cells
Gather the metadata about our cells given their filenames and other information we know about the tests

Define the regex for reading metadata from filenames

Some files, like `2017-05-12_5_4C-50per_3_6C_CH21.csv`, are from a canvas of different charging policies. Their filenames follow the pattern `<date>_<charge_1>C-<SOC_switch>per_<charge_2>C_CH<channel_number>.csv`, where
  - date: Test date
  - charge_1: First charge rate
  - charge_2: Second charge rate
  - SOC_switch: SOC at which charge rate switches
  - channel_number: Channel number in the testing machine


In [5]:
# Pattern for the "canvas" filenames, written one filename field per line.
# All groups are captured as strings.
canvas_regex = re.compile(
    r"(?P<date>\d{4}-\d{2}-\d{2})"  # Test date
    r"_(?P<charge1>[\d_]+)C"        # First charge rate: digits and underscores, e.g. "5_4"
    r"-(?P<soc_switch>\d+)per"      # SOC at which the charge rate switches
    r"_(?P<charge2>[\d_]+)C"        # Second charge rate
    r"_CH(?P<channel>\d+)"          # Channel number in the testing machine
    r"\.csv"
)

Others, like `2019-01-24_batch9_CH6.csv`, have a less informative batch name

In [6]:
# Fallback pattern for filenames that carry only a batch name.
# All groups are captured as strings.
other_regex = re.compile(
    r"(?P<date>\d{4}-\d{2}-\d{2})"  # Test date
    r"_(?P<name>\w+)"               # Batch name, e.g. "batch9"
    r"_CH(?P<channel>\d+)"          # Channel number
    r"\.csv"
)

Most of the metadata is the same for all of the batteries

In [7]:
# Fields that are identical for every battery; the export step merges them
# with the dataset-specific fields before building each cell's metadata
test_metadata = dict(
    cycler='Arbin LBT Potentiostat',
    set_temperature=30.0,  # presumably degrees C -- TODO confirm against the schema
    manufacturer='A123 Systems',
    design='APR18650M1A',
    nominal_capacity=1.1,  # presumably A-hr -- TODO confirm against the schema
    anode='graphite',
    cathode='LFP',
    source='Stanford University',
)

The source dataset (the publication and the data repository it is hosted in) depends on the test date.

In [8]:
def get_metadata_by_date(date: str) -> dict:
    """Look up which published dataset a test belongs to

    The date is compared as a string, so it must be written year-first
    (e.g., ``2018-08-02``). Anything that sorts after ``'2018-05'`` is
    attributed to Attia et al.; everything else to Severson et al.

    Args:
        date: Test date as a string
    Returns:
        A new dictionary with the ``dataset_name`` and the ``associated_ids``
        (DOI and data repository URL) of the dataset
    """
    if date > '2018-05':
        name = 'Attia et al., Nature (2020)'
        doi = 'https://doi.org/10.1038/s41586-020-1994-5'
        project = 'https://data.matr.io/1/projects/5d80e633f405260001c0b60a'
    else:
        name = 'Severson et al., Nature Energy (2019)'
        doi = 'https://doi.org/10.1038/s41560-019-0356-8'
        project = 'https://data.matr.io/1/projects/5c48dd2bc625d700019f3204'

    # Built fresh on every call so callers are free to modify the result
    return {'dataset_name': name, 'associated_ids': [doi, project]}
get_metadata_by_date('2020-10')

{'dataset_name': 'Attia et al., Nature (2020)',
 'associated_ids': ['https://doi.org/10.1038/s41586-020-1994-5',
  'https://data.matr.io/1/projects/5d80e633f405260001c0b60a']}

In [9]:
def metadata_from_filename(path: Path) -> dict:
    """Parse the metadata encoded in the name of a cell's data file

    The file name is tried against ``canvas_regex`` first and ``other_regex``
    second; the named groups of the first pattern that matches are used.

    Args:
        path: Path to an Arbin file
    Returns:
        Dictionary with the named groups of the matching pattern (``channel``
        converted to an integer, everything else left as strings) plus the
        ``path`` and ``filename`` of the file
    Raises:
        ValueError: If the file name matches neither pattern
    """
    filename = path.name

    # Lazily try each pattern in order and keep the first one that matches
    attempts = (pattern.match(filename) for pattern in (canvas_regex, other_regex))
    match = next((hit for hit in attempts if hit is not None), None)
    if match is None:
        raise ValueError(f'Failed to parse: {filename}')

    # Start from the regex groups, then add the fields not read from the name
    fields = match.groupdict()
    fields.update(channel=int(fields['channel']), path=path, filename=filename)
    return fields

In [10]:
# Sort the file list so the records are always produced in the same order
csv_paths = sorted(source_dir.glob('*.csv'))
metadata = list(map(metadata_from_filename, csv_paths))
print(f'Loaded data from {len(metadata)} cells')

Loaded data from 376 cells


In [11]:
# Turn the list of per-file records into a table with one row per file.
# Note that `metadata` is rebound from a list to a DataFrame here.
metadata = pd.DataFrame(metadata)
metadata.head()

Unnamed: 0,date,charge1,soc_switch,charge2,channel,path,filename,name
0,2017-05-12,3_6,80,3_6,1,raw/2017-05-12_3_6C-80per_3_6C_CH1.csv,2017-05-12_3_6C-80per_3_6C_CH1.csv,
1,2017-05-12,3_6,80,3_6,2,raw/2017-05-12_3_6C-80per_3_6C_CH2.csv,2017-05-12_3_6C-80per_3_6C_CH2.csv,
2,2017-05-12,3_6,80,3_6,3,raw/2017-05-12_3_6C-80per_3_6C_CH3.csv,2017-05-12_3_6C-80per_3_6C_CH3.csv,
3,2017-05-12,4,80,4,5,raw/2017-05-12_4C-80per_4C_CH5.csv,2017-05-12_4C-80per_4C_CH5.csv,
4,2017-05-12,4,80,4,6,raw/2017-05-12_4C-80per_4C_CH6.csv,2017-05-12_4C-80per_4C_CH6.csv,


## Drop Bad Cells
Some cells had errors and, for now, we'll drop the data rather than figure out if we could use it

In [12]:
for batch_date, bad_channels in to_exclude.items():
    # A row is dropped only if it matches both the date and one of the channels
    in_batch = metadata['date'] == batch_date
    on_bad_channel = metadata['channel'].isin(bad_channels)
    metadata.drop(metadata.index[in_batch & on_bad_channel], inplace=True)
print(f'Reduced data to {len(metadata)} cells')

Reduced data to 372 cells


## Group Cells
Some experiments are continuations of previous ones

In [13]:
# Start with one ID per file; files that belong to the same cell are merged below
metadata['cell_id'] = range(len(metadata))

# Order the rows by test date, then channel, to have a reproducible order
metadata.sort_values(by=['date', 'channel'], ascending=True, inplace=True)

for batches, channels in channels_to_combine.items():
    in_batches = metadata['date'].isin(batches)
    for channel in channels:
        # Rows for this channel on any of the dates being combined
        rows = metadata.index[in_batches & (metadata['channel'] == channel)]
        # Every one of them takes the smallest ID found among them
        metadata.loc[rows, 'cell_id'] = metadata.loc[rows, 'cell_id'].min()

n_cells = metadata['cell_id'].nunique()
print(f'There are {n_cells} unique cells')

There are 367 unique cells


## Output into HDF5 format
For each unique cell, combine the records

In [14]:
# Parser for the Arbin files; one instance is reused for every cell in the export loop
extractor = ArbinExtractor()

In [15]:
for _, subset in tqdm(metadata.groupby('cell_id'), total=n_cells):
    # The first row of the group supplies the name, date and source of the cell
    cell_metadata = subset.iloc[0]

    # Fields shared by every cell, overlaid with the dataset the test date belongs to
    source_fields = {**test_metadata, **get_metadata_by_date(cell_metadata['date'])}

    # Make a metadata object, named after the file without its ".csv" extension
    my_metadata = BatteryMetadata(
        name=cell_metadata['filename'][:-4],
        start_date=cell_metadata['date'],
        **source_fields
    )

    # Parse every file of this cell into a single object
    files = subset['path'].tolist()
    cell_data = extractor.parse_to_dataframe(files, metadata=my_metadata)

    # Save it to disk with the highest compression level
    out_path = output_dir / f'{my_metadata.name}.h5'
    cell_data.to_batdata_hdf(out_path, complevel=9)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 367/367 [32:23<00:00,  5.30s/it]
