# Convert MATLAB Simulation Files to HDF5
The original data are stored in MATLAB v7.3 format as large tables.

We use [Mat73](https://pypi.org/project/mat73/) to read the data, then save the data into a documented format.

In [1]:
from batdata.schemas import BatteryMetadata
from batdata.data import BatteryDataset
from pathlib import Path
from mat73 import loadmat
from shutil import rmtree
from h5py import File
import pandas as pd
import numpy as np
import json

Configuration

In [2]:
# Translation table: raw column name in the simulation data -> column name used by the batdata schema
known_columns = dict(
    t='cycle_start',
    dt='cycle_duration',
    Q='energy_discharge',
    q='capacity_discharge',
    TdegC='temperature_average',
)

## Make the base metadata
All batteries will use the same base metadata.

In [3]:
# One metadata record, reused for every dataset written by this notebook; only the source is set here
metadata = BatteryMetadata(source='gasper_synthetic')

TODO: Make space in the schema for describing the code versions, inputs, etc.

Load the descriptions of the variables by parsing them from the README.

In [4]:
# Collect a description for every variable listed in the README that is not
# already covered by `known_columns`. Variables are expected to appear as
# bullets of the form "- <name>: <description>".
new_columns = {}
with open('README.md') as fp:
    for line in fp:
        if not line.startswith("-"):
            continue  # Only lines that begin with a bullet marker can describe a variable

        # Split on the first colon only, so descriptions may contain colons themselves.
        # `partition` never raises, unlike unpacking the result of `split`.
        name, colon, desc = line[1:].partition(":")
        name = name.strip()
        if not colon or not name:
            # Bullets without a "name: description" pair (or rules such as "---")
            # are not variable definitions, so skip them instead of failing
            continue

        if name not in known_columns:
            new_columns[name] = desc.strip()

In [5]:
# Attach the README-derived descriptions of the columns not in `known_columns` to the shared metadata
metadata.cycle_stats_columns = new_columns

## Iterate over Each File
Write out each row of the "constant" and "varying" input sets to a different HDF5 file.

In [6]:
# Always start from an empty output directory so files from earlier runs cannot linger
out_path = Path('processed')
has_old_output = out_path.exists()
if has_old_output:
    rmtree(out_path)  # Remove the whole tree left behind by a previous run
out_path.mkdir(exist_ok=True)

In [7]:
# Multiplier applied to the two time columns below: 24 h x 3600 s = seconds in one day
seconds_per_day = 3600 * 24

for name in ['constant', 'varying']:
    # Read the whole table of simulations for this kind of input
    all_data = loadmat(f'raw/ROVI - {name} inputs.mat')['simulations']

    # Names of the input and output columns, removed from the table once read
    input_cols = all_data.pop('Input_vars')[0]
    output_cols = all_data.pop('Output_vars')[0]

    # Each (inputs, outputs) pair is one input parameter set and becomes one HDF5 file
    parameter_sets = zip(all_data['Inputs'], all_data['Outputs'])
    for param_ind, (inputs, outputs) in enumerate(parameter_sets):
        inputs = inputs.astype(np.float32)
        # Expected to be 4D: n_samples x 1 x num_days x num_cols
        outputs = np.array(outputs, dtype=np.float32)

        h5_path = out_path / f'{name}-{param_ind}.h5'
        with pd.HDFStore(h5_path, complevel=9, complib='zlib') as store:
            # Every sample of this parameter set is stored in its own group of the file
            for run_ind, output in enumerate(outputs[:, 0, :, :]):
                # Put the inputs and the outputs side by side, one row per cycle
                table = np.concatenate([inputs, output], axis=1)
                data = pd.DataFrame(table, columns=input_cols + output_cols)
                data['cycle_number'] = np.arange(data.shape[0])

                # Switch to the batdata names where they exist, then express times in seconds
                data = data.rename(columns=known_columns)
                data[['cycle_duration', 'cycle_start']] *= seconds_per_day

                # Wrap in a battery dataset and check it before writing anything
                bd = BatteryDataset(cycle_stats=data, metadata=metadata)
                bd.validate()

                # Append to the open store under a prefix unique to this run
                bd.to_batdata_hdf(store, prefix=f'run_{run_ind}', append=True)

    del all_data  # Free the previous table before the next one is loaded