# Convert MATLAB Simulation Files to HDF5
The original data are stored as large tables in MATLAB v7.3 format.

We use [Mat73](https://pypi.org/project/mat73/) to read the data, then save the data into a documented format.

In [1]:
from batdata.schemas import BatteryMetadata
from batdata.data import BatteryDataset
from pathlib import Path
from mat73 import loadmat
from h5py import File
import pandas as pd
import numpy as np
import json

## Make the base metadata
All batteries will use the same base metadata.

In [2]:
# Base metadata attached to every dataset written by this notebook; only the source is set
metadata = BatteryMetadata(source='gasper_synthetic')

TODO: Make space in the schema for describing the code versions, inputs, etc

## Iterate over Each File
Write out each row of the "constant" and "varying" input files to a different HDF5 file.

In [3]:
# Directory that receives the converted HDF5 files.
# An existing directory is reused; a missing parent directory is not created.
out_path = Path('processed')
out_path.mkdir(parents=False, exist_ok=True)

In [4]:
# Scratch store: no other cell in the visible notebook reads or writes through it.
# Opening in 'w' mode creates (or truncates) the file, so release the handle right away
# instead of leaving an HDF5 file open for the rest of the session.
store = pd.HDFStore(out_path / 'constant-0.h5', 'w')
store.close()

In [None]:
# Batdata names for the simulation columns it knows about; other columns keep their names
KNOWN_COLUMNS = {
    't': 'cycle_start',
    'dt': 'cycle_duration',
    'Q': 'energy_discharge',
    'q': 'capacity_discharge',
    'TdegC': 'temperature_average',
}

# Factor applied to the two time columns to express them in seconds
SECONDS_PER_DAY = 3600 * 24

for name in ('constant', 'varying'):
    # Read the 'simulations' entry of the MATLAB file for this kind of input
    all_data = loadmat(f'raw/ROVI - {name} inputs.mat')['simulations']

    # The column labels travel with the data; remove them from the loaded dictionary
    input_cols = all_data.pop('Input_vars')[0]
    output_cols = all_data.pop('Output_vars')[0]

    # Every row pairs one input parameter set with the outputs simulated for it
    for i, (inputs, outputs) in enumerate(zip(all_data['Inputs'], all_data['Outputs'])):
        inputs = inputs.astype(np.float32)
        # Outputs are 4D: n_samples x 1 x num_days x num_cols
        outputs = np.array(outputs, dtype=np.float32)

        # NOTE(review): the `:1` slice limits conversion to the first instance (j == 0) of
        # each parameter set; widen it to `outputs[:, 0, :, :]` if every instance should
        # be written -- confirm which is intended
        for j, output in enumerate(outputs[:1, 0, :, :]):
            # Input columns go to the left of this instance's output columns
            data = pd.DataFrame(
                np.concatenate((inputs, output), axis=1),
                columns=input_cols + output_cols,
            )
            data['cycle_number'] = np.arange(len(data))

            # Switch to the batdata column names, then put the time columns in seconds
            data = data.rename(columns=KNOWN_COLUMNS)
            time_cols = ['cycle_duration', 'cycle_start']
            data[time_cols] = data[time_cols] * SECONDS_PER_DAY

            # Wrap the table with the shared metadata and check it before writing
            bd = BatteryDataset(cycle_stats=data, metadata=metadata)
            bd.validate()

            # One file per (input kind, parameter set, instance)
            bd.to_batdata_hdf(out_path / f'{name}-{i}-{j}.h5', complevel=9, complib='zlib')

## 