In [1]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
def _decode_matlab_string(char_codes):
    """Join an array of integer character codes into a Python string."""
    return ''.join(chr(int(code)) for code in np.asarray(char_codes).flatten())


def parse_mat_file(mat_file):
    """
    Parse an HDF5-based .mat file and collect its signal groups in a DataFrame.

    The file must contain the top-level entries 'C', 'D', 'R' and 'X'. They are
    only checked for presence; their contents are not read or returned. Every
    other top-level entry (apart from the '#refs#' group, if there is one) is
    treated as a signal group that provides:

    * ``group['time']`` - the first row is used as the time axis;
    * ``group['signals']['label']`` - two references to character-code arrays;
    * ``group['signals']['values']`` - two references to value arrays.

    Args:
        mat_file: Path to the .mat file.

    Returns:
        tuple: ``(df, label_1, label_2, name)`` where

        * ``df`` (pd.DataFrame) has a 'time' column plus, for every signal
          group ``key``, the columns ``key`` and ``key_1`` holding its two
          flattened value arrays. 'time' is overwritten by each group, so it
          ends up holding the time axis of the last group processed.
        * ``label_1`` / ``label_2`` are the decoded labels of the most recent
          group whose first label was a 2-D numpy array, or ``None`` when no
          group provided one.
        * ``name`` is the file name without its extension.

    Raises:
        KeyError: If any of 'C', 'D', 'R', 'X' is missing from the file.
        ValueError: Raised by pandas when the collected arrays differ in length.
    """
    required_keys = ('C', 'D', 'R', 'X')
    name = Path(mat_file).stem

    # Bind the labels up front so that a file without decodable labels returns
    # None instead of failing with UnboundLocalError at the return statement.
    label_1 = label_2 = None

    # Open the MATLAB file using h5py
    with h5py.File(mat_file, 'r') as f:
        # Filter rather than list.remove(): '#refs#' may be absent, and
        # remove() would raise ValueError in that case.
        top_level = [key for key in f.keys() if key != '#refs#']

        if not all(key in top_level for key in required_keys):
            raise KeyError("Required keys not found in the file.")
        signal_keys = [key for key in top_level if key not in required_keys]

        # Note: MATLAB arrays in HDF5 format are transposed
        # Collect the columns first and build the DataFrame once at the end.
        columns = {}
        for key in signal_keys:
            signals = f[key]['signals']
            timestep = f[key]['time'][0].flatten()

            # Labels are stored as references; indexing the file with a
            # reference yields the array of character codes.
            label_ref = signals['label']
            label_data_1 = f[label_ref[0][0]][()]
            label_data_2 = f[label_ref[1][0]][()]
            if isinstance(label_data_1, np.ndarray) and label_data_1.ndim == 2:
                label_1 = _decode_matlab_string(label_data_1)
                label_2 = _decode_matlab_string(label_data_2)
            else:
                # Best effort: report the problem and keep the previous labels.
                print("Label data is not a numpy array. Type:", type(label_data_1))

            value_ref = signals['values']
            columns['time'] = timestep
            columns[f'{key}'] = f[value_ref[0][0]][()].flatten()
            columns[f'{key}_1'] = f[value_ref[1][0]][()].flatten()

        df = pd.DataFrame(columns)

    return df, label_1, label_2, name

In [3]:
# Dataset location: <data_path>/<data_name>/
data_path = 'data'
data_name = 'HIF'
data_dir = Path(data_path, data_name)

# Every .mat file in that directory, in sorted path order
mat_files = sorted(data_dir.glob('*.mat'))

In [4]:
# Parse each .mat file in turn and print its two labels followed by the file stem.
for file in mat_files:
    parsed = parse_mat_file(file)
    df, label_1, label_2, name = parsed
    print(*parsed[1:])
    
    

CVT Real Data1
CVT Real Data10
CVT Real Data100
CVT Real Data101
CVT Real Data102
CVT Real Data103
CVT Real Data104
CVT Real Data105
CVT Real Data106
CVT Real Data107
CVT Real Data108
CVT Real Data109
CVT Real Data11
CVT Real Data110
CVT Real Data111
CVT Real Data112
CVT Real Data113
CVT Real Data114
CVT Real Data115
CVT Real Data116
CVT Real Data117
CVT Real Data118
CVT Real Data119
CVT Real Data12
CVT Real Data120
CVT Real Data121
CVT Real Data122
CVT Real Data123
CVT Real Data124
CVT Real Data125
CVT Real Data126
CVT Real Data127
CVT Real Data128
CVT Real Data129
CVT Real Data13
CVT Real Data130
CVT Real Data131
CVT Real Data132
CVT Real Data133
CVT Real Data134
CVT Real Data135
CVT Real Data136
CVT Real Data137
CVT Real Data138
CVT Real Data139
CVT Real Data14
CVT Real Data140
CVT Real Data141
CVT Real Data142
CVT Real Data143
CVT Real Data144
CVT Real Data145
CVT Real Data146
CVT Real Data147
CVT Real Data148
CVT Real Data149
CVT Real Data15
CVT Real Data150
CVT Real Data151
CVT R