# Simulation Data ETL

This notebook reads the .mat simulation data created for the CognitiveGrid project, converts it to an intermediate HDF5 file, and loads it into BTrDB.

In [1]:
import os 
import uuid
import glob
import json 
import btrdb
import scipy.io

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from btrdb.utils.timez import ns_delta, to_nanoseconds

In [2]:
# File Paths
# glob returns matches in arbitrary, filesystem-dependent order; sort so every run sees the same sequence
DATA_FILES = sorted(glob.glob("BESS_*.mat"))

## Extract MAT files to HDF5

In [3]:
def load_dataframes(files=DATA_FILES):
    """
    Load the simulation tables from .mat files and return a dictionary
    of pandas DataFrames, keyed by file name without directory or extension.

    Each file must contain a variable named after the file itself, holding
    a struct nested inside two length-1 levels with 12 fields, each of
    shape (100000, 1).

    Parameters
    ----------
    files : iterable of str
        Paths of the .mat files to load.

    Returns
    -------
    dict of str -> pandas.DataFrame
        One DataFrame per file, with one float64 column per struct field,
        named after the field.

    Raises
    ------
    KeyError
        If a file has no variable named after the file.
    AssertionError
        If the nesting depth, field count or column shape is unexpected.
    """
    dfs = {}

    for fpath in files:
        # Do not use str.rstrip(".mat") here: it strips any trailing run of the
        # characters '.', 'm', 'a', 't' (e.g. "BESS_format.mat" -> "BESS_for"),
        # not the extension. splitext removes exactly the suffix, and basename
        # keeps directory components out of the variable/HDF5 key.
        key = os.path.splitext(os.path.basename(fpath))[0]
        dat = scipy.io.loadmat(fpath)
        arr = dat[key]
        typ = arr.dtype

        # Unwrap the two length-1 levels to reach the record holding the 12 fields
        assert len(arr) == 1
        arr = arr[0]
        assert len(arr) == 1
        arr = arr[0]
        assert len(arr) == 12

        cols = []
        for idx, col in enumerate(arr):
            assert col.shape == (100000, 1)
            # Flatten the (n, 1) column to 1-D float64 in a single vectorized
            # step instead of converting row by row in Python
            values = np.asarray(col, dtype=np.float64).ravel()
            cols.append(pd.Series(values, name=typ.names[idx]))

        dfs[key] = pd.concat(cols, axis=1)

    return dfs


def save_hdf(data, path):
    """
    Write every DataFrame in ``data`` into a single HDF5 file at ``path``,
    storing each one under its dictionary key.
    """
    # Tables are appended one at a time, so clear out any stale file first
    stale = os.path.exists(path)
    if stale:
        os.remove(path)

    for name in data:
        data[name].to_hdf(path, key=name, mode='a')

In [4]:
# Run the extraction: parse every discovered .mat file, then persist the tables to one HDF5 file
data = load_dataframes(files=DATA_FILES)
save_hdf(data=data, path="pow_epfl.hdf")

## Read Data from HDF5 Files

In [27]:
# Timestamp given to the first sample: the (rounded) time of the GitHub commit of the files
START = "2018-08-29T15:30:00.000Z"

# HDF5 file the tables are read from
PATH = "pow_epfl.hdf"

# HDF5 keys of the tables, in the order get_streams lays them out in time
KEYS = [
    'BESS_0_to_m500', 'BESS_m500_to_p500', 'BESS_0_to_m200',
    'BESS_0_to_p200', 'BESS_idle', 'BESS_0_to_p500',
]

# Columns that get_streams extracts from every table
COLS = [
    'V1', 'V2', 'V3',
    'ia1', 'ia2', 'ia3',
    'ib1', 'ib2', 'ib3',
    'ic1', 'ic2', 'ic3',
]


def get_dataframe(path=PATH, key=KEYS[0], start=START, duration=2):
    """
    Read one table from the HDF5 file and index it by nanosecond timestamps.

    Rows are stamped at evenly spaced instants over the half-open window
    [start, start + duration), so the last row falls one sample period
    before the end of the window. Back-to-back windows therefore never
    share a timestamp.

    Parameters
    ----------
    path : str
        Path of the HDF5 file.
    key : str
        HDF5 key of the table to read.
    start : str or int
        Time of the first row; anything accepted by ``to_nanoseconds``.
    duration : int or float
        Length of the window covered by the table, in seconds.

    Returns
    -------
    pandas.DataFrame
        The table with an int64 index of nanosecond timestamps.
    """
    start = to_nanoseconds(start)
    df = pd.read_hdf(path, key=key)

    # Size the time axis from the data rather than a hard-coded row count, and
    # compute offsets in integer nanoseconds. The earlier float
    # linspace(0, 2, 100000) included the endpoint, which made the spacing
    # 2/99999 s and put the last row exactly on the next window's first timestamp.
    n = len(df)
    span = int(round(duration * 1e9))
    offsets = (np.arange(n, dtype=np.int64) * span) // max(n, 1)

    df.index = start + offsets
    return df


def get_streams(path=PATH, keys=KEYS, start=START, cols=COLS, interval=2):
    """
    Generate one pandas Series per column, each spanning all tables.

    Every key in ``keys`` is loaded with ``get_dataframe``; the table at
    position ``i`` starts ``i * interval`` seconds after ``start``. For each
    name in ``cols``, that column is concatenated across the tables in key
    order and yielded as a Series named after the column.
    """
    origin = to_nanoseconds(start)
    step = ns_delta(seconds=interval)

    # Load every table once, shifted to its own slot on the time axis
    frames = [
        get_dataframe(path, key, origin + (pos * step))
        for pos, key in enumerate(keys)
    ]

    for col in cols:
        series = pd.concat([frame[col] for frame in frames], axis=0)
        series.name = col
        yield series

In [19]:
# Preview the first rows of the default table (KEYS[0]) with its nanosecond timestamp index
get_dataframe().head()

Unnamed: 0,V1,V2,V3,ia1,ia2,ia3,ib1,ib2,ib3,ic1,ic2,ic3
1535556600000000000,3.33659,1.4362,-4.77706,0.293283,0.083316,-0.267342,-0.622578,0.200201,0.333262,-0.205084,0.409253,-0.496841
1535556600000020000,3.31034,1.46428,-4.78225,0.293893,0.081179,-0.277413,-0.617695,0.203864,0.331126,-0.211493,0.426649,-0.509354
1535556600000040000,3.29112,1.49541,-4.78866,0.302438,0.082095,-0.281991,-0.619221,0.201117,0.326853,-0.225837,0.439772,-0.509354
1535556600000060000,3.26487,1.52928,-4.79415,0.303354,0.097354,-0.292978,-0.619221,0.204779,0.333567,-0.225837,0.437025,-0.501724
1535556600000080000,3.23923,1.55889,-4.80087,0.309763,0.09949,-0.300607,-0.612812,0.206915,0.327158,-0.234077,0.427564,-0.481277


In [28]:
# get_streams is a generator function: this only creates the generator, and no HDF5 data is read until it is iterated
streams = get_streams()