# Store in Parquet Files
We'll use PyArrow to write data to a Parquet file with an explicit storage type for each column.

In [1]:
from datetime import datetime, timedelta
from tempfile import TemporaryDirectory
from pyarrow.parquet import write_table
from collections import defaultdict
from pathlib import Path
import pyarrow as pa
import pandas as pd
import numpy as np
import json

Configuration

In [2]:
# Columns to keep, each mapped to the Arrow type it is stored as
to_store = dict(
    Current_A=pa.uint16(),
    Voltage_V=pa.uint16(),
    Cell_Temperature_C=pa.uint16(),
    Datetime=pa.timestamp('ms'),
)
# Keyword arguments forwarded to the Parquet writer and echoed into the results log
compression = dict(compression='SNAPPY')

In [3]:
# Show the column -> Arrow type mapping as a quick sanity check
to_store

{'Current_A': DataType(uint16),
 'Voltage_V': DataType(uint16),
 'Cell_Temperature_C': DataType(uint16),
 'Datetime': TimestampType(timestamp[ms])}

## Load Example Data
Use the XCEL data for this experiment. We'll need to convert the measurement time column to a `datetime` so that PyArrow can store it as a Parquet timestamp.

In [4]:
# EIS rows are not needed for this experiment, so drop them right after loading
xcel = (
    pd.read_csv('../example-data/xcel.csv')
    .query('Cycle_Label != "EIS"')
)

In [5]:
# Turn the fractional day count in `Datenum_d` into Python datetimes by adding it
# to 0001-01-01. Done element-wise with `datetime`/`timedelta` rather than with
# pandas timedeltas, which cannot hold an offset counted from year 1.
# NOTE(review): if `Datenum_d` is a MATLAB datenum (not verifiable from this
# notebook), MATLAB counts from year 0 and the result here would be 367 days
# late -- confirm against a known timestamp.
xcel['Datetime'] = xcel['Datenum_d'].apply(lambda x: datetime(year=1, month=1, day=1) + timedelta(days=x))

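Run the casting. Integer-typed columns are rescaled relative to their floored minimum and ceiled maximum, and that range is recorded as schema metadata; every column is then cast to its target dtype.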

In [6]:
# Quantize every integer-typed column onto the full range of its storage type and
# remember the original value range so the data can be decoded later:
#     value = stored / iinfo(dtype).max * (d_max - d_min) + d_min
# NOTE: this cell overwrites the columns of `xcel` in place, so it is not
# idempotent -- re-run the load cells before running it a second time.
metadata = dict()
for c, t in to_store.items():
    np_dtype = t.to_pandas_dtype()
    if pa.types.is_integer(t):
        # Widen the bounds to whole numbers; plain floats keep the metadata
        # string readable regardless of the NumPy version's scalar repr.
        d_min = float(np.floor(xcel[c].min()))
        d_max = float(np.ceil(xcel[c].max()))
        # A constant, integer-valued column has zero span; avoid dividing by zero
        # (every value then encodes as 0, which decodes back to d_min).
        span = (d_max - d_min) or 1.0
        # Spread the [d_min, d_max] interval over all levels of the integer type.
        # Without this factor the scaled values lie in [0, 1] and rounding would
        # leave only the two values 0 and 1.
        levels = np.iinfo(np_dtype).max
        xcel[c] = np.round((xcel[c] - d_min) / span * levels).astype(np_dtype)
        metadata[f'ranges_{c}'] = json.dumps([d_min, d_max])

    # Non-integer columns (e.g. the timestamp) are simply cast to the target dtype
    xcel[c] = np.array(xcel[c], dtype=np_dtype)

## Save into Parquet
First convert to Table, then write using the above-provided schema

In [7]:
# Keep only the configured columns, in configuration order; the pandas index is not stored
table = pa.Table.from_pandas(xcel[[*to_store]], preserve_index=False)

Apply the new schema

In [8]:
# Re-type the table to the configured schema and attach the value ranges as
# schema-level metadata. safe=False turns off Arrow's checked-cast validation.
table = table.cast(
    pa.schema(to_store.items(), metadata=metadata),
    safe=False,
)
table.schema

Current_A: uint16
Voltage_V: uint16
Cell_Temperature_C: uint16
Datetime: timestamp[ms]
-- schema metadata --
ranges_Current_A: '[-1.0, 1.0]'
ranges_Voltage_V: '[2.0, 5.0]'
ranges_Cell_Temperature_C: '[29.0, 31.0]'

In [9]:
# Measure the on-disk size of the first `rows` rows for several row counts
for rows in [1000, 3000, 10000, 30000]:
    with TemporaryDirectory() as tmp:
        tmp = Path(tmp)
        # NOTE(review): presumably `slice` clips to the table length, in which case
        # fewer than `rows` rows may be written -- confirm the table is long enough.
        write_table(table.slice(length=rows), tmp / 'test.parquet', **compression)

        # Total bytes across every regular file the writer produced
        total_size = sum(
            path.stat().st_size
            for path in tmp.iterdir()
            if path.is_file()
        )

    # Append one JSON record per run (the file accumulates across executions)
    with open('storage-results.json', 'a') as fp:
        fp.write(json.dumps({
            'rows': rows,
            'columns': list(to_store),
            'schema': {k: str(v) for k, v in to_store.items()},
            **compression,
            'size': total_size
        }) + '\n')