In [14]:
import pandas as pd
import numpy as np

In [15]:
data_npy = np.random.uniform(size=(10000, 5))
data = pd.DataFrame(data_npy, columns=(str(i) for i in range(data_npy.shape[1])))

In [26]:
os.makedirs('data/csv', exist_ok=True)
data.to_csv('data/csv/dataset.csv', index=False)

In [None]:
file_formats = ['.csv', 'parquet', 'h5py', 'npy', 'xarray']

In [16]:
!rm -rf data/parquet/

In [17]:
# parquet
!mkdir data/parquet
data.to_parquet('data/parquet/dataset.parquet')

In [19]:
!ls data

h5py  parquet  train.csv


In [21]:
# h5py
import os
import h5py

os.makedirs('data/h5py', exist_ok=True)


# Writing:

# Open HDF5 file
h5_file = h5py.File('data/h5py/dataset.h5', 'w')
# Write dataset
h5_file.create_dataset('dataset', data=data_npy)
# Close file and write data to disk. Important!
h5_file.close()

# Reading:

# Open HDF5 file again
h5_file = h5py.File('data/h5py/dataset.h5', 'r')

# Read the full dataset
data_array_h5 = h5_file['dataset'][()]

# Close file
h5_file.close()

In [24]:
# npy
os.makedirs('data/npy', exist_ok=True)
np.save('data/npy/dataset.npy', data_npy)

In [25]:
# xarray
import xarray as xr

os.makedirs('data/xr', exist_ok=True)

# Write tidy data as NetCDF4
data.to_xarray().to_netcdf('data/xr/dataset.nc', engine='h5netcdf')
# Read tidy data from NetCDF4
dataset_xarray = xr.open_dataset('data/xr/dataset.nc', engine='h5netcdf')
dataset_netcdf4 = dataset_xarray.to_pandas()
dataset_xarray.close()

### Measure time

In [31]:
def load_csv(file):
    return pd.read_csv(file)

def load_npy(file):
    return np.load(file)

def load_h5(file):
    h5_file = h5py.File(file, 'r')
    data_array_h5 = h5_file['dataset'][()]
    h5_file.close()
    return data_array_h5

def load_xr(file):
    dataset_xarray = xr.open_dataset(file, engine='h5netcdf')
    dataset_netcdf4 = dataset_xarray.to_pandas()
    dataset_xarray.close()
    return dataset_netcdf4

def load_pq(file):
    return pd.read_parquet(file)

In [40]:
save_dir = Path('tmp')
save_dir.mkdir(exist_ok=True)

def save_csv(data, file):
    return data.to_csv(file, index=False)

def save_npy(data, file):
    return np.save(file, data)

def save_h5(data_array, file):
    h5_file = h5py.File(file, 'w')
    h5_file.create_dataset('data_array', data=data_array)
    h5_file.close()

def save_xr(data_array, file):
    xr.DataArray(data_array).to_netcdf(file, engine='h5netcdf')

def save_pq(data, file):
    data.to_parquet(file)

In [49]:
from pathlib import Path
from functools import partial

root = Path('data')
files = {
    'csv': (root / 'csv' / 'dataset.csv', load_csv, 
            partial(save_csv, data=data, file=save_dir / 'csv.csv')),
    'h5': (root / 'h5py' / 'dataset.h5', load_h5, 
           partial(save_h5, data_array=data_npy, file=save_dir / 'h5.h5')),
    'npy': (root / 'npy' / 'dataset.npy', 
            load_npy, partial(save_csv, data=data, file=save_dir / 'npy.npy')),
    'xr': (root / 'xr' / 'dataset.nc', load_xr, 
           partial(save_xr, data_array=data_npy, file=save_dir / 'xr.nc')),
    'parquet': (root / 'parquet' / 'dataset.parquet', load_pq, 
                partial(save_pq, data=data, file=save_dir / 'pq.pq')),
}

In [46]:
import time
def measure_time(file, load_f):
    time1 = time.time()
    _ = load_f(file)
    time2 = time.time()
    
    return time2 - time1

def measure_time_no_arg(f):
    time1 = time.time()
    _ = f()
    time2 = time.time()
    
    return time2 - time1

In [37]:
def get_size_file(file):
    return os.stat(file).st_size / (1024 * 1024)

In [50]:
for key, (file, load_fu, save_f) in files.items():
    print(key, get_size_file(file), measure_time(file, load_fu), measure_time_no_arg(save_f))

csv 0.9186868667602539 0.023109912872314453 0.05780196189880371
h5 0.3834228515625 0.0005431175231933594 0.0005953311920166016
npy 0.381591796875 0.0003285408020019531 0.05182194709777832
xr 0.467529296875 0.018091917037963867 0.010646343231201172
parquet 0.4689798355102539 0.10973048210144043 0.047615766525268555
